Coverage Report

Created: 2025-09-27 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_io.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
73
#include "../fuzz_settings.h"
74
#endif
75
76
#include "cram.h"
77
#include "os.h"
78
#include "../htslib/hts.h"
79
#include "../hts_internal.h"
80
#include "open_trace_file.h"
81
82
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
83
#include <htscodecs/rANS_static.h>
84
#include <htscodecs/rANS_static4x16.h>
85
#include <htscodecs/arith_dynamic.h>
86
#include <htscodecs/tokenise_name3.h>
87
#include <htscodecs/fqzcomp_qual.h>
88
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
89
#else
90
#include "../htscodecs/htscodecs/rANS_static.h"
91
#include "../htscodecs/htscodecs/rANS_static4x16.h"
92
#include "../htscodecs/htscodecs/arith_dynamic.h"
93
#include "../htscodecs/htscodecs/tokenise_name3.h"
94
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
95
#include "../htscodecs/htscodecs/varint.h"
96
#endif
97
98
//#define REF_DEBUG
99
100
#ifdef REF_DEBUG
101
#include <sys/syscall.h>
102
#define gettid() (int)syscall(SYS_gettid)
103
104
#define RP(...) fprintf (stderr, __VA_ARGS__)
105
#else
106
#define RP(...)
107
#endif
108
109
#include "../htslib/hfile.h"
110
#include "../htslib/bgzf.h"
111
#include "../htslib/faidx.h"
112
#include "../hts_internal.h"
113
114
#ifndef PATH_MAX
115
#define PATH_MAX FILENAME_MAX
116
#endif
117
118
383k
#define TRIAL_SPAN 70
119
383k
#define NTRIALS 3
120
121
4.15k
#define CRAM_DEFAULT_LEVEL 5
122
123
/* ----------------------------------------------------------------------
124
 * ITF8 encoding and decoding.
125
 *
126
 * Also see the itf8_get and itf8_put macros in cram_io.h
127
 */
128
129
/*
130
 * LEGACY: consider using itf8_decode_crc.
131
 *
132
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
133
 * *val.
134
 *
135
 * Returns the number of bytes read on success
136
 *        -1 on failure
137
 */
138
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
139
0
    static int nbytes[16] = {
140
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
141
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
142
0
        2,2,                                            // 1100xxxx - 1101xxxx
143
0
        3,                                              // 1110xxxx
144
0
        4,                                              // 1111xxxx
145
0
    };
146
147
0
    static int nbits[16] = {
148
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
149
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
150
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
151
0
        0x0f,                                           // 1110xxxx
152
0
        0x0f,                                           // 1111xxxx
153
0
    };
154
155
0
    int32_t val = hgetc(fd->fp);
156
0
    if (val == -1)
157
0
        return -1;
158
159
0
    int i = nbytes[val>>4];
160
0
    val &= nbits[val>>4];
161
162
0
    switch(i) {
163
0
    case 0:
164
0
        *val_p = val;
165
0
        return 1;
166
167
0
    case 1:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        *val_p = val;
170
0
        return 2;
171
172
0
    case 2:
173
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        *val_p = val;
176
0
        return 3;
177
178
0
    case 3:
179
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
180
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        *val_p = val;
183
0
        return 4;
184
185
0
    case 4: // really 3.5 more, why make it different?
186
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
187
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
188
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
189
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
190
0
        *val_p = val;
191
0
    }
192
193
0
    return 5;
194
0
}
195
196
192k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
197
192k
    static int nbytes[16] = {
198
192k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
199
192k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
200
192k
        2,2,                                            // 1100xxxx - 1101xxxx
201
192k
        3,                                              // 1110xxxx
202
192k
        4,                                              // 1111xxxx
203
192k
    };
204
205
192k
    static int nbits[16] = {
206
192k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
207
192k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
208
192k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
209
192k
        0x0f,                                           // 1110xxxx
210
192k
        0x0f,                                           // 1111xxxx
211
192k
    };
212
192k
    unsigned char c[5];
213
214
192k
    int32_t val = hgetc(fd->fp);
215
192k
    if (val == -1)
216
6
        return -1;
217
218
192k
    c[0]=val;
219
220
192k
    int i = nbytes[val>>4];
221
192k
    val &= nbits[val>>4];
222
223
192k
    if (i > 0) {
224
11.6k
        if (hread(fd->fp, &c[1], i) < i)
225
0
            return -1;
226
11.6k
    }
227
228
192k
    switch(i) {
229
181k
    case 0:
230
181k
        *val_p = val;
231
181k
        *crc = crc32(*crc, c, 1);
232
181k
        return 1;
233
234
5.54k
    case 1:
235
5.54k
        val = (val<<8) | c[1];
236
5.54k
        *val_p = val;
237
5.54k
        *crc = crc32(*crc, c, 2);
238
5.54k
        return 2;
239
240
1.06k
    case 2:
241
1.06k
        val = (val<<8) | c[1];
242
1.06k
        val = (val<<8) | c[2];
243
1.06k
        *val_p = val;
244
1.06k
        *crc = crc32(*crc, c, 3);
245
1.06k
        return 3;
246
247
967
    case 3:
248
967
        val = (val<<8) | c[1];
249
967
        val = (val<<8) | c[2];
250
967
        val = (val<<8) | c[3];
251
967
        *val_p = val;
252
967
        *crc = crc32(*crc, c, 4);
253
967
        return 4;
254
255
4.12k
    case 4: // really 3.5 more, why make it different?
256
4.12k
        {
257
4.12k
            uint32_t uv = val;
258
4.12k
            uv = (uv<<8) |  c[1];
259
4.12k
            uv = (uv<<8) |  c[2];
260
4.12k
            uv = (uv<<8) |  c[3];
261
4.12k
            uv = (uv<<4) | (c[4] & 0x0f);
262
            // Avoid implementation-defined behaviour on negative values
263
4.12k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
264
4.12k
            *crc = crc32(*crc, c, 5);
265
4.12k
        }
266
192k
    }
267
268
4.12k
    return 5;
269
192k
}
270
271
/*
272
 * Stores a value to memory in ITF-8 format.
273
 *
274
 * Returns the number of bytes required to store the number.
275
 * This is a maximum of 5 bytes.
276
 */
277
18.9M
static inline int itf8_put(char *cp, int32_t val) {
278
18.9M
    unsigned char *up = (unsigned char *)cp;
279
18.9M
    if        (!(val & ~0x00000007f)) { // 1 byte
280
17.5M
        *up = val;
281
17.5M
        return 1;
282
17.5M
    } else if (!(val & ~0x00003fff)) { // 2 byte
283
367k
        *up++ = (val >> 8 ) | 0x80;
284
367k
        *up   = val & 0xff;
285
367k
        return 2;
286
964k
    } else if (!(val & ~0x01fffff)) { // 3 byte
287
22.4k
        *up++ = (val >> 16) | 0xc0;
288
22.4k
        *up++ = (val >> 8 ) & 0xff;
289
22.4k
        *up   = val & 0xff;
290
22.4k
        return 3;
291
941k
    } else if (!(val & ~0x0fffffff)) { // 4 byte
292
730k
        *up++ = (val >> 24) | 0xe0;
293
730k
        *up++ = (val >> 16) & 0xff;
294
730k
        *up++ = (val >> 8 ) & 0xff;
295
730k
        *up   = val & 0xff;
296
730k
        return 4;
297
730k
    } else {                           // 5 byte
298
211k
        *up++ = 0xf0 | ((val>>28) & 0xff);
299
211k
        *up++ = (val >> 20) & 0xff;
300
211k
        *up++ = (val >> 12) & 0xff;
301
211k
        *up++ = (val >> 4 ) & 0xff;
302
211k
        *up = val & 0x0f;
303
211k
        return 5;
304
211k
    }
305
18.9M
}
306
307
308
/* 64-bit itf8 variant */
309
141k
static inline int ltf8_put(char *cp, int64_t val) {
310
141k
    unsigned char *up = (unsigned char *)cp;
311
141k
    if        (!(val & ~((1LL<<7)-1))) {
312
89.7k
        *up = val;
313
89.7k
        return 1;
314
89.7k
    } else if (!(val & ~((1LL<<(6+8))-1))) {
315
50.6k
        *up++ = (val >> 8 ) | 0x80;
316
50.6k
        *up   = val & 0xff;
317
50.6k
        return 2;
318
50.6k
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
319
907
        *up++ = (val >> 16) | 0xc0;
320
907
        *up++ = (val >> 8 ) & 0xff;
321
907
        *up   = val & 0xff;
322
907
        return 3;
323
907
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
324
48
        *up++ = (val >> 24) | 0xe0;
325
48
        *up++ = (val >> 16) & 0xff;
326
48
        *up++ = (val >> 8 ) & 0xff;
327
48
        *up   = val & 0xff;
328
48
        return 4;
329
48
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
330
0
        *up++ = (val >> 32) | 0xf0;
331
0
        *up++ = (val >> 24) & 0xff;
332
0
        *up++ = (val >> 16) & 0xff;
333
0
        *up++ = (val >> 8 ) & 0xff;
334
0
        *up   = val & 0xff;
335
0
        return 5;
336
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
337
0
        *up++ = (val >> 40) | 0xf8;
338
0
        *up++ = (val >> 32) & 0xff;
339
0
        *up++ = (val >> 24) & 0xff;
340
0
        *up++ = (val >> 16) & 0xff;
341
0
        *up++ = (val >> 8 ) & 0xff;
342
0
        *up   = val & 0xff;
343
0
        return 6;
344
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
345
0
        *up++ = (val >> 48) | 0xfc;
346
0
        *up++ = (val >> 40) & 0xff;
347
0
        *up++ = (val >> 32) & 0xff;
348
0
        *up++ = (val >> 24) & 0xff;
349
0
        *up++ = (val >> 16) & 0xff;
350
0
        *up++ = (val >> 8 ) & 0xff;
351
0
        *up   = val & 0xff;
352
0
        return 7;
353
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
354
0
        *up++ = (val >> 56) | 0xfe;
355
0
        *up++ = (val >> 48) & 0xff;
356
0
        *up++ = (val >> 40) & 0xff;
357
0
        *up++ = (val >> 32) & 0xff;
358
0
        *up++ = (val >> 24) & 0xff;
359
0
        *up++ = (val >> 16) & 0xff;
360
0
        *up++ = (val >> 8 ) & 0xff;
361
0
        *up   = val & 0xff;
362
0
        return 8;
363
0
    } else {
364
0
        *up++ = 0xff;
365
0
        *up++ = (val >> 56) & 0xff;
366
0
        *up++ = (val >> 48) & 0xff;
367
0
        *up++ = (val >> 40) & 0xff;
368
0
        *up++ = (val >> 32) & 0xff;
369
0
        *up++ = (val >> 24) & 0xff;
370
0
        *up++ = (val >> 16) & 0xff;
371
0
        *up++ = (val >> 8 ) & 0xff;
372
0
        *up   = val & 0xff;
373
0
        return 9;
374
0
    }
375
141k
}
376
377
/*
378
 * Encodes and writes a single integer in ITF-8 format.
379
 * Returns 0 on success
380
 *        -1 on failure
381
 */
382
0
int itf8_encode(cram_fd *fd, int32_t val) {
383
0
    char buf[5];
384
0
    int len = itf8_put(buf, val);
385
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
386
0
}
387
388
const int itf8_bytes[16] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,
390
    2, 2, 2, 2,  3, 3, 4, 5
391
};
392
393
const int ltf8_bytes[256] = {
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
400
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
401
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
402
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
403
404
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
405
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
406
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
407
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
408
409
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
410
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
411
412
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
413
414
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
415
};
416
417
/*
418
 * LEGACY: consider using ltf8_decode_crc.
419
 */
420
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
421
0
    int c = hgetc(fd->fp);
422
0
    int64_t val = (unsigned char)c;
423
0
    if (c == -1)
424
0
        return -1;
425
426
0
    if (val < 0x80) {
427
0
        *val_p =   val;
428
0
        return 1;
429
430
0
    } else if (val < 0xc0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        *val_p = val & (((1LL<<(6+8)))-1);
433
0
        return 2;
434
435
0
    } else if (val < 0xe0) {
436
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        *val_p = val & ((1LL<<(5+2*8))-1);
439
0
        return 3;
440
441
0
    } else if (val < 0xf0) {
442
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
443
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        *val_p = val & ((1LL<<(4+3*8))-1);
446
0
        return 4;
447
448
0
    } else if (val < 0xf8) {
449
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
450
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
451
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        *val_p = val & ((1LL<<(3+4*8))-1);
454
0
        return 5;
455
456
0
    } else if (val < 0xfc) {
457
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
458
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
459
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
460
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        *val_p = val & ((1LL<<(2+5*8))-1);
463
0
        return 6;
464
465
0
    } else if (val < 0xfe) {
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
468
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
469
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
470
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        *val_p = val & ((1LL<<(1+6*8))-1);
473
0
        return 7;
474
475
0
    } else if (val < 0xff) {
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
479
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
480
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
481
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        *val_p = val & ((1LL<<(7*8))-1);
484
0
        return 8;
485
486
0
    } else {
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
491
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
492
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
493
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
494
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
495
0
        *val_p = val;
496
0
    }
497
498
0
    return 9;
499
0
}
500
501
412
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
502
412
    unsigned char c[9];
503
412
    int64_t val = hgetc(fd->fp);
504
412
    if (val < 0)
505
0
        return -1;
506
507
412
    c[0] = val;
508
509
412
    if (val < 0x80) {
510
362
        *val_p =   val;
511
362
        *crc = crc32(*crc, c, 1);
512
362
        return 1;
513
514
362
    } else if (val < 0xc0) {
515
22
        int v = hgetc(fd->fp);
516
22
        if (v < 0)
517
0
            return -1;
518
22
        val = (val<<8) | (c[1]=v);
519
22
        *val_p = val & (((1LL<<(6+8)))-1);
520
22
        *crc = crc32(*crc, c, 2);
521
22
        return 2;
522
523
28
    } else if (val < 0xe0) {
524
0
        if (hread(fd->fp, &c[1], 2) < 2)
525
0
            return -1;
526
0
        val = (val<<8) | c[1];
527
0
        val = (val<<8) | c[2];
528
0
        *val_p = val & ((1LL<<(5+2*8))-1);
529
0
        *crc = crc32(*crc, c, 3);
530
0
        return 3;
531
532
28
    } else if (val < 0xf0) {
533
12
        if (hread(fd->fp, &c[1], 3) < 3)
534
0
            return -1;
535
12
        val = (val<<8) | c[1];
536
12
        val = (val<<8) | c[2];
537
12
        val = (val<<8) | c[3];
538
12
        *val_p = val & ((1LL<<(4+3*8))-1);
539
12
        *crc = crc32(*crc, c, 4);
540
12
        return 4;
541
542
16
    } else if (val < 0xf8) {
543
0
        if (hread(fd->fp, &c[1], 4) < 4)
544
0
            return -1;
545
0
        val = (val<<8) | c[1];
546
0
        val = (val<<8) | c[2];
547
0
        val = (val<<8) | c[3];
548
0
        val = (val<<8) | c[4];
549
0
        *val_p = val & ((1LL<<(3+4*8))-1);
550
0
        *crc = crc32(*crc, c, 5);
551
0
        return 5;
552
553
16
    } else if (val < 0xfc) {
554
0
        if (hread(fd->fp, &c[1], 5) < 5)
555
0
            return -1;
556
0
        val = (val<<8) | c[1];
557
0
        val = (val<<8) | c[2];
558
0
        val = (val<<8) | c[3];
559
0
        val = (val<<8) | c[4];
560
0
        val = (val<<8) | c[5];
561
0
        *val_p = val & ((1LL<<(2+5*8))-1);
562
0
        *crc = crc32(*crc, c, 6);
563
0
        return 6;
564
565
16
    } else if (val < 0xfe) {
566
0
        if (hread(fd->fp, &c[1], 6) < 6)
567
0
            return -1;
568
0
        val = (val<<8) | c[1];
569
0
        val = (val<<8) | c[2];
570
0
        val = (val<<8) | c[3];
571
0
        val = (val<<8) | c[4];
572
0
        val = (val<<8) | c[5];
573
0
        val = (val<<8) | c[6];
574
0
        *val_p = val & ((1LL<<(1+6*8))-1);
575
0
        *crc = crc32(*crc, c, 7);
576
0
        return 7;
577
578
16
    } else if (val < 0xff) {
579
16
        uint64_t uval = val;
580
16
        if (hread(fd->fp, &c[1], 7) < 7)
581
0
            return -1;
582
16
        uval = (uval<<8) | c[1];
583
16
        uval = (uval<<8) | c[2];
584
16
        uval = (uval<<8) | c[3];
585
16
        uval = (uval<<8) | c[4];
586
16
        uval = (uval<<8) | c[5];
587
16
        uval = (uval<<8) | c[6];
588
16
        uval = (uval<<8) | c[7];
589
16
        *val_p = uval & ((1ULL<<(7*8))-1);
590
16
        *crc = crc32(*crc, c, 8);
591
16
        return 8;
592
593
16
    } else {
594
0
        uint64_t uval;
595
0
        if (hread(fd->fp, &c[1], 8) < 8)
596
0
            return -1;
597
0
        uval =             c[1];
598
0
        uval = (uval<<8) | c[2];
599
0
        uval = (uval<<8) | c[3];
600
0
        uval = (uval<<8) | c[4];
601
0
        uval = (uval<<8) | c[5];
602
0
        uval = (uval<<8) | c[6];
603
0
        uval = (uval<<8) | c[7];
604
0
        uval = (uval<<8) | c[8];
605
0
        *crc = crc32(*crc, c, 9);
606
        // Avoid implementation-defined behaviour on negative values
607
0
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
608
0
    }
609
610
0
    return 9;
611
412
}
612
613
/*
614
 * Pushes a value in ITF8 format onto the end of a block.
615
 * This shouldn't be used for high-volume data as it is not the fastest
616
 * method.
617
 *
618
 * Returns the number of bytes written
619
 */
620
12.2M
int itf8_put_blk(cram_block *blk, int32_t val) {
621
12.2M
    char buf[5];
622
12.2M
    int sz;
623
624
12.2M
    sz = itf8_put(buf, val);
625
12.2M
    BLOCK_APPEND(blk, buf, sz);
626
12.2M
    return sz;
627
628
0
 block_err:
629
0
    return -1;
630
12.2M
}
631
632
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
633
0
    char buf[9];
634
0
    int sz;
635
636
0
    sz = ltf8_put(buf, val);
637
0
    BLOCK_APPEND(blk, buf, sz);
638
0
    return sz;
639
640
0
 block_err:
641
0
    return -1;
642
0
}
643
644
175k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
645
175k
    const unsigned char *up = (unsigned char *)*cp;
646
647
175k
    if (endp && endp - *cp < 5 &&
648
16.3k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
649
6.83k
        if (err) *err = 1;
650
6.83k
        return 0;
651
6.83k
    }
652
653
168k
    if (up[0] < 0x80) {
654
151k
        (*cp)++;
655
151k
        return up[0];
656
151k
    } else if (up[0] < 0xc0) {
657
9.02k
        (*cp)+=2;
658
9.02k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
659
9.02k
    } else if (up[0] < 0xe0) {
660
1.64k
        (*cp)+=3;
661
1.64k
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
662
6.73k
    } else if (up[0] < 0xf0) {
663
2.87k
        (*cp)+=4;
664
2.87k
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
665
2.87k
        return (int32_t)uv;
666
3.86k
    } else {
667
3.86k
        (*cp)+=5;
668
3.86k
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
669
3.86k
        return (int32_t)uv;
670
3.86k
    }
671
168k
}
672
673
1.23k
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
674
1.23k
    unsigned char *up = (unsigned char *)*cp;
675
676
1.23k
    if (endp && endp - *cp < 9 &&
677
1.23k
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
678
388
        if (err) *err = 1;
679
388
        return 0;
680
388
    }
681
682
848
    if (up[0] < 0x80) {
683
525
        (*cp)++;
684
525
        return up[0];
685
525
    } else if (up[0] < 0xc0) {
686
39
        (*cp)+=2;
687
39
        return (((uint64_t)up[0]<< 8) |
688
39
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
689
284
    } else if (up[0] < 0xe0) {
690
63
        (*cp)+=3;
691
63
        return (((uint64_t)up[0]<<16) |
692
63
                ((uint64_t)up[1]<< 8) |
693
63
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
694
221
    } else if (up[0] < 0xf0) {
695
60
        (*cp)+=4;
696
60
        return (((uint64_t)up[0]<<24) |
697
60
                ((uint64_t)up[1]<<16) |
698
60
                ((uint64_t)up[2]<< 8) |
699
60
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
700
161
    } else if (up[0] < 0xf8) {
701
143
        (*cp)+=5;
702
143
        return (((uint64_t)up[0]<<32) |
703
143
                ((uint64_t)up[1]<<24) |
704
143
                ((uint64_t)up[2]<<16) |
705
143
                ((uint64_t)up[3]<< 8) |
706
143
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
707
143
    } else if (up[0] < 0xfc) {
708
18
        (*cp)+=6;
709
18
        return (((uint64_t)up[0]<<40) |
710
18
                ((uint64_t)up[1]<<32) |
711
18
                ((uint64_t)up[2]<<24) |
712
18
                ((uint64_t)up[3]<<16) |
713
18
                ((uint64_t)up[4]<< 8) |
714
18
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
715
18
    } else if (up[0] < 0xfe) {
716
0
        (*cp)+=7;
717
0
        return (((uint64_t)up[0]<<48) |
718
0
                ((uint64_t)up[1]<<40) |
719
0
                ((uint64_t)up[2]<<32) |
720
0
                ((uint64_t)up[3]<<24) |
721
0
                ((uint64_t)up[4]<<16) |
722
0
                ((uint64_t)up[5]<< 8) |
723
0
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
724
0
    } else if (up[0] < 0xff) {
725
0
        (*cp)+=8;
726
0
        return (((uint64_t)up[1]<<48) |
727
0
                ((uint64_t)up[2]<<40) |
728
0
                ((uint64_t)up[3]<<32) |
729
0
                ((uint64_t)up[4]<<24) |
730
0
                ((uint64_t)up[5]<<16) |
731
0
                ((uint64_t)up[6]<< 8) |
732
0
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
733
0
    } else {
734
0
        (*cp)+=9;
735
0
        return (((uint64_t)up[1]<<56) |
736
0
                ((uint64_t)up[2]<<48) |
737
0
                ((uint64_t)up[3]<<40) |
738
0
                ((uint64_t)up[4]<<32) |
739
0
                ((uint64_t)up[5]<<24) |
740
0
                ((uint64_t)up[6]<<16) |
741
0
                ((uint64_t)up[7]<< 8) |
742
0
                 (uint64_t)up[8]);
743
0
    }
744
848
}
745
746
// Wrapper for now
747
6.67M
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
748
6.67M
    return itf8_put(cp, val);
749
6.67M
}
750
751
141k
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
752
141k
    return ltf8_put(cp, val);
753
141k
}
754
755
1.50M
static int itf8_size(int64_t v) {
756
1.50M
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
757
1.50M
}
758
759
//-----------------------------------------------------------------------------
760
761
// CRAM v4.0 onwards uses a different variable sized integer encoding
762
// that is size agnostic.
763
764
// Local interface to varint.h inline version, so we can use in func ptr.
765
// Note a lot of these use the unsigned interface but take signed int64_t.
766
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
767
// 0xffffffff.
768
726
static int uint7_size(int64_t v) {
769
726
    return var_size_u64(v);
770
726
}
771
772
0
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
773
0
    uint32_t val;
774
0
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
775
0
    (*cp) += nb;
776
0
    if (!nb && err) *err = 1;
777
0
    return val;
778
0
}
779
780
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
781
0
    int32_t val;
782
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
783
0
    (*cp) += nb;
784
0
    if (!nb && err) *err = 1;
785
0
    return val;
786
0
}
787
788
0
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
789
0
    uint64_t val;
790
0
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
791
0
    (*cp) += nb;
792
0
    if (!nb && err) *err = 1;
793
0
    return val;
794
0
}
795
796
0
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
797
0
    int64_t val;
798
0
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
799
0
    (*cp) += nb;
800
0
    if (!nb && err) *err = 1;
801
0
    return val;
802
0
}
803
804
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
805
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
806
0
}
807
808
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
809
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
810
0
}
811
812
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
813
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
814
0
}
815
816
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
817
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
818
0
}
819
820
// Put direct to to cram_block
821
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
822
0
    uint8_t buf[10];
823
0
    int sz = var_put_u32(buf, buf+10, v);
824
0
    BLOCK_APPEND(blk, buf, sz);
825
0
    return sz;
826
827
0
 block_err:
828
0
    return -1;
829
0
}
830
831
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
832
0
    uint8_t buf[10];
833
0
    int sz = var_put_s32(buf, buf+10, v);
834
0
    BLOCK_APPEND(blk, buf, sz);
835
0
    return sz;
836
837
0
 block_err:
838
0
    return -1;
839
0
}
840
841
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
842
0
    uint8_t buf[10];
843
0
    int sz = var_put_u64(buf, buf+10, v);
844
0
    BLOCK_APPEND(blk, buf, sz);
845
0
    return sz;
846
847
0
 block_err:
848
0
    return -1;
849
0
}
850
851
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
852
0
    uint8_t buf[10];
853
0
    int sz = var_put_s64(buf, buf+10, v);
854
0
    BLOCK_APPEND(blk, buf, sz);
855
0
    return sz;
856
857
0
 block_err:
858
0
    return -1;
859
0
}
860
861
// Decode 32-bits with CRC update from cram_fd
862
5.87k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
863
5.87k
    uint8_t b[5], i = 0;
864
5.87k
    int c;
865
5.87k
    uint32_t v = 0;
866
867
#ifdef VARINT2
868
    b[0] = hgetc(fd->fp);
869
    if (b[0] < 177) {
870
    } else if (b[0] < 241) {
871
        b[1] = hgetc(fd->fp);
872
    } else if (b[0] < 249) {
873
        b[1] = hgetc(fd->fp);
874
        b[2] = hgetc(fd->fp);
875
    } else {
876
        int n = b[0]+2, z = 1;
877
        while (n-- >= 249)
878
            b[z++] = hgetc(fd->fp);
879
    }
880
    i = var_get_u32(b, NULL, &v);
881
#else
882
//    // Little endian
883
//    int s = 0;
884
//    do {
885
//        b[i++] = c = hgetc(fd->fp);
886
//        if (c < 0)
887
//            return -1;
888
//        v |= (c & 0x7f) << s;
889
//      s += 7;
890
//    } while (i < 5 && (c & 0x80));
891
892
    // Big endian, see also htscodecs/varint.h
893
6.86k
    do {
894
6.86k
        b[i++] = c = hgetc(fd->fp);
895
6.86k
        if (c < 0)
896
0
            return -1;
897
6.86k
        v = (v<<7) | (c & 0x7f);
898
6.86k
    } while (i < 5 && (c & 0x80));
899
5.87k
#endif
900
5.87k
    *crc = crc32(*crc, b, i);
901
902
5.87k
    *val_p = v;
903
5.87k
    return i;
904
5.87k
}
905
906
// Decode 32-bits with CRC update from cram_fd
907
832
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
908
832
    uint8_t b[5], i = 0;
909
832
    int c;
910
832
    uint32_t v = 0;
911
912
#ifdef VARINT2
913
    b[0] = hgetc(fd->fp);
914
    if (b[0] < 177) {
915
    } else if (b[0] < 241) {
916
        b[1] = hgetc(fd->fp);
917
    } else if (b[0] < 249) {
918
        b[1] = hgetc(fd->fp);
919
        b[2] = hgetc(fd->fp);
920
    } else {
921
        int n = b[0]+2, z = 1;
922
        while (n-- >= 249)
923
            b[z++] = hgetc(fd->fp);
924
    }
925
    i = var_get_u32(b, NULL, &v);
926
#else
927
//    // Little endian
928
//    int s = 0;
929
//    do {
930
//        b[i++] = c = hgetc(fd->fp);
931
//        if (c < 0)
932
//            return -1;
933
//        v |= (c & 0x7f) << s;
934
//      s += 7;
935
//    } while (i < 5 && (c & 0x80));
936
937
    // Big endian, see also htscodecs/varint.h
938
1.13k
    do {
939
1.13k
        b[i++] = c = hgetc(fd->fp);
940
1.13k
        if (c < 0)
941
0
            return -1;
942
1.13k
        v = (v<<7) | (c & 0x7f);
943
1.13k
    } while (i < 5 && (c & 0x80));
944
832
#endif
945
832
    *crc = crc32(*crc, b, i);
946
947
832
    *val_p = (v>>1) ^ -(v&1);
948
832
    return i;
949
832
}
950
951
952
// Decode 64-bits with CRC update from cram_fd
953
3.31k
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
954
3.31k
    uint8_t b[10], i = 0;
955
3.31k
    int c;
956
3.31k
    uint64_t v = 0;
957
958
#ifdef VARINT2
959
    b[0] = hgetc(fd->fp);
960
    if (b[0] < 177) {
961
    } else if (b[0] < 241) {
962
        b[1] = hgetc(fd->fp);
963
    } else if (b[0] < 249) {
964
        b[1] = hgetc(fd->fp);
965
        b[2] = hgetc(fd->fp);
966
    } else {
967
        int n = b[0]+2, z = 1;
968
        while (n-- >= 249)
969
            b[z++] = hgetc(fd->fp);
970
    }
971
    i = var_get_u64(b, NULL, &v);
972
#else
973
//    // Little endian
974
//    int s = 0;
975
//    do {
976
//        b[i++] = c = hgetc(fd->fp);
977
//        if (c < 0)
978
//            return -1;
979
//        v |= (c & 0x7f) << s;
980
//      s += 7;
981
//    } while (i < 10 && (c & 0x80));
982
983
    // Big endian, see also htscodecs/varint.h
984
3.55k
    do {
985
3.55k
        b[i++] = c = hgetc(fd->fp);
986
3.55k
        if (c < 0)
987
3
            return -1;
988
3.55k
        v = (v<<7) | (c & 0x7f);
989
3.55k
    } while (i < 5 && (c & 0x80));
990
3.31k
#endif
991
3.31k
    *crc = crc32(*crc, b, i);
992
993
3.31k
    *val_p = v;
994
3.31k
    return i;
995
3.31k
}
996
997
//-----------------------------------------------------------------------------
998
999
/*
1000
 * Decodes a 32-bit little endian value from fd and stores in val.
1001
 *
1002
 * Returns the number of bytes read on success
1003
 *         -1 on failure
1004
 */
1005
3.30k
static int int32_decode(cram_fd *fd, int32_t *val) {
1006
3.30k
    int32_t i;
1007
3.30k
    if (4 != hread(fd->fp, &i, 4))
1008
3
        return -1;
1009
1010
3.30k
    *val = le_int4(i);
1011
3.30k
    return 4;
1012
3.30k
}
1013
1014
/*
1015
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1016
 *
1017
 * Returns the number of bytes written on success
1018
 *         -1 on failure
1019
 */
1020
393k
static int int32_encode(cram_fd *fd, int32_t val) {
1021
393k
    uint32_t v = le_int4(val);
1022
393k
    if (4 != hwrite(fd->fp, &v, 4))
1023
0
        return -1;
1024
1025
393k
    return 4;
1026
393k
}
1027
1028
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1029
33
int int32_get_blk(cram_block *b, int32_t *val) {
1030
33
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1031
0
        return -1;
1032
1033
33
    uint32_t v =
1034
33
         ((uint32_t) b->data[b->byte  ])        |
1035
33
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1036
33
        (((uint32_t) b->data[b->byte+2]) << 16) |
1037
33
        (((uint32_t) b->data[b->byte+3]) << 24);
1038
    // Avoid implementation-defined behaviour on negative values
1039
33
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1040
33
    BLOCK_SIZE(b) += 4;
1041
33
    return 4;
1042
33
}
1043
1044
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1045
2.32k
int int32_put_blk(cram_block *b, int32_t val) {
1046
2.32k
    unsigned char cp[4];
1047
2.32k
    uint32_t v = val;
1048
2.32k
    cp[0] = ( v      & 0xff);
1049
2.32k
    cp[1] = ((v>>8)  & 0xff);
1050
2.32k
    cp[2] = ((v>>16) & 0xff);
1051
2.32k
    cp[3] = ((v>>24) & 0xff);
1052
1053
2.32k
    BLOCK_APPEND(b, cp, 4);
1054
2.32k
    return 0;
1055
1056
0
 block_err:
1057
0
    return -1;
1058
2.32k
}
1059
1060
#ifdef HAVE_LIBDEFLATE
1061
/* ----------------------------------------------------------------------
1062
 * libdeflate compression code, with interface to match
1063
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1064
 */
1065
1066
// Named the same as the version that uses zlib as we always use libdeflate for
1067
// decompression when available.
1068
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1069
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1070
    if (!z) {
1071
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1072
        return NULL;
1073
    }
1074
1075
    uint8_t *data = NULL, *new_data;
1076
    if (!*size)
1077
        *size = csize*2;
1078
    for(;;) {
1079
        new_data = realloc(data, *size);
1080
        if (!new_data) {
1081
            hts_log_error("Memory allocation failure");
1082
            goto fail;
1083
        }
1084
        data = new_data;
1085
1086
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1087
1088
        // Auto grow output buffer size if needed and try again.
1089
        // Fortunately for all bar one call of this we know the size already.
1090
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1091
            (*size) *= 1.5;
1092
            continue;
1093
        }
1094
1095
        if (ret != LIBDEFLATE_SUCCESS) {
1096
            hts_log_error("Inflate operation failed: %d", ret);
1097
            goto fail;
1098
        } else {
1099
            break;
1100
        }
1101
    }
1102
1103
    libdeflate_free_decompressor(z);
1104
    return (char *)data;
1105
1106
 fail:
1107
    libdeflate_free_decompressor(z);
1108
    free(data);
1109
    return NULL;
1110
}
1111
1112
// Named differently as we use both zlib/libdeflate for compression.
1113
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1114
                                int level, int strat) {
1115
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1116
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1117
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1118
    if (level > 12) level = 12;
1119
1120
    if (strat == Z_RLE) // not supported by libdeflate
1121
        level = 1;
1122
1123
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1124
    if (!z) {
1125
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1126
        return NULL;
1127
    }
1128
1129
    unsigned char *cdata = NULL; /* Compressed output */
1130
    size_t cdata_alloc;
1131
    cdata = malloc(cdata_alloc = size*1.05+100);
1132
    if (!cdata) {
1133
        hts_log_error("Memory allocation failure");
1134
        libdeflate_free_compressor(z);
1135
        return NULL;
1136
    }
1137
1138
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1139
    libdeflate_free_compressor(z);
1140
1141
    if (*cdata_size == 0) {
1142
        hts_log_error("Call to libdeflate_gzip_compress failed");
1143
        free(cdata);
1144
        return NULL;
1145
    }
1146
1147
    return (char *)cdata;
1148
}
1149
1150
#else
1151
1152
/* ----------------------------------------------------------------------
1153
 * zlib compression code - from Gap5's tg_iface_g.c
1154
 * They're static here as they're only used within the cram_compress_block
1155
 * and cram_uncompress_block functions, which are the external interface.
1156
 */
1157
2
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1158
2
    z_stream s;
1159
2
    unsigned char *data = NULL; /* Uncompressed output */
1160
2
    int data_alloc = 0;
1161
2
    int err;
1162
1163
    /* Starting point at uncompressed size, and scale after that */
1164
2
    data = malloc(data_alloc = csize*1.2+100);
1165
2
    if (!data)
1166
0
        return NULL;
1167
1168
    /* Initialise zlib stream */
1169
2
    s.zalloc = Z_NULL; /* use default allocation functions */
1170
2
    s.zfree  = Z_NULL;
1171
2
    s.opaque = Z_NULL;
1172
2
    s.next_in  = (unsigned char *)cdata;
1173
2
    s.avail_in = csize;
1174
2
    s.total_in = 0;
1175
2
    s.next_out  = data;
1176
2
    s.avail_out = data_alloc;
1177
2
    s.total_out = 0;
1178
1179
    //err = inflateInit(&s);
1180
2
    err = inflateInit2(&s, 15 + 32);
1181
2
    if (err != Z_OK) {
1182
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1183
0
        free(data);
1184
0
        return NULL;
1185
0
    }
1186
1187
    /* Decode to 'data' array */
1188
2
    for (;s.avail_in;) {
1189
2
        unsigned char *data_tmp;
1190
2
        int alloc_inc;
1191
1192
2
        s.next_out = &data[s.total_out];
1193
2
        err = inflate(&s, Z_NO_FLUSH);
1194
2
        if (err == Z_STREAM_END)
1195
0
            break;
1196
1197
2
        if (err != Z_OK) {
1198
2
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1199
2
            free(data);
1200
2
            inflateEnd(&s);
1201
2
            return NULL;
1202
2
        }
1203
1204
        /* More to come, so realloc based on growth so far */
1205
0
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1206
0
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1207
0
        if (!data) {
1208
0
            free(data_tmp);
1209
0
            inflateEnd(&s);
1210
0
            return NULL;
1211
0
        }
1212
0
        s.avail_out += alloc_inc;
1213
0
    }
1214
0
    inflateEnd(&s);
1215
1216
0
    *size = s.total_out;
1217
0
    return (char *)data;
1218
2
}
1219
#endif
1220
1221
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1222
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1223
268k
                              int level, int strat) {
1224
268k
    z_stream s;
1225
268k
    unsigned char *cdata = NULL; /* Compressed output */
1226
268k
    int cdata_alloc = 0;
1227
268k
    int cdata_pos = 0;
1228
268k
    int err;
1229
1230
268k
    cdata = malloc(cdata_alloc = size*1.05+100);
1231
268k
    if (!cdata)
1232
0
        return NULL;
1233
268k
    cdata_pos = 0;
1234
1235
    /* Initialise zlib stream */
1236
268k
    s.zalloc = Z_NULL; /* use default allocation functions */
1237
268k
    s.zfree  = Z_NULL;
1238
268k
    s.opaque = Z_NULL;
1239
268k
    s.next_in  = (unsigned char *)data;
1240
268k
    s.avail_in = size;
1241
268k
    s.total_in = 0;
1242
268k
    s.next_out  = cdata;
1243
268k
    s.avail_out = cdata_alloc;
1244
268k
    s.total_out = 0;
1245
268k
    s.data_type = Z_BINARY;
1246
1247
268k
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1248
268k
    if (err != Z_OK) {
1249
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1250
0
        return NULL;
1251
0
    }
1252
1253
    /* Encode to 'cdata' array */
1254
536k
    for (;s.avail_in;) {
1255
268k
        s.next_out = &cdata[cdata_pos];
1256
268k
        s.avail_out = cdata_alloc - cdata_pos;
1257
268k
        if (cdata_alloc - cdata_pos <= 0) {
1258
0
            hts_log_error("Deflate produced larger output than expected");
1259
0
            return NULL;
1260
0
        }
1261
268k
        err = deflate(&s, Z_NO_FLUSH);
1262
268k
        cdata_pos = cdata_alloc - s.avail_out;
1263
268k
        if (err != Z_OK) {
1264
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
            break;
1266
0
        }
1267
268k
    }
1268
268k
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
268k
    *cdata_size = s.total_out;
1272
1273
268k
    if (deflateEnd(&s) != Z_OK) {
1274
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1275
0
    }
1276
268k
    return (char *)cdata;
1277
268k
}
1278
#endif
1279
1280
#ifdef HAVE_LIBLZMA
1281
/* ------------------------------------------------------------------------ */
1282
/*
1283
 * Data compression routines using liblzma (xz)
1284
 *
1285
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1286
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1287
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1288
 * as compression times.
1289
 *
1290
 * For now we disable this functionality. If it's to be reenabled make sure you
1291
 * improve the mem_inflate implementation as it's just a test hack at the
1292
 * moment.
1293
 */
1294
1295
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1296
0
                              int level) {
1297
0
    char *out;
1298
0
    size_t out_size = lzma_stream_buffer_bound(size);
1299
0
    *cdata_size = 0;
1300
1301
0
    out = malloc(out_size);
1302
1303
    /* Single call compression */
1304
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1305
0
                                           (uint8_t *)data, size,
1306
0
                                           (uint8_t *)out, cdata_size,
1307
0
                                           out_size))
1308
0
        return NULL;
1309
1310
0
    return out;
1311
0
}
1312
1313
3
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1314
3
    lzma_stream strm = LZMA_STREAM_INIT;
1315
3
    size_t out_size = 0, out_pos = 0;
1316
3
    char *out = NULL, *new_out;
1317
3
    int r;
1318
1319
    /* Initiate the decoder */
1320
3
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1321
0
        return NULL;
1322
1323
    /* Decode loop */
1324
3
    strm.avail_in = csize;
1325
3
    strm.next_in = (uint8_t *)cdata;
1326
1327
4
    for (;strm.avail_in;) {
1328
3
        if (strm.avail_in > out_size - out_pos) {
1329
3
            out_size += strm.avail_in * 4 + 32768;
1330
3
            new_out = realloc(out, out_size);
1331
3
            if (!new_out)
1332
0
                goto fail;
1333
3
            out = new_out;
1334
3
        }
1335
3
        strm.avail_out = out_size - out_pos;
1336
3
        strm.next_out = (uint8_t *)&out[out_pos];
1337
1338
3
        r = lzma_code(&strm, LZMA_RUN);
1339
3
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1340
2
            hts_log_error("LZMA decode failure (error %d)", r);
1341
2
            goto fail;
1342
2
        }
1343
1344
1
        out_pos = strm.total_out;
1345
1346
1
        if (r == LZMA_STREAM_END)
1347
0
            break;
1348
1
    }
1349
1350
    /* finish up any unflushed data; necessary? */
1351
1
    r = lzma_code(&strm, LZMA_FINISH);
1352
1
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1353
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1354
0
        goto fail;
1355
0
    }
1356
1357
1
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1358
1
    if (new_out)
1359
1
        out = new_out;
1360
1
    *size = strm.total_out;
1361
1362
1
    lzma_end(&strm);
1363
1364
1
    return out;
1365
1366
2
 fail:
1367
2
    lzma_end(&strm);
1368
2
    free(out);
1369
2
    return NULL;
1370
1
}
1371
#endif
1372
1373
/* ----------------------------------------------------------------------
1374
 * CRAM blocks - the dynamically growable data block. We have code to
1375
 * create, update, (un)compress and read/write.
1376
 *
1377
 * These are derived from the deflate_interlaced.c blocks, but with the
1378
 * CRAM extension of content types and IDs.
1379
 */
1380
1381
/*
1382
 * Allocates a new cram_block structure with a specified content_type and
1383
 * id.
1384
 *
1385
 * Returns block pointer on success
1386
 *         NULL on failure
1387
 */
1388
cram_block *cram_new_block(enum cram_content_type content_type,
1389
1.18M
                           int content_id) {
1390
1.18M
    cram_block *b = malloc(sizeof(*b));
1391
1.18M
    if (!b)
1392
0
        return NULL;
1393
1.18M
    b->method = b->orig_method = RAW;
1394
1.18M
    b->content_type = content_type;
1395
1.18M
    b->content_id = content_id;
1396
1.18M
    b->comp_size = 0;
1397
1.18M
    b->uncomp_size = 0;
1398
1.18M
    b->data = NULL;
1399
1.18M
    b->alloc = 0;
1400
1.18M
    b->byte = 0;
1401
1.18M
    b->bit = 7; // MSB
1402
1.18M
    b->crc32 = 0;
1403
1.18M
    b->idx = 0;
1404
1.18M
    b->m = NULL;
1405
1406
1.18M
    return b;
1407
1.18M
}
1408
1409
/*
1410
 * Reads a block from a cram file.
1411
 * Returns cram_block pointer on success.
1412
 *         NULL on failure
1413
 */
1414
4.59k
cram_block *cram_read_block(cram_fd *fd) {
1415
4.59k
    cram_block *b = malloc(sizeof(*b));
1416
4.59k
    unsigned char c;
1417
4.59k
    uint32_t crc = 0;
1418
4.59k
    if (!b)
1419
0
        return NULL;
1420
1421
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1422
1423
4.59k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1424
4.59k
    c = b->method; crc = crc32(crc, &c, 1);
1425
4.59k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1426
4.59k
    c = b->content_type; crc = crc32(crc, &c, 1);
1427
4.59k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1428
4.59k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1429
4.59k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1430
1431
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1432
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1433
1434
4.59k
    if (b->method == RAW) {
1435
3.21k
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1436
14
            free(b);
1437
14
            return NULL;
1438
14
        }
1439
3.20k
        b->alloc = b->uncomp_size;
1440
3.20k
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1441
3.20k
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1442
6
            free(b->data);
1443
6
            free(b);
1444
6
            return NULL;
1445
6
        }
1446
3.20k
    } else {
1447
1.37k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1448
3
            free(b);
1449
3
            return NULL;
1450
3
        }
1451
1.37k
        b->alloc = b->comp_size;
1452
1.37k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1453
1.37k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1454
5
            free(b->data);
1455
5
            free(b);
1456
5
            return NULL;
1457
5
        }
1458
1.37k
    }
1459
1460
4.56k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1461
519
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1462
0
            free(b->data);
1463
0
            free(b);
1464
0
            return NULL;
1465
0
        }
1466
1467
519
        b->crc32_checked = fd->ignore_md5;
1468
519
        b->crc_part = crc;
1469
4.04k
    } else {
1470
4.04k
        b->crc32_checked = 1; // CRC not present
1471
4.04k
    }
1472
1473
4.56k
    b->orig_method = b->method;
1474
4.56k
    b->idx = 0;
1475
4.56k
    b->byte = 0;
1476
4.56k
    b->bit = 7; // MSB
1477
1478
4.56k
    return b;
1479
4.56k
}
1480
1481
1482
/*
1483
 * Computes the size of a cram block, including the block
1484
 * header itself.
1485
 */
1486
0
uint32_t cram_block_size(cram_block *b) {
1487
0
    unsigned char dat[100], *cp = dat;;
1488
0
    uint32_t sz;
1489
1490
0
    *cp++ = b->method;
1491
0
    *cp++ = b->content_type;
1492
0
    cp += itf8_put((char*)cp, b->content_id);
1493
0
    cp += itf8_put((char*)cp, b->comp_size);
1494
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1495
1496
0
    sz = cp-dat + 4;
1497
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1498
1499
0
    return sz;
1500
0
}
1501
1502
/*
1503
 * Writes a CRAM block.
1504
 * Returns 0 on success
1505
 *        -1 on failure
1506
 */
1507
393k
int cram_write_block(cram_fd *fd, cram_block *b) {
1508
393k
    char vardata[100];
1509
393k
    int vardata_o = 0;
1510
1511
393k
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1512
1513
393k
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1514
393k
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1515
393k
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1516
393k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1517
393k
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1518
393k
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1519
0
        return -1;
1520
1521
393k
    if (b->data) {
1522
350k
        if (b->method == RAW) {
1523
297k
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1524
0
                return -1;
1525
297k
        } else {
1526
52.1k
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1527
0
                return -1;
1528
52.1k
        }
1529
350k
    } else {
1530
        // Absent blocks should be size 0
1531
43.3k
        assert(b->method == RAW && b->uncomp_size == 0);
1532
43.3k
    }
1533
1534
393k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1535
393k
        char dat[100], *cp = (char *)dat;
1536
393k
        uint32_t crc;
1537
1538
393k
        *cp++ = b->method;
1539
393k
        *cp++ = b->content_type;
1540
393k
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1541
393k
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1542
393k
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1543
393k
        crc = crc32(0L, (uc *)dat, cp-dat);
1544
1545
393k
        if (b->method == RAW) {
1546
341k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1547
341k
        } else {
1548
52.1k
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1549
52.1k
        }
1550
1551
393k
        if (-1 == int32_encode(fd, b->crc32))
1552
0
            return -1;
1553
393k
    }
1554
1555
393k
    return 0;
1556
393k
}
1557
1558
/*
1559
 * Frees a CRAM block, deallocating internal data too.
1560
 */
1561
1.62M
void cram_free_block(cram_block *b) {
1562
1.62M
    if (!b)
1563
434k
        return;
1564
1.18M
    if (b->data)
1565
800k
        free(b->data);
1566
1.18M
    free(b);
1567
1.18M
}
1568
1569
/*
1570
 * Uncompresses a CRAM block, if compressed.
1571
 */
1572
3.26k
int cram_uncompress_block(cram_block *b) {
1573
3.26k
    char *uncomp;
1574
3.26k
    size_t uncomp_size = 0;
1575
1576
3.26k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1577
    // Pretend the CRC was OK so the fuzzer doesn't have to get it right
1578
3.26k
    b->crc32_checked = 1;
1579
3.26k
#endif
1580
1581
3.26k
    if (b->crc32_checked == 0) {
1582
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1583
0
        b->crc32_checked = 1;
1584
0
        if (crc != b->crc32) {
1585
0
            hts_log_error("Block CRC32 failure");
1586
0
            return -1;
1587
0
        }
1588
0
    }
1589
1590
3.26k
    if (b->uncomp_size == 0) {
1591
        // blank block
1592
2.80k
        b->method = RAW;
1593
2.80k
        return 0;
1594
2.80k
    }
1595
3.26k
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1596
1597
460
    switch (b->method) {
1598
13
    case RAW:
1599
13
        return 0;
1600
1601
2
    case GZIP:
1602
2
        uncomp_size = b->uncomp_size;
1603
2
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1604
1605
2
        if (!uncomp)
1606
2
            return -1;
1607
0
        if (uncomp_size != b->uncomp_size) {
1608
0
            free(uncomp);
1609
0
            return -1;
1610
0
        }
1611
0
        free(b->data);
1612
0
        b->data = (unsigned char *)uncomp;
1613
0
        b->alloc = uncomp_size;
1614
0
        b->method = RAW;
1615
0
        break;
1616
1617
0
#ifdef HAVE_LIBBZ2
1618
0
    case BZIP2: {
1619
0
        unsigned int usize = b->uncomp_size;
1620
0
        if (!(uncomp = malloc(usize)))
1621
0
            return -1;
1622
0
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1623
0
                                                (char *)b->data, b->comp_size,
1624
0
                                                0, 0)) {
1625
0
            free(uncomp);
1626
0
            return -1;
1627
0
        }
1628
0
        free(b->data);
1629
0
        b->data = (unsigned char *)uncomp;
1630
0
        b->alloc = usize;
1631
0
        b->method = RAW;
1632
0
        b->uncomp_size = usize; // Just in case it differs
1633
0
        break;
1634
0
    }
1635
#else
1636
    case BZIP2:
1637
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1638
        return -1;
1639
#endif
1640
1641
0
#ifdef HAVE_LIBLZMA
1642
3
    case LZMA:
1643
3
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1644
3
        if (!uncomp)
1645
2
            return -1;
1646
1
        if (uncomp_size != b->uncomp_size) {
1647
1
            free(uncomp);
1648
1
            return -1;
1649
1
        }
1650
0
        free(b->data);
1651
0
        b->data = (unsigned char *)uncomp;
1652
0
        b->alloc = uncomp_size;
1653
0
        b->method = RAW;
1654
0
        break;
1655
#else
1656
    case LZMA:
1657
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1658
        return -1;
1659
        break;
1660
#endif
1661
1662
6
    case RANS: {
1663
6
        unsigned int usize = b->uncomp_size, usize2;
1664
6
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1665
6
        if (!uncomp)
1666
6
            return -1;
1667
0
        if (usize != usize2) {
1668
0
            free(uncomp);
1669
0
            return -1;
1670
0
        }
1671
0
        free(b->data);
1672
0
        b->data = (unsigned char *)uncomp;
1673
0
        b->alloc = usize2;
1674
0
        b->method = RAW;
1675
0
        b->uncomp_size = usize2; // Just in case it differs
1676
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1677
0
        break;
1678
0
    }
1679
1680
93
    case FQZ: {
1681
93
        uncomp_size = b->uncomp_size;
1682
93
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1683
93
        if (!uncomp)
1684
65
            return -1;
1685
28
        free(b->data);
1686
28
        b->data = (unsigned char *)uncomp;
1687
28
        b->alloc = uncomp_size;
1688
28
        b->method = RAW;
1689
28
        b->uncomp_size = uncomp_size;
1690
28
        break;
1691
93
    }
1692
1693
0
    case RANS_PR0: {
1694
0
        unsigned int usize = b->uncomp_size, usize2;
1695
0
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1696
0
        if (!uncomp)
1697
0
            return -1;
1698
0
        if (usize != usize2) {
1699
0
            free(uncomp);
1700
0
            return -1;
1701
0
        }
1702
0
        b->orig_method = RANSPR;
1703
0
        free(b->data);
1704
0
        b->data = (unsigned char *)uncomp;
1705
0
        b->alloc = usize2;
1706
0
        b->method = RAW;
1707
0
        b->uncomp_size = usize2; // Just incase it differs
1708
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1709
0
        break;
1710
0
    }
1711
1712
20
    case ARITH_PR0: {
1713
20
        unsigned int usize = b->uncomp_size, usize2;
1714
20
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1715
20
        if (!uncomp)
1716
4
            return -1;
1717
16
        if (usize != usize2) {
1718
0
            free(uncomp);
1719
0
            return -1;
1720
0
        }
1721
16
        b->orig_method = ARITH;
1722
16
        free(b->data);
1723
16
        b->data = (unsigned char *)uncomp;
1724
16
        b->alloc = usize2;
1725
16
        b->method = RAW;
1726
16
        b->uncomp_size = usize2; // Just incase it differs
1727
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1728
16
        break;
1729
16
    }
1730
1731
320
    case TOK3: {
1732
320
        uint32_t out_len;
1733
320
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1734
320
        if (!cp)
1735
318
            return -1;
1736
2
        b->orig_method = TOK3;
1737
2
        b->method = RAW;
1738
2
        free(b->data);
1739
2
        b->data = cp;
1740
2
        b->alloc = out_len;
1741
2
        b->uncomp_size = out_len;
1742
2
        break;
1743
320
    }
1744
1745
3
    default:
1746
3
        return -1;
1747
460
    }
1748
1749
46
    return 0;
1750
460
}
1751
1752
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1753
                                     int content_id, size_t *out_size,
1754
                                     enum cram_block_method_int method,
1755
742k
                                     int level, int strat) {
1756
742k
    switch (method) {
1757
74.9k
    case GZIP:
1758
125k
    case GZIP_RLE:
1759
268k
    case GZIP_1:
1760
        // Read names bizarrely benefit from zlib over libdeflate for
1761
        // mid-range compression levels.  Focusing purely of ratio or
1762
        // speed, libdeflate still wins.  It also seems to win for
1763
        // other data series too.
1764
        //
1765
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1766
#ifdef HAVE_LIBDEFLATE
1767
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1768
        if (content_id == DS_RN && level >= 4 && level <= 7)
1769
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1770
        else
1771
#  endif
1772
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1773
#else
1774
268k
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1775
0
#endif
1776
1777
0
    case BZIP2: {
1778
0
#ifdef HAVE_LIBBZ2
1779
0
        unsigned int comp_size = in_size*1.01 + 600;
1780
0
        char *comp = malloc(comp_size);
1781
0
        if (!comp)
1782
0
            return NULL;
1783
1784
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1785
0
                                              in, in_size,
1786
0
                                              level, 0, 30)) {
1787
0
            free(comp);
1788
0
            return NULL;
1789
0
        }
1790
0
        *out_size = comp_size;
1791
0
        return comp;
1792
#else
1793
        return NULL;
1794
#endif
1795
0
    }
1796
1797
0
    case FQZ:
1798
0
    case FQZ_b:
1799
0
    case FQZ_c:
1800
0
    case FQZ_d: {
1801
        // Extract the necessary portion of the slice into an fqz_slice struct.
1802
        // These previously were the same thing, but this permits us to detach
1803
        // the codec from the rest of this CRAM implementation.
1804
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1805
0
        if (!f)
1806
0
            return NULL;
1807
0
        f->num_records = s->hdr->num_records;
1808
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1809
0
        f->flags = f->len + s->hdr->num_records;
1810
0
        int i;
1811
0
        for (i = 0; i < s->hdr->num_records; i++) {
1812
0
            f->flags[i] = s->crecs[i].flags;
1813
0
            f->len[i] = (i+1 < s->hdr->num_records
1814
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1815
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1816
0
        }
1817
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1818
0
                                  in, in_size, out_size, strat >> 8, NULL);
1819
0
        free(f);
1820
0
        return comp;
1821
0
    }
1822
1823
0
    case LZMA:
1824
0
#ifdef HAVE_LIBLZMA
1825
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1826
#else
1827
        return NULL;
1828
#endif
1829
1830
0
    case RANS0:
1831
0
    case RANS1: {
1832
0
        unsigned int out_size_i;
1833
0
        unsigned char *cp;
1834
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1835
0
                           method == RANS0 ? 0 : 1);
1836
0
        *out_size = out_size_i;
1837
0
        return (char *)cp;
1838
0
    }
1839
1840
156k
    case RANS_PR0:
1841
207k
    case RANS_PR1:
1842
283k
    case RANS_PR64:
1843
334k
    case RANS_PR9:
1844
407k
    case RANS_PR128:
1845
407k
    case RANS_PR129:
1846
407k
    case RANS_PR192:
1847
458k
    case RANS_PR193: {
1848
458k
        unsigned int out_size_i;
1849
458k
        unsigned char *cp;
1850
1851
        // see enum cram_block. We map RANS_* methods to order bit-fields
1852
458k
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1853
1854
458k
        int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1];
1855
458k
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1856
458k
                                m | RANS_ORDER_SIMD_AUTO);
1857
458k
        *out_size = out_size_i;
1858
458k
        return (char *)cp;
1859
407k
    }
1860
1861
0
    case ARITH_PR0:
1862
0
    case ARITH_PR1:
1863
0
    case ARITH_PR64:
1864
0
    case ARITH_PR9:
1865
0
    case ARITH_PR128:
1866
0
    case ARITH_PR129:
1867
0
    case ARITH_PR192:
1868
0
    case ARITH_PR193: {
1869
0
        unsigned int out_size_i;
1870
0
        unsigned char *cp;
1871
1872
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1873
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1874
1875
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1876
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1877
0
        *out_size = out_size_i;
1878
0
        return (char *)cp;
1879
0
    }
1880
1881
15.4k
    case TOK3:
1882
15.4k
    case TOKA: {
1883
15.4k
        int out_len;
1884
15.4k
        int lev = level;
1885
15.4k
        if (method == TOK3 && lev > 3)
1886
15.4k
            lev = 3;
1887
15.4k
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1888
15.4k
        *out_size = out_len;
1889
15.4k
        return (char *)cp;
1890
15.4k
    }
1891
1892
0
    case RAW:
1893
0
        break;
1894
1895
0
    default:
1896
0
        return NULL;
1897
742k
    }
1898
1899
0
    return NULL;
1900
742k
}
1901
1902
/*
1903
 * A copy of cram_compress_block2 with added recursion detection.
1904
 * This is only called for error handling where the auto-tuning has failed.
1905
 * The simplest way of doing this is recusion + an additional argument, but
1906
 * we didn't want to complicate the existing code hence this is static.
1907
 */
1908
static int cram_compress_block3(cram_fd *fd, cram_slice *s,
1909
                                cram_block *b, cram_metrics *metrics,
1910
                                int method, int level,
1911
686k
                                int recurse) {
1912
1913
686k
    if (!b)
1914
42.9k
        return 0;
1915
1916
643k
    int orig_method = method;
1917
643k
    char *comp = NULL;
1918
643k
    size_t comp_size = 0;
1919
643k
    int strat;
1920
1921
    // Internally we have parameterised methods that externally map
1922
    // to the same CRAM method value.
1923
    // See enum_cram_block_method_int in cram_structs.h.
1924
643k
    int methmap[] = {
1925
        // Externally defined values
1926
643k
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1927
1928
        // Reserved for possible expansion
1929
643k
        0, 0,
1930
1931
        // Internally parameterised versions matching back to above
1932
        // external values
1933
643k
        GZIP, GZIP,
1934
643k
        FQZ, FQZ, FQZ,
1935
643k
        RANS,
1936
643k
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1937
643k
        TOK3,
1938
643k
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1939
643k
    };
1940
1941
643k
    if (b->method != RAW) {
1942
        // Maybe already compressed if s->block[0] was compressed and
1943
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1944
        // one base type present and hence using E_HUFFMAN on block 0.
1945
        // A second explicit attempt to compress the same block then
1946
        // occurs.
1947
0
        return 0;
1948
0
    }
1949
1950
643k
    if (method == -1) {
1951
2.32k
        method = 1<<GZIP;
1952
2.32k
        if (fd->use_bz2)
1953
0
            method |= 1<<BZIP2;
1954
2.32k
        if (fd->use_lzma)
1955
0
            method |= 1<<LZMA;
1956
2.32k
    }
1957
1958
643k
    if (level == -1)
1959
2.32k
        level = fd->level;
1960
1961
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1962
1963
643k
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1964
338k
        b->method = RAW;
1965
338k
        b->comp_size = b->uncomp_size;
1966
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1967
338k
        return 0;
1968
338k
    }
1969
1970
305k
#ifndef ABS
1971
305k
#    define ABS(a) ((a)>=0?(a):-(a))
1972
305k
#endif
1973
1974
305k
    if (metrics) {
1975
296k
        pthread_mutex_lock(&fd->metrics_lock);
1976
        // Sudden changes in size trigger a retrial.  These are mainly
1977
        // triggered when switching to sorted / unsorted, where the number
1978
        // of elements in a slice radically changes.
1979
        //
1980
        // We also get large fluctuations based on genome coordinate for
1981
        // e.g. SA:Z and SC series, but we consider the typical scale of
1982
        // delta between blocks and use this to look for abnormality.
1983
1984
        // Equivalent to (but minus possible integer overflow)
1985
        //   (b->uncomp_size + 1000)/4 > metrics->input_avg_sz+1000 ||
1986
        //    b->uncomp_size + 1000    < (metrics->input_avg_sz+1000)/4)
1987
296k
        if (metrics->input_avg_sz &&
1988
87.5k
            (b->uncomp_size/4 - 750 > metrics->input_avg_sz ||
1989
86.8k
             b->uncomp_size         < metrics->input_avg_sz/4 - 750) &&
1990
1.28k
            ABS(b->uncomp_size-metrics->input_avg_sz)/10
1991
1.28k
                > metrics->input_avg_delta) {
1992
81
            metrics->next_trial = 0;
1993
81
        }
1994
1995
296k
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1996
66.4k
            int m, unpackable = metrics->unpackable;
1997
66.4k
            size_t sz_best = b->uncomp_size;
1998
66.4k
            size_t sz[CRAM_MAX_METHOD] = {0};
1999
66.4k
            int method_best = 0; // RAW
2000
66.4k
            char *c_best = NULL, *c = NULL;
2001
2002
66.4k
            metrics->input_avg_delta =
2003
66.4k
                0.9 * (metrics->input_avg_delta +
2004
66.4k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2005
2006
66.4k
            metrics->input_avg_sz += b->uncomp_size*.2;
2007
66.4k
            metrics->input_avg_sz *= 0.8;
2008
2009
66.4k
            if (metrics->revised_method)
2010
32.4k
                method = metrics->revised_method;
2011
34.0k
            else
2012
34.0k
                metrics->revised_method = method;
2013
2014
66.4k
            if (metrics->next_trial <= 0) {
2015
2.10k
                metrics->next_trial = TRIAL_SPAN;
2016
2.10k
                metrics->trial = NTRIALS;
2017
69.5k
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2018
67.4k
                    metrics->sz[m] /= 2;
2019
2.10k
                metrics->unpackable = 0;
2020
2.10k
            }
2021
2022
            // Compress this block using the best method
2023
66.4k
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2024
                // No point trying bit-pack if 17+ symbols.
2025
0
                if (method & (1<<RANS_PR128))
2026
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2027
0
                if (method & (1<<RANS_PR129))
2028
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2029
0
                if (method & (1<<RANS_PR192))
2030
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2031
0
                if (method & (1<<RANS_PR193))
2032
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2033
2034
0
                if (method & (1<<ARITH_PR128))
2035
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2036
0
                if (method & (1<<ARITH_PR129))
2037
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2038
0
                if (method & (1<<ARITH_PR192))
2039
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2040
0
                if (method & (1u<<ARITH_PR193))
2041
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2042
0
            }
2043
2044
            // Libdeflate doesn't have a Z_RLE strategy.
2045
            // We treat it as level 1, but iff we haven't also
2046
            // explicitly listed that in the method list.
2047
#ifdef HAVE_LIBDEFLATE
2048
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2049
                method &= ~(1<<GZIP_RLE);
2050
#endif
2051
2052
66.4k
            pthread_mutex_unlock(&fd->metrics_lock);
2053
2054
2.19M
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2055
2.12M
                if (method & (1u<<m)) {
2056
503k
                    int lvl = level;
2057
503k
                    switch (m) {
2058
65.9k
                    case GZIP:     strat = Z_FILTERED; break;
2059
66.3k
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2060
50.7k
                    case GZIP_RLE: strat = Z_RLE; break;
2061
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2062
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2063
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2064
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2065
15.3k
                    case TOK3:     strat = 0; break;
2066
0
                    case TOKA:     strat = 1; break;
2067
304k
                    default:       strat = 0;
2068
503k
                    }
2069
2070
503k
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2071
503k
                                                b->content_id, &sz[m], m, lvl, strat);
2072
2073
503k
                    if (c && sz_best > sz[m]) {
2074
41.6k
                        sz_best = sz[m];
2075
41.6k
                        method_best = m;
2076
41.6k
                        if (c_best)
2077
21.8k
                            free(c_best);
2078
41.6k
                        c_best = c;
2079
461k
                    } else if (c) {
2080
460k
                        free(c);
2081
460k
                    } else {
2082
1.06k
                        sz[m] = UINT_MAX; // arbitrarily worse than raw
2083
1.06k
                    }
2084
1.62M
                } else {
2085
1.62M
                    sz[m] = UINT_MAX; // arbitrarily worse than raw
2086
1.62M
                }
2087
2.12M
            }
2088
2089
66.4k
            if (c_best) {
2090
19.8k
                free(b->data);
2091
19.8k
                b->data = (unsigned char *)c_best;
2092
19.8k
                b->method = method_best; // adjusted to methmap[method_best] later
2093
19.8k
                b->comp_size = sz_best;
2094
19.8k
            }
2095
2096
            // Accumulate stats for all methods tried
2097
66.4k
            pthread_mutex_lock(&fd->metrics_lock);
2098
2.19M
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2099
                // don't be overly sure on small blocks.
2100
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2101
                // needs to be at least 60 bytes smaller to overcome the
2102
                // fixed size addition.
2103
2.12M
                metrics->sz[m] += sz[m]+2000;
2104
2105
            // When enough trials performed, find the best on average
2106
66.4k
            if (--metrics->trial == 0) {
2107
16.7k
                int best_method = RAW;
2108
16.7k
                int best_sz = INT_MAX;
2109
2110
                // Relative costs of methods. See enum_cram_block_method_int
2111
                // and methmap
2112
16.7k
                double meth_cost[32] = {
2113
                    // Externally defined methods
2114
16.7k
                    1,    // 0  raw
2115
16.7k
                    1.04, // 1  gzip (Z_FILTERED)
2116
16.7k
                    1.07, // 2  bzip2
2117
16.7k
                    1.08, // 3  lzma
2118
16.7k
                    1.00, // 4  rans    (O0)
2119
16.7k
                    1.00, // 5  ranspr  (O0)
2120
16.7k
                    1.04, // 6  arithpr (O0)
2121
16.7k
                    1.05, // 7  fqz
2122
16.7k
                    1.05, // 8  tok3 (rans)
2123
16.7k
                    1.00, 1.00, // 9,10 reserved
2124
2125
                    // Paramterised versions of above
2126
16.7k
                    1.01, // gzip rle
2127
16.7k
                    1.01, // gzip -1
2128
2129
16.7k
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2130
2131
16.7k
                    1.01, // rans O1
2132
2133
16.7k
                    1.01, // rans_pr1
2134
16.7k
                    1.00, // rans_pr64; if smaller, usually fast
2135
16.7k
                    1.03, // rans_pr65/9
2136
16.7k
                    1.00, // rans_pr128
2137
16.7k
                    1.01, // rans_pr129
2138
16.7k
                    1.00, // rans_pr192
2139
16.7k
                    1.01, // rans_pr193
2140
2141
16.7k
                    1.07, // tok3 arith
2142
2143
16.7k
                    1.04, // arith_pr1
2144
16.7k
                    1.04, // arith_pr64
2145
16.7k
                    1.04, // arith_pr9
2146
16.7k
                    1.03, // arith_pr128
2147
16.7k
                    1.04, // arith_pr129
2148
16.7k
                    1.04, // arith_pr192
2149
16.7k
                    1.04, // arith_pr193
2150
16.7k
                };
2151
2152
                // Scale methods by cost based on compression level
2153
16.7k
                if (fd->level <= 1) {
2154
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2155
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2156
16.7k
                } else if (fd->level <= 3) {
2157
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2158
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2159
16.7k
                } else if (fd->level <= 6) {
2160
552k
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2161
535k
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2162
16.7k
                } else if (fd->level <= 7) {
2163
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2164
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2165
0
                } // else cost is ignored
2166
2167
                // Ensure these are never used; BSC and ZSTD
2168
16.7k
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2169
2170
552k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2171
535k
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2172
417k
                        continue;
2173
2174
118k
                    if (best_sz > metrics->sz[m])
2175
39.1k
                        best_sz = metrics->sz[m], best_method = m;
2176
118k
                }
2177
2178
16.7k
                if (best_method != metrics->method) {
2179
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2180
                    //metrics->next_trial /= 1.5;
2181
7.80k
                    metrics->consistency = 0;
2182
8.94k
                } else {
2183
8.94k
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2184
8.94k
                    metrics->consistency++;
2185
8.94k
                }
2186
2187
16.7k
                metrics->method = best_method;
2188
16.7k
                switch (best_method) {
2189
140
                case GZIP:     strat = Z_FILTERED; break;
2190
5.37k
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2191
2
                case GZIP_RLE: strat = Z_RLE; break;
2192
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2193
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2194
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2195
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2196
189
                case TOK3:     strat = 0; break;
2197
0
                case TOKA:     strat = 1; break;
2198
11.0k
                default:       strat = 0;
2199
16.7k
                }
2200
16.7k
                metrics->strat  = strat;
2201
2202
                // If we see at least MAXFAIL trials in a row for a specific
2203
                // compression method with more than MAXDELTA aggregate
2204
                // size then we drop this from the list of methods used
2205
                // for this block type.
2206
168k
#define MAXDELTA 0.20
2207
389k
#define MAXFAILS 4
2208
552k
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2209
535k
                    if (best_method == m) {
2210
16.7k
                        metrics->cnt[m] = 0;
2211
16.7k
                        metrics->extra[m] = 0;
2212
519k
                    } else if (best_sz < metrics->sz[m]) {
2213
389k
                        double r = (double)metrics->sz[m] / best_sz - 1;
2214
389k
                        int mul = 1+(fd->level>=7);
2215
389k
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2216
168k
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2217
65.3k
                            method &= ~(1u<<m);
2218
2219
                        // Special case for fqzcomp as it rarely changes
2220
389k
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2221
63.3k
                            if (metrics->sz[m] > best_sz)
2222
63.3k
                                method &= ~(1u<<m);
2223
63.3k
                        }
2224
389k
                    }
2225
535k
                }
2226
2227
                //if (fd->verbose > 1 && method != metrics->revised_method)
2228
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2229
                //          b->content_id, metrics->revised_method, method);
2230
16.7k
                metrics->revised_method = method;
2231
16.7k
            }
2232
66.4k
            pthread_mutex_unlock(&fd->metrics_lock);
2233
230k
        } else {
2234
230k
            metrics->input_avg_delta =
2235
230k
                0.9 * (metrics->input_avg_delta +
2236
230k
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2237
2238
230k
            metrics->input_avg_sz += b->uncomp_size*.2;
2239
230k
            metrics->input_avg_sz *= 0.8;
2240
2241
230k
            strat = metrics->strat;
2242
230k
            method = metrics->method;
2243
2244
230k
            pthread_mutex_unlock(&fd->metrics_lock);
2245
230k
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2246
230k
                                           b->content_id, &comp_size, method,
2247
230k
                                           method == GZIP_1 ? 1 : level,
2248
230k
                                           strat);
2249
230k
            if (!comp) {
2250
                // Our cached best method failed, but maybe another works?
2251
                // Rerun with trial mode engaged again.
2252
134
                if (!recurse) {
2253
134
                    hts_log_warning("Compressed block ID %d method %s failed, "
2254
134
                                    "redoing trial", b->content_id,
2255
134
                                    cram_block_method2str(method));
2256
134
                    pthread_mutex_lock(&fd->metrics_lock);
2257
134
                    metrics->trial = NTRIALS;
2258
134
                    metrics->next_trial = TRIAL_SPAN;
2259
134
                    metrics->revised_method = orig_method;
2260
134
                    pthread_mutex_unlock(&fd->metrics_lock);
2261
134
                    return cram_compress_block3(fd, s, b, metrics, method,
2262
134
                                                level, 1);
2263
134
                }
2264
0
                return -1;
2265
134
            }
2266
2267
230k
            if (comp_size < b->uncomp_size) {
2268
29.5k
                free(b->data);
2269
29.5k
                b->data = (unsigned char *)comp;
2270
29.5k
                b->comp_size = comp_size;
2271
29.5k
                b->method = method;
2272
200k
            } else {
2273
200k
                free(comp);
2274
200k
            }
2275
230k
        }
2276
2277
296k
    } else {
2278
        // no cached metrics, so just do zlib?
2279
8.83k
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2280
8.83k
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2281
8.83k
        if (!comp) {
2282
0
            hts_log_error("Compression failed!");
2283
0
            return -1;
2284
0
        }
2285
2286
8.83k
        if (comp_size < b->uncomp_size) {
2287
2.75k
            free(b->data);
2288
2.75k
            b->data = (unsigned char *)comp;
2289
2.75k
            b->comp_size = comp_size;
2290
2.75k
            b->method = GZIP;
2291
6.07k
        } else {
2292
6.07k
            free(comp);
2293
6.07k
        }
2294
8.83k
        strat = Z_FILTERED;
2295
8.83k
    }
2296
2297
305k
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2298
305k
                 b->content_id, b->uncomp_size, b->comp_size,
2299
305k
                 cram_block_method2str(b->method));
2300
2301
305k
    b->method = methmap[b->method];
2302
2303
305k
    return 0;
2304
305k
}
2305
2306
/*
2307
 * Compresses a block using a selection of compression codecs and options.
2308
 * The best is learnt and used for subsequent slices, periodically resampling.
2309
 *
2310
 * Method and level -1 implies defaults, as specified in cram_fd.
2311
 */
2312
int cram_compress_block2(cram_fd *fd, cram_slice *s,
2313
                         cram_block *b, cram_metrics *metrics,
2314
686k
                         int method, int level) {
2315
686k
    return cram_compress_block3(fd, s, b, metrics, method, level, 0);
2316
686k
}
2317
2318
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2319
2.32k
                        int method, int level) {
2320
2.32k
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2321
2.32k
}
2322
2323
200k
cram_metrics *cram_new_metrics(void) {
2324
200k
    cram_metrics *m = calloc(1, sizeof(*m));
2325
200k
    if (!m)
2326
0
        return NULL;
2327
200k
    m->trial = NTRIALS-1;
2328
200k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2329
200k
    m->method = RAW;
2330
200k
    m->strat = 0;
2331
200k
    m->revised_method = 0;
2332
200k
    m->unpackable = 0;
2333
2334
200k
    return m;
2335
200k
}
2336
2337
305k
char *cram_block_method2str(enum cram_block_method_int m) {
2338
305k
    switch(m) {
2339
253k
    case RAW:         return "RAW";
2340
5.73k
    case GZIP:        return "GZIP";
2341
0
    case BZIP2:       return "BZIP2";
2342
0
    case LZMA:        return "LZMA";
2343
0
    case RANS0:       return "RANS0";
2344
0
    case RANS1:       return "RANS1";
2345
21
    case GZIP_RLE:    return "GZIP_RLE";
2346
5.17k
    case GZIP_1:      return "GZIP_1";
2347
0
    case FQZ:         return "FQZ";
2348
0
    case FQZ_b:       return "FQZ_b";
2349
0
    case FQZ_c:       return "FQZ_c";
2350
0
    case FQZ_d:       return "FQZ_d";
2351
1.39k
    case RANS_PR0:    return "RANS_PR0";
2352
409
    case RANS_PR1:    return "RANS_PR1";
2353
17.3k
    case RANS_PR64:   return "RANS_PR64";
2354
0
    case RANS_PR9:    return "RANS_PR9";
2355
21.5k
    case RANS_PR128:  return "RANS_PR128";
2356
0
    case RANS_PR129:  return "RANS_PR129";
2357
0
    case RANS_PR192:  return "RANS_PR192";
2358
467
    case RANS_PR193:  return "RANS_PR193";
2359
135
    case TOK3:        return "TOK3_R";
2360
0
    case TOKA:        return "TOK3_A";
2361
0
    case ARITH_PR0:   return "ARITH_PR0";
2362
0
    case ARITH_PR1:   return "ARITH_PR1";
2363
0
    case ARITH_PR64:  return "ARITH_PR64";
2364
0
    case ARITH_PR9:   return "ARITH_PR9";
2365
0
    case ARITH_PR128: return "ARITH_PR128";
2366
0
    case ARITH_PR129: return "ARITH_PR129";
2367
0
    case ARITH_PR192: return "ARITH_PR192";
2368
0
    case ARITH_PR193: return "ARITH_PR193";
2369
0
    case BM_ERROR: break;
2370
305k
    }
2371
0
    return "?";
2372
305k
}
2373
2374
4
char *cram_content_type2str(enum cram_content_type t) {
2375
4
    switch (t) {
2376
0
    case FILE_HEADER:         return "FILE_HEADER";
2377
2
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2378
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2379
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2380
0
    case EXTERNAL:            return "EXTERNAL";
2381
0
    case CORE:                return "CORE";
2382
0
    case CT_ERROR:            break;
2383
4
    }
2384
2
    return "?";
2385
4
}
2386
2387
/* ----------------------------------------------------------------------
2388
 * Reference sequence handling
2389
 *
2390
 * These revolve around the refs_t structure, which may potentially be
2391
 * shared between multiple cram_fd.
2392
 *
2393
 * We start with refs_create() to allocate an empty refs_t and then
2394
 * populate it with @SQ line data using refs_from_header(). This is done on
2395
 * cram_open().  Also at start up we can call cram_load_reference() which
2396
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2397
 * new one specified. In either case refs2id() is then called which
2398
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2399
 *
2400
 * Later, possibly within a thread, we will want to know the actual ref
2401
 * seq itself, obtained by calling cram_get_ref().  This may use the
2402
 * UR: or M5: fields or the filename specified in the original
2403
 * cram_load_reference() call.
2404
 *
2405
 * Given the potential for multi-threaded reference usage, we have
2406
 * reference counting (sorry for the confusing double use of "ref") to
2407
 * track the number of callers interested in any specific reference.
2408
 */
2409
2410
/*
2411
 * Frees/unmaps a reference sequence and associated file handles.
2412
 */
2413
2.68k
static void ref_entry_free_seq(ref_entry *e) {
2414
2.68k
    if (e->mf)
2415
0
        mfclose(e->mf);
2416
2.68k
    if (e->seq && !e->mf)
2417
0
        free(e->seq);
2418
2419
2.68k
    e->seq = NULL;
2420
2.68k
    e->mf = NULL;
2421
2.68k
}
2422
2423
4.12k
void refs_free(refs_t *r) {
2424
4.12k
    RP("refs_free()\n");
2425
2426
4.12k
    if (--r->count > 0)
2427
0
        return;
2428
2429
4.12k
    if (!r)
2430
0
        return;
2431
2432
4.12k
    if (r->pool)
2433
4.12k
        string_pool_destroy(r->pool);
2434
2435
4.12k
    if (r->h_meta) {
2436
4.12k
        khint_t k;
2437
2438
12.1k
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2439
8.03k
            ref_entry *e;
2440
2441
8.03k
            if (!kh_exist(r->h_meta, k))
2442
5.35k
                continue;
2443
2.68k
            if (!(e = kh_val(r->h_meta, k)))
2444
0
                continue;
2445
2.68k
            ref_entry_free_seq(e);
2446
2.68k
            free(e);
2447
2.68k
        }
2448
2449
4.12k
        kh_destroy(refs, r->h_meta);
2450
4.12k
    }
2451
2452
4.12k
    if (r->ref_id)
2453
2.83k
        free(r->ref_id);
2454
2455
4.12k
    if (r->fp)
2456
0
        bgzf_close(r->fp);
2457
2458
4.12k
    pthread_mutex_destroy(&r->lock);
2459
2460
4.12k
    free(r);
2461
4.12k
}
2462
2463
4.12k
static refs_t *refs_create(void) {
2464
4.12k
    refs_t *r = calloc(1, sizeof(*r));
2465
2466
4.12k
    RP("refs_create()\n");
2467
2468
4.12k
    if (!r)
2469
0
        return NULL;
2470
2471
4.12k
    if (!(r->pool = string_pool_create(8192)))
2472
0
        goto err;
2473
2474
4.12k
    r->ref_id = NULL; // see refs2id() to populate.
2475
4.12k
    r->count = 1;
2476
4.12k
    r->last = NULL;
2477
4.12k
    r->last_id = -1;
2478
2479
4.12k
    if (!(r->h_meta = kh_init(refs)))
2480
0
        goto err;
2481
2482
4.12k
    pthread_mutex_init(&r->lock, NULL);
2483
2484
4.12k
    return r;
2485
2486
0
 err:
2487
0
    refs_free(r);
2488
0
    return NULL;
2489
4.12k
}
2490
2491
/*
2492
 * Opens a reference fasta file as a BGZF stream, allowing for
2493
 * compressed files.  It automatically builds a .fai file if
2494
 * required and if compressed a .gzi bgzf index too.
2495
 *
2496
 * Returns a BGZF handle on success;
2497
 *         NULL on failure.
2498
 */
2499
1.05k
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2500
1.05k
    BGZF *fp;
2501
2502
1.05k
    if (strncmp(fn, "file://", 7) == 0)
2503
0
        fn += 7;
2504
2505
1.05k
    if (!is_md5 && !hisremote(fn)) {
2506
608
        char fai_file[PATH_MAX];
2507
2508
608
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2509
608
        if (access(fai_file, R_OK) != 0)
2510
608
            if (fai_build(fn) != 0)
2511
608
                return NULL;
2512
608
    }
2513
2514
443
    if (!(fp = bgzf_open(fn, mode))) {
2515
443
        perror(fn);
2516
443
        return NULL;
2517
443
    }
2518
2519
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2520
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2521
0
        bgzf_close(fp);
2522
0
        return NULL;
2523
0
    }
2524
2525
0
    return fp;
2526
0
}
2527
2528
/*
2529
 * Loads a FAI file for a reference.fasta.
2530
 * "is_err" indicates whether failure to load is worthy of emitting an
2531
 * error message. In some cases (eg with embedded references) we
2532
 * speculatively load, just in case, and silently ignore errors.
2533
 *
2534
 * Returns the refs_t struct on success (maybe newly allocated);
2535
 *         NULL on failure
2536
 */
2537
1.05k
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2538
1.05k
    hFILE *fp = NULL;
2539
1.05k
    char fai_fn[PATH_MAX];
2540
1.05k
    char line[8192];
2541
1.05k
    refs_t *r = r_orig;
2542
1.05k
    size_t fn_l = strlen(fn);
2543
1.05k
    int id = 0, id_alloc = 0;
2544
2545
1.05k
    RP("refs_load_fai %s\n", fn);
2546
2547
1.05k
    if (!r)
2548
0
        if (!(r = refs_create()))
2549
0
            goto err;
2550
2551
1.05k
    if (r->fp)
2552
0
        if (bgzf_close(r->fp) != 0)
2553
0
            goto err;
2554
1.05k
    r->fp = NULL;
2555
2556
    /* Look for a FASTA##idx##FAI format */
2557
1.05k
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2558
1.05k
    if (fn_delim) {
2559
24
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2560
0
            goto err;
2561
24
        fn_delim += strlen(HTS_IDX_DELIM);
2562
24
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2563
1.02k
    } else {
2564
        /* An index file was provided, instead of the actual reference file */
2565
1.02k
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2566
0
            if (!r->fn) {
2567
0
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2568
0
                    goto err;
2569
0
            }
2570
0
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2571
1.02k
        } else {
2572
        /* Only the reference file provided. Get the index file name from it */
2573
1.02k
            if (!(r->fn = string_dup(r->pool, fn)))
2574
0
                goto err;
2575
1.02k
            snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn);
2576
1.02k
        }
2577
1.02k
    }
2578
2579
1.05k
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2580
1.05k
        hts_log_error("Failed to open reference file '%s'", r->fn);
2581
1.05k
        goto err;
2582
1.05k
    }
2583
2584
0
    if (!(fp = hopen(fai_fn, "r"))) {
2585
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2586
0
        if (is_err)
2587
0
            perror(fai_fn);
2588
0
        goto err;
2589
0
    }
2590
0
    while (hgets(line, 8192, fp) != NULL) {
2591
0
        ref_entry *e = malloc(sizeof(*e));
2592
0
        char *cp;
2593
0
        int n;
2594
0
        khint_t k;
2595
2596
0
        if (!e)
2597
0
            return NULL;
2598
2599
        // id
2600
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2601
0
            ;
2602
0
        *cp++ = 0;
2603
0
        e->name = string_dup(r->pool, line);
2604
2605
        // length
2606
0
        while (*cp && isspace_c(*cp))
2607
0
            cp++;
2608
0
        e->length = strtoll(cp, &cp, 10);
2609
2610
        // offset
2611
0
        while (*cp && isspace_c(*cp))
2612
0
            cp++;
2613
0
        e->offset = strtoll(cp, &cp, 10);
2614
2615
        // bases per line
2616
0
        while (*cp && isspace_c(*cp))
2617
0
            cp++;
2618
0
        e->bases_per_line = strtol(cp, &cp, 10);
2619
2620
        // line length
2621
0
        while (*cp && isspace_c(*cp))
2622
0
            cp++;
2623
0
        e->line_length = strtol(cp, &cp, 10);
2624
2625
        // filename
2626
0
        e->fn = r->fn;
2627
2628
0
        e->count = 0;
2629
0
        e->seq = NULL;
2630
0
        e->mf = NULL;
2631
0
        e->is_md5 = 0;
2632
0
        e->validated_md5 = 0;
2633
2634
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2635
0
        if (-1 == n)  {
2636
0
            free(e);
2637
0
            return NULL;
2638
0
        }
2639
2640
0
        if (n) {
2641
0
            kh_val(r->h_meta, k) = e;
2642
0
        } else {
2643
0
            ref_entry *re = kh_val(r->h_meta, k);
2644
0
            if (re && (re->count != 0 || re->length != 0)) {
2645
                /* Keep old */
2646
0
                free(e);
2647
0
            } else {
2648
                /* Replace old */
2649
0
                if (re)
2650
0
                    free(re);
2651
0
                kh_val(r->h_meta, k) = e;
2652
0
            }
2653
0
        }
2654
2655
0
        if (id >= id_alloc) {
2656
0
            ref_entry **new_refs;
2657
0
            int x;
2658
2659
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2660
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2661
0
            if (!new_refs)
2662
0
                goto err;
2663
0
            r->ref_id = new_refs;
2664
2665
0
            for (x = id; x < id_alloc; x++)
2666
0
                r->ref_id[x] = NULL;
2667
0
        }
2668
0
        r->ref_id[id] = e;
2669
0
        r->nref = ++id;
2670
0
    }
2671
2672
0
    if(hclose(fp) < 0)
2673
0
        goto err;
2674
0
    return r;
2675
2676
1.05k
 err:
2677
1.05k
    if (fp)
2678
0
        hclose_abruptly(fp);
2679
2680
1.05k
    if (!r_orig)
2681
0
        refs_free(r);
2682
2683
1.05k
    return NULL;
2684
0
}
2685
2686
/*
2687
 * Verifies that the CRAM @SQ lines and .fai files match.
2688
 */
2689
0
static void sanitise_SQ_lines(cram_fd *fd) {
2690
0
    int i;
2691
2692
0
    if (!fd->header || !fd->header->hrecs)
2693
0
        return;
2694
2695
0
    if (!fd->refs || !fd->refs->h_meta)
2696
0
        return;
2697
2698
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2699
0
        const char *name = fd->header->hrecs->ref[i].name;
2700
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2701
0
        ref_entry *r;
2702
2703
        // We may have @SQ lines which have no known .fai, but do not
2704
        // in themselves pose a problem because they are unused in the file.
2705
0
        if (k == kh_end(fd->refs->h_meta))
2706
0
            continue;
2707
2708
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2709
0
            continue;
2710
2711
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2712
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2713
2714
            // Should we also check MD5sums here to ensure the correct
2715
            // reference was given?
2716
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2717
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2718
2719
            // Fixing the parsed @SQ header will make MD:Z: strings work
2720
            // and also stop it producing N for the sequence.
2721
0
            fd->header->hrecs->ref[i].len = r->length;
2722
0
        }
2723
0
    }
2724
0
}
2725
2726
/*
2727
 * Indexes references by the order they appear in a BAM file. This may not
2728
 * necessarily be the same order they appear in the fasta reference file.
2729
 *
2730
 * Returns 0 on success
2731
 *        -1 on failure
2732
 */
2733
2.32k
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2734
2.32k
    int i;
2735
2.32k
    sam_hrecs_t *h = hdr->hrecs;
2736
2737
2.32k
    if (r->ref_id)
2738
1.34k
        free(r->ref_id);
2739
2.32k
    if (r->last)
2740
0
        r->last = NULL;
2741
2742
2.32k
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2743
2.32k
    if (!r->ref_id)
2744
0
        return -1;
2745
2746
2.32k
    r->nref = h->nref;
2747
4.00k
    for (i = 0; i < h->nref; i++) {
2748
1.68k
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2749
1.68k
        if (k != kh_end(r->h_meta)) {
2750
1.68k
            r->ref_id[i] = kh_val(r->h_meta, k);
2751
1.68k
        } else {
2752
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2753
0
        }
2754
1.68k
    }
2755
2756
2.32k
    return 0;
2757
2.32k
}
2758
2759
/*
2760
 * Generates refs_t entries based on @SQ lines in the header.
2761
 * Returns 0 on success
2762
 *         -1 on failure
2763
 */
2764
8.88k
static int refs_from_header(cram_fd *fd) {
2765
8.88k
    if (!fd)
2766
0
        return -1;
2767
2768
8.88k
    refs_t *r = fd->refs;
2769
8.88k
    if (!r)
2770
0
        return -1;
2771
2772
8.88k
    sam_hdr_t *h = fd->header;
2773
8.88k
    if (!h)
2774
2.54k
        return 0;
2775
2776
6.34k
    if (!h->hrecs) {
2777
3.44k
        if (-1 == sam_hdr_fill_hrecs(h))
2778
114
            return -1;
2779
3.44k
    }
2780
2781
6.22k
    if (h->hrecs->nref == 0)
2782
3.01k
        return 0;
2783
2784
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2785
2786
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2787
3.21k
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2788
3.21k
    if (!new_ref_id)
2789
0
        return -1;
2790
3.21k
    r->ref_id = new_ref_id;
2791
2792
3.21k
    int i, j;
2793
    /* Copy info from h->ref[i] over to r */
2794
7.57k
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2795
4.36k
        sam_hrec_type_t *ty;
2796
4.36k
        sam_hrec_tag_t *tag;
2797
4.36k
        khint_t k;
2798
4.36k
        int n;
2799
2800
4.36k
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2801
4.36k
        if (k != kh_end(r->h_meta))
2802
            // Ref already known about
2803
1.68k
            continue;
2804
2805
2.68k
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2806
0
            return -1;
2807
2808
2.68k
        if (!h->hrecs->ref[i].name)
2809
0
            return -1;
2810
2811
2.68k
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2812
2.68k
        if (!r->ref_id[j]->name) return -1;
2813
2.68k
        r->ref_id[j]->length = 0; // marker for not yet loaded
2814
2815
        /* Initialise likely filename if known */
2816
2.68k
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2817
2.68k
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL)))
2818
32
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2819
2820
2.68k
            if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) {
2821
                // LN tag used when constructing consensus reference
2822
2.68k
                r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0);
2823
                // See fuzz 382922241
2824
2.68k
                if (r->ref_id[j]->LN_length < 0)
2825
476
                    r->ref_id[j]->LN_length = 0;
2826
2.68k
            }
2827
2.68k
        }
2828
2829
2.68k
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2830
2.68k
        if (n <= 0) // already exists or error
2831
0
            return -1;
2832
2.68k
        kh_val(r->h_meta, k) = r->ref_id[j];
2833
2834
2.68k
        j++;
2835
2.68k
    }
2836
3.21k
    r->nref = j;
2837
2838
3.21k
    return 0;
2839
3.21k
}
2840
2841
/*
2842
 * Attaches a header to a cram_fd.
2843
 *
2844
 * This should be used when creating a new cram_fd for writing where
2845
 * we have a header already constructed (eg from a file we've read
2846
 * in).
2847
 */
2848
2.43k
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2849
2.43k
    if (!fd || !hdr )
2850
0
        return -1;
2851
2852
2.43k
    if (fd->header != hdr) {
2853
2.43k
        if (fd->header)
2854
0
            sam_hdr_destroy(fd->header);
2855
2.43k
        fd->header = sam_hdr_dup(hdr);
2856
2.43k
        if (!fd->header)
2857
0
            return -1;
2858
2.43k
    }
2859
2.43k
    return refs_from_header(fd);
2860
2.43k
}
2861
2862
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2863
0
    return cram_set_header2(fd, hdr);
2864
0
}
2865
2866
/*
2867
 * Returns whether the path refers to a directory.
2868
 */
2869
0
static int is_directory(char *fn) {
2870
0
    struct stat buf;
2871
0
    if ( stat(fn,&buf) ) return 0;
2872
0
    return S_ISDIR(buf.st_mode);
2873
0
}
2874
2875
/*
2876
 * Converts a directory and a filename into an expanded path, replacing %s
2877
 * in directory with the filename and %[0-9]+s with portions of the filename
2878
 * Any remaining parts of filename are added to the end with /%s.
2879
 */
2880
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2881
0
    char *cp, *start = path;
2882
0
    size_t len;
2883
0
    size_t sz = PATH_MAX;
2884
2885
0
    while ((cp = strchr(dir, '%'))) {
2886
0
        if (cp-dir >= sz) return -1;
2887
0
        strncpy(path, dir, cp-dir);
2888
0
        path += cp-dir;
2889
0
        sz -= cp-dir;
2890
2891
0
        if (*++cp == 's') {
2892
0
            len = strlen(fn);
2893
0
            if (len >= sz) return -1;
2894
0
            strcpy(path, fn);
2895
0
            path += len;
2896
0
            sz -= len;
2897
0
            fn += len;
2898
0
            cp++;
2899
0
        } else if (*cp >= '0' && *cp <= '9') {
2900
0
            char *endp;
2901
0
            long l;
2902
2903
0
            l = strtol(cp, &endp, 10);
2904
0
            l = MIN(l, strlen(fn));
2905
0
            if (*endp == 's') {
2906
0
                if (l >= sz) return -1;
2907
0
                strncpy(path, fn, l);
2908
0
                path += l;
2909
0
                fn += l;
2910
0
                sz -= l;
2911
0
                *path = 0;
2912
0
                cp = endp+1;
2913
0
            } else {
2914
0
                if (sz < 3) return -1;
2915
0
                *path++ = '%';
2916
0
                *path++ = *cp++;
2917
0
            }
2918
0
        } else {
2919
0
            if (sz < 3) return -1;
2920
0
            *path++ = '%';
2921
0
            *path++ = *cp++;
2922
0
        }
2923
0
        dir = cp;
2924
0
    }
2925
2926
0
    len = strlen(dir);
2927
0
    if (len >= sz) return -1;
2928
0
    strcpy(path, dir);
2929
0
    path += len;
2930
0
    sz -= len;
2931
2932
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2933
0
    if (len >= sz) return -1;
2934
0
    if (*fn && path > start && path[-1] != '/')
2935
0
        *path++ = '/';
2936
0
    strcpy(path, fn);
2937
0
    return 0;
2938
0
}
2939
2940
/*
2941
 * Make the directory containing path and any prefix directories.
2942
 */
2943
0
static void mkdir_prefix(char *path, int mode) {
2944
0
    char *cp = strrchr(path, '/');
2945
0
    if (!cp)
2946
0
        return;
2947
2948
0
    *cp = 0;
2949
0
    if (is_directory(path)) {
2950
0
        *cp = '/';
2951
0
        return;
2952
0
    }
2953
2954
0
    if (mkdir(path, mode) == 0) {
2955
0
        chmod(path, mode);
2956
0
        *cp = '/';
2957
0
        return;
2958
0
    }
2959
2960
0
    mkdir_prefix(path, mode);
2961
0
    mkdir(path, mode);
2962
0
    chmod(path, mode);
2963
0
    *cp = '/';
2964
0
}
2965
2966
/*
2967
 * Queries the M5 string from the header and attempts to populate the
2968
 * reference from this using the REF_PATH environment.
2969
 *
2970
 * Returns 0 on success
2971
 *        -1 on failure
2972
 */
2973
1.58k
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2974
1.58k
    char *ref_path = getenv("REF_PATH");
2975
1.58k
    sam_hrec_type_t *ty;
2976
1.58k
    sam_hrec_tag_t *tag;
2977
1.58k
    char path[PATH_MAX];
2978
1.58k
    kstring_t path_tmp = KS_INITIALIZE;
2979
1.58k
    char *local_cache = getenv("REF_CACHE");
2980
1.58k
    mFILE *mf;
2981
1.58k
    int local_path = 0;
2982
2983
1.58k
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2984
2985
1.58k
    if (!r->name)
2986
0
        return -1;
2987
2988
1.58k
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2989
0
        return -1;
2990
2991
1.58k
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2992
1.56k
        goto no_M5;
2993
2994
25
    hts_log_info("Querying ref %s", tag->str+3);
2995
2996
    /* Use cache if available */
2997
25
    if (local_cache && *local_cache) {
2998
0
        struct stat sb;
2999
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0 &&
3000
0
            stat(path, &sb) == 0)
3001
            // Found it in the local cache
3002
0
            local_path = 1;
3003
0
    }
3004
3005
#ifndef HAVE_MMAP
3006
    char *path2;
3007
    /* Search local files in REF_PATH; we can open them and return as above */
3008
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3009
        int len = snprintf(path, PATH_MAX, "%s", path2);
3010
        free(path2);
3011
        if (len > 0 && len < PATH_MAX) // in case it's too long
3012
            local_path = 1;
3013
    }
3014
#endif
3015
3016
    /* Found via REF_CACHE or local REF_PATH file */
3017
25
    if (local_path) {
3018
0
        struct stat sb;
3019
0
        BGZF *fp;
3020
3021
0
        if (0 == stat(path, &sb)
3022
0
            && S_ISREG(sb.st_mode)
3023
0
            && (fp = bgzf_open(path, "r"))) {
3024
0
            r->length = sb.st_size;
3025
0
            r->offset = r->line_length = r->bases_per_line = 0;
3026
3027
0
            r->fn = string_dup(fd->refs->pool, path);
3028
3029
0
            if (fd->refs->fp)
3030
0
                if (bgzf_close(fd->refs->fp) != 0)
3031
0
                    return -1;
3032
0
            fd->refs->fp = fp;
3033
0
            fd->refs->fn = r->fn;
3034
0
            r->is_md5 = 1;
3035
0
            r->validated_md5 = 1;
3036
3037
            // Fall back to cram_get_ref() where it'll do the actual
3038
            // reading of the file.
3039
0
            return 0;
3040
0
        }
3041
0
    }
3042
3043
3044
    /* Otherwise search full REF_PATH; slower as loads entire file */
3045
25
    int is_local = 0;
3046
25
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL, &is_local))) {
3047
0
        size_t sz;
3048
0
        r->seq = mfsteal(mf, &sz);
3049
0
        if (r->seq) {
3050
0
            r->mf = NULL;
3051
0
        } else {
3052
            // keep mf around as we couldn't detach
3053
0
            r->seq = mf->data;
3054
0
            r->mf = mf;
3055
0
        }
3056
0
        r->length = sz;
3057
0
        r->is_md5 = 1;
3058
0
        r->validated_md5 = 1;
3059
25
    } else {
3060
25
        refs_t *refs;
3061
25
        const char *fn;
3062
25
        sam_hrec_tag_t *UR_tag;
3063
3064
1.58k
    no_M5:
3065
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3066
1.58k
        if (!(UR_tag = sam_hrecs_find_key(ty, "UR", NULL)))
3067
528
            return -1;
3068
3069
1.05k
        if (strstr(UR_tag->str+3, "://") &&
3070
25
            strncmp(UR_tag->str+3, "file:", 5) != 0) {
3071
            // Documented as omitted, but accidentally supported until now
3072
7
            hts_log_error("UR tags pointing to remote files are not supported");
3073
7
            return -1;
3074
7
        }
3075
3076
1.05k
        fn = (strncmp(UR_tag->str+3, "file:", 5) == 0)
3077
1.05k
            ? UR_tag->str+8
3078
1.05k
            : UR_tag->str+3;
3079
3080
1.05k
        if (fd->refs->fp) {
3081
0
            if (bgzf_close(fd->refs->fp) != 0)
3082
0
                return -1;
3083
0
            fd->refs->fp = NULL;
3084
0
        }
3085
1.05k
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3086
1.05k
            return -1;
3087
0
        sanitise_SQ_lines(fd);
3088
3089
0
        fd->refs = refs;
3090
0
        if (fd->refs->fp) {
3091
0
            if (bgzf_close(fd->refs->fp) != 0)
3092
0
                return -1;
3093
0
            fd->refs->fp = NULL;
3094
0
        }
3095
3096
0
        if (!fd->refs->fn)
3097
0
            return -1;
3098
3099
0
        if (-1 == refs2id(fd->refs, fd->header))
3100
0
            return -1;
3101
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3102
0
            return -1;
3103
3104
        // Local copy already, so fall back to cram_get_ref().
3105
0
        return 0;
3106
0
    }
3107
3108
    /* Populate the local disk cache if required */
3109
0
    if (!is_local && local_cache && *local_cache) {
3110
0
        hFILE *fp;
3111
3112
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3113
0
            return 0; // Not fatal - we have the data already so keep going.
3114
0
        }
3115
0
        hts_log_info("Writing cache file '%s'", path);
3116
0
        mkdir_prefix(path, 01777);
3117
3118
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3119
0
        if (!fp) {
3120
0
            perror(path_tmp.s);
3121
0
            free(path_tmp.s);
3122
3123
            // Not fatal - we have the data already so keep going.
3124
0
            return 0;
3125
0
        }
3126
3127
        // Check md5sum
3128
0
        hts_md5_context *md5;
3129
0
        char unsigned md5_buf1[16];
3130
0
        char md5_buf2[33];
3131
3132
0
        if (!(md5 = hts_md5_init())) {
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
0
        hts_md5_update(md5, r->seq, r->length);
3139
0
        hts_md5_final(md5_buf1, md5);
3140
0
        hts_md5_destroy(md5);
3141
0
        hts_md5_hex(md5_buf2, md5_buf1);
3142
3143
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3144
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3145
0
            hclose_abruptly(fp);
3146
0
            unlink(path_tmp.s);
3147
0
            free(path_tmp.s);
3148
0
            return -1;
3149
0
        }
3150
3151
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3152
0
        if (hclose(fp) < 0 || length_written != r->length ||
3153
0
            chmod(path_tmp.s, 0444) < 0 ||
3154
0
            rename(path_tmp.s, path) < 0) {
3155
0
            hts_log_error("Creating reference at %s failed: %s",
3156
0
                          path, strerror(errno));
3157
0
            unlink(path_tmp.s);
3158
0
        }
3159
0
    }
3160
3161
0
    free(path_tmp.s);
3162
0
    return 0;
3163
0
}
3164
3165
918
static void cram_ref_incr_locked(refs_t *r, int id) {
3166
918
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3167
918
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3168
918
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3169
3170
918
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3171
918
        return;
3172
3173
0
    if (r->last_id == id)
3174
0
        r->last_id = -1;
3175
3176
0
    ++r->ref_id[id]->count;
3177
0
}
3178
3179
918
void cram_ref_incr(refs_t *r, int id) {
3180
918
    pthread_mutex_lock(&r->lock);
3181
918
    cram_ref_incr_locked(r, id);
3182
918
    pthread_mutex_unlock(&r->lock);
3183
918
}
3184
3185
51
static void cram_ref_decr_locked(refs_t *r, int id) {
3186
51
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3187
51
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3188
51
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3189
3190
51
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3191
51
        return;
3192
51
    }
3193
3194
0
    if (--r->ref_id[id]->count <= 0) {
3195
0
        assert(r->ref_id[id]->count == 0);
3196
0
        if (r->last_id >= 0) {
3197
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3198
0
                r->ref_id[r->last_id]->seq) {
3199
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3200
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3201
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3202
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3203
0
            }
3204
0
        }
3205
0
        r->last_id = id;
3206
0
    }
3207
0
}
3208
3209
51
void cram_ref_decr(refs_t *r, int id) {
3210
51
    pthread_mutex_lock(&r->lock);
3211
51
    cram_ref_decr_locked(r, id);
3212
51
    pthread_mutex_unlock(&r->lock);
3213
51
}
3214
3215
/*
3216
 * Used by cram_ref_load and cram_get_ref. The file handle will have
3217
 * already been opened, so we can catch it. The ref_entry *e informs us
3218
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3219
 * Either way we create a single contiguous sequence.
3220
 *
3221
 * Returns all or part of a reference sequence on success (malloced);
3222
 *         NULL on failure.
3223
 */
3224
static char *load_ref_portion(BGZF *fp, ref_entry *e,
3225
0
                              hts_pos_t start, hts_pos_t end) {
3226
0
    off_t offset, len;
3227
0
    char *seq;
3228
3229
0
    if (end < start)
3230
0
        end = start;
3231
3232
    /*
3233
     * Compute locations in file. This is trivial for the MD5 files, but
3234
     * is still necessary for the fasta variants.
3235
     *
3236
     * Note the offset here, as with faidx, has the assumption that white-
3237
     * space (the diff between line_length and bases_per_line) only occurs
3238
     * at the end of a line of text.
3239
     */
3240
0
    offset = e->line_length
3241
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3242
0
          (start-1) % e->bases_per_line
3243
0
        : start-1;
3244
3245
0
    len = (e->line_length
3246
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3247
0
             (end-1) % e->bases_per_line
3248
0
           : end-1) - offset + 1;
3249
3250
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3251
0
        perror("bgzf_useek() on reference file");
3252
0
        return NULL;
3253
0
    }
3254
3255
0
    if (len == 0 || !(seq = malloc(len))) {
3256
0
        return NULL;
3257
0
    }
3258
3259
0
    if (len != bgzf_read(fp, seq, len)) {
3260
0
        perror("bgzf_read() on reference file");
3261
0
        free(seq);
3262
0
        return NULL;
3263
0
    }
3264
3265
    /* Strip white-space if required. */
3266
0
    if (len != end-start+1) {
3267
0
        hts_pos_t i, j;
3268
0
        char *cp = seq;
3269
0
        char *cp_to;
3270
3271
        // Copy up to the first white-space, and then repeatedly just copy
3272
        // bases_per_line verbatim, and use the slow method to end again.
3273
        //
3274
        // This may seem excessive, but this code can be a significant
3275
        // portion of total CRAM decode CPU time for shallow data sets.
3276
0
        for (i = j = 0; i < len; i++) {
3277
0
            if (!isspace_c(cp[i]))
3278
0
                cp[j++] = cp[i] & ~0x20;
3279
0
            else
3280
0
                break;
3281
0
        }
3282
0
        while (i < len && isspace_c(cp[i]))
3283
0
            i++;
3284
0
        while (i < len - e->line_length) {
3285
0
            hts_pos_t j_end = j + e->bases_per_line;
3286
0
            while (j < j_end)
3287
0
                cp[j++] = cp[i++] & ~0x20; // toupper equiv
3288
0
            i += e->line_length - e->bases_per_line;
3289
0
        }
3290
0
        for (; i < len; i++) {
3291
0
            if (!isspace_c(cp[i]))
3292
0
                cp[j++] = cp[i] & ~0x20;
3293
0
        }
3294
3295
0
        cp_to = cp+j;
3296
3297
0
        if (cp_to - seq != end-start+1) {
3298
0
            hts_log_error("Malformed reference file");
3299
0
            free(seq);
3300
0
            return NULL;
3301
0
        }
3302
0
    } else {
3303
0
        int i;
3304
0
        for (i = 0; i < len; i++) {
3305
0
            seq[i] = toupper_c(seq[i]);
3306
0
        }
3307
0
    }
3308
3309
0
    return seq;
3310
0
}
3311
3312
/*
3313
 * Load the entire reference 'id'.
3314
 * This also increments the reference count by 1.
3315
 *
3316
 * Returns ref_entry on success;
3317
 *         NULL on failure
3318
 */
3319
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3320
0
    ref_entry *e = r->ref_id[id];
3321
0
    hts_pos_t start = 1, end = e->length;
3322
0
    char *seq;
3323
3324
0
    if (e->seq) {
3325
0
        return e;
3326
0
    }
3327
3328
0
    assert(e->count == 0);
3329
3330
0
    if (r->last) {
3331
#ifdef REF_DEBUG
3332
        int idx = 0;
3333
        for (idx = 0; idx < r->nref; idx++)
3334
            if (r->last == r->ref_id[idx])
3335
                break;
3336
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3337
#endif
3338
0
        assert(r->last->count > 0);
3339
0
        if (--r->last->count <= 0) {
3340
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3341
0
            if (r->last->seq)
3342
0
                ref_entry_free_seq(r->last);
3343
0
        }
3344
0
    }
3345
3346
0
    if (!r->fn)
3347
0
        return NULL;
3348
3349
    /* Open file if it's not already the current open reference */
3350
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3351
0
        if (r->fp)
3352
0
            if (bgzf_close(r->fp) != 0)
3353
0
                return NULL;
3354
0
        r->fn = e->fn;
3355
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3356
0
            return NULL;
3357
0
    }
3358
3359
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3360
3361
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3362
0
        return NULL;
3363
0
    }
3364
3365
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3366
3367
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3368
0
    e->seq = seq;
3369
0
    e->mf = NULL;
3370
0
    e->count++;
3371
3372
    /*
3373
     * Also keep track of last used ref so incr/decr loops on the same
3374
     * sequence don't cause load/free loops.
3375
     */
3376
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3377
0
    r->last = e;
3378
0
    e->count++;
3379
3380
0
    return e;
3381
0
}
3382
3383
/*
3384
 * Returns a portion of a reference sequence from start to end inclusive.
3385
 * The returned pointer is owned by either the cram_file fd or by the
3386
 * internal refs_t structure and should not be freed  by the caller.
3387
 *
3388
 * The difference is whether or not this refs_t is in use by just the one
3389
 * cram_fd or by multiples, or whether we have multiple threads accessing
3390
 * references. In either case fd->shared will be true and we start using
3391
 * reference counting to track the number of users of a specific reference
3392
 * sequence.
3393
 *
3394
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3395
 * and will be freed up on the next call to cram_get_ref or cram_close.
3396
 *
3397
 * To return the entire reference sequence, specify start as 1 and end
3398
 * as 0.
3399
 *
3400
 * To cease using a reference, call cram_ref_decr().
3401
 *
3402
 * Returns reference on success,
3403
 *         NULL on failure
3404
 */
3405
2.67k
char *cram_get_ref(cram_fd *fd, int id, hts_pos_t start, hts_pos_t end) {
3406
2.67k
    ref_entry *r;
3407
2.67k
    char *seq;
3408
2.67k
    int ostart = start;
3409
3410
2.67k
    if (id == -1 || start < 1)
3411
1.08k
        return NULL;
3412
3413
    /* FIXME: axiomatic query of r->seq being true?
3414
     * Or shortcut for unsorted data where we load once and never free?
3415
     */
3416
3417
    //fd->shared_ref = 1; // hard code for now to simplify things
3418
3419
1.58k
    pthread_mutex_lock(&fd->ref_lock);
3420
3421
1.58k
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3422
3423
    /*
3424
     * Unsorted data implies we want to fetch an entire reference at a time.
3425
     * We just deal with this at the moment by claiming we're sharing
3426
     * references instead, which has the same requirement.
3427
     */
3428
1.58k
    if (fd->unsorted)
3429
0
        fd->shared_ref = 1;
3430
3431
3432
    /* Sanity checking: does this ID exist? */
3433
1.58k
    if (id >= fd->refs->nref) {
3434
0
        hts_log_error("No reference found for id %d", id);
3435
0
        pthread_mutex_unlock(&fd->ref_lock);
3436
0
        return NULL;
3437
0
    }
3438
3439
1.58k
    if (!fd->refs || !fd->refs->ref_id[id]) {
3440
0
        hts_log_error("No reference found for id %d", id);
3441
0
        pthread_mutex_unlock(&fd->ref_lock);
3442
0
        return NULL;
3443
0
    }
3444
3445
1.58k
    if (!(r = fd->refs->ref_id[id])) {
3446
0
        hts_log_error("No reference found for id %d", id);
3447
0
        pthread_mutex_unlock(&fd->ref_lock);
3448
0
        return NULL;
3449
0
    }
3450
3451
3452
    /*
3453
     * It has an entry, but may not have been populated yet.
3454
     * Any manually loaded .fai files have their lengths known.
3455
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3456
     * r->length == 0 unless it's been loaded once and verified that we have
3457
     * an on-disk filename for it.
3458
     *
3459
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3460
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3461
     * rewrite my code to have one curl handle per thread.
3462
     */
3463
1.58k
    pthread_mutex_lock(&fd->refs->lock);
3464
1.58k
    if (r->length == 0) {
3465
1.58k
        if (fd->ref_fn)
3466
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3467
1.58k
                            r->name);
3468
1.58k
        if (cram_populate_ref(fd, id, r) == -1) {
3469
1.58k
            hts_log_warning("Failed to populate reference \"%s\"",
3470
1.58k
                            r->name);
3471
1.58k
            hts_log_warning("See https://www.htslib.org/doc/reference_seqs.html for further suggestions");
3472
1.58k
            pthread_mutex_unlock(&fd->refs->lock);
3473
1.58k
            pthread_mutex_unlock(&fd->ref_lock);
3474
1.58k
            return NULL;
3475
1.58k
        }
3476
0
        r = fd->refs->ref_id[id];
3477
0
        if (fd->unsorted)
3478
0
            cram_ref_incr_locked(fd->refs, id);
3479
0
    }
3480
3481
3482
    /*
3483
     * We now know that we the filename containing the reference, so check
3484
     * for limits. If it's over half the reference we'll load all of it in
3485
     * memory as this will speed up subsequent calls.
3486
     */
3487
0
    if (end < 1)
3488
0
        end = r->length;
3489
0
    if (end >= r->length)
3490
0
        end  = r->length;
3491
3492
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3493
0
        start = 1;
3494
0
        end = r->length;
3495
0
    }
3496
3497
    /*
3498
     * Maybe we have it cached already? If so use it.
3499
     *
3500
     * Alternatively if we don't have the sequence but we're sharing
3501
     * references and/or are asking for the entire length of it, then
3502
     * load the full reference into the refs structure and return
3503
     * a pointer to that one instead.
3504
     */
3505
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3506
0
        char *cp;
3507
3508
0
        if (id >= 0) {
3509
0
            if (r->seq) {
3510
0
                cram_ref_incr_locked(fd->refs, id);
3511
0
            } else {
3512
0
                ref_entry *e;
3513
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3514
0
                    pthread_mutex_unlock(&fd->refs->lock);
3515
0
                    pthread_mutex_unlock(&fd->ref_lock);
3516
0
                    return NULL;
3517
0
                }
3518
3519
                /* unsorted data implies cache ref indefinitely, to avoid
3520
                 * continually loading and unloading.
3521
                 */
3522
0
                if (fd->unsorted)
3523
0
                    cram_ref_incr_locked(fd->refs, id);
3524
0
            }
3525
3526
0
            fd->ref = NULL; /* We never access it directly */
3527
0
            fd->ref_start = 1;
3528
0
            fd->ref_end   = r->length;
3529
0
            fd->ref_id    = id;
3530
3531
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3532
0
        } else {
3533
0
            fd->ref = NULL;
3534
0
            cp = NULL;
3535
0
        }
3536
3537
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3538
3539
0
        pthread_mutex_unlock(&fd->refs->lock);
3540
0
        pthread_mutex_unlock(&fd->ref_lock);
3541
0
        return cp;
3542
0
    }
3543
3544
    /*
3545
     * Otherwise we're not sharing, we don't have a copy of it already and
3546
     * we're only asking for a small portion of it.
3547
     *
3548
     * In this case load up just that segment ourselves, freeing any old
3549
     * small segments in the process.
3550
     */
3551
3552
    /* Unmapped ref ID */
3553
0
    if (id < 0 || !fd->refs->fn) {
3554
0
        if (fd->ref_free) {
3555
0
            free(fd->ref_free);
3556
0
            fd->ref_free = NULL;
3557
0
        }
3558
0
        fd->ref = NULL;
3559
0
        fd->ref_id = id;
3560
0
        pthread_mutex_unlock(&fd->refs->lock);
3561
0
        pthread_mutex_unlock(&fd->ref_lock);
3562
0
        return NULL;
3563
0
    }
3564
3565
    /* Open file if it's not already the current open reference */
3566
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3567
0
        if (fd->refs->fp)
3568
0
            if (bgzf_close(fd->refs->fp) != 0)
3569
0
                return NULL;
3570
0
        fd->refs->fn = r->fn;
3571
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3572
0
            pthread_mutex_unlock(&fd->refs->lock);
3573
0
            pthread_mutex_unlock(&fd->ref_lock);
3574
0
            return NULL;
3575
0
        }
3576
0
    }
3577
3578
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3579
0
        pthread_mutex_unlock(&fd->refs->lock);
3580
0
        pthread_mutex_unlock(&fd->ref_lock);
3581
0
        return NULL;
3582
0
    }
3583
3584
0
    if (fd->ref_free)
3585
0
        free(fd->ref_free);
3586
3587
0
    fd->ref_id    = id;
3588
0
    fd->ref_start = start;
3589
0
    fd->ref_end   = end;
3590
0
    fd->ref_free = fd->ref;
3591
0
    seq = fd->ref;
3592
3593
0
    pthread_mutex_unlock(&fd->refs->lock);
3594
0
    pthread_mutex_unlock(&fd->ref_lock);
3595
3596
0
    return seq ? seq + ostart - start : NULL;
3597
0
}
3598
3599
/*
3600
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3601
 * as NULL and let the code auto-detect the reference by parsing the
3602
 * SAM header @SQ lines.
3603
 */
3604
0
int cram_load_reference(cram_fd *fd, char *fn) {
3605
0
    int ret = 0;
3606
3607
0
    if (fn) {
3608
0
        fd->refs = refs_load_fai(fd->refs, fn,
3609
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3610
0
        fn = fd->refs ? fd->refs->fn : NULL;
3611
0
        if (!fn)
3612
0
            ret = -1;
3613
0
        sanitise_SQ_lines(fd);
3614
0
    }
3615
0
    fd->ref_fn = fn;
3616
3617
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3618
0
        if (fd->refs)
3619
0
            refs_free(fd->refs);
3620
0
        if (!(fd->refs = refs_create()))
3621
0
            return -1;
3622
0
        if (-1 == refs_from_header(fd))
3623
0
            return -1;
3624
0
    }
3625
3626
0
    if (fd->header)
3627
0
        if (-1 == refs2id(fd->refs, fd->header))
3628
0
            return -1;
3629
3630
0
    return ret;
3631
0
}
3632
3633
/* ----------------------------------------------------------------------
3634
 * Containers
3635
 */
3636
3637
/*
3638
 * Creates a new container, specifying the maximum number of slices
3639
 * and records permitted.
3640
 *
3641
 * Returns cram_container ptr on success
3642
 *         NULL on failure
3643
 */
3644
46.4k
cram_container *cram_new_container(int nrec, int nslice) {
3645
46.4k
    cram_container *c = calloc(1, sizeof(*c));
3646
46.4k
    enum cram_DS_ID id;
3647
3648
46.4k
    if (!c)
3649
0
        return NULL;
3650
3651
46.4k
    c->curr_ref = -2;
3652
3653
46.4k
    c->max_c_rec = nrec * nslice;
3654
46.4k
    c->curr_c_rec = 0;
3655
3656
46.4k
    c->max_rec = nrec;
3657
46.4k
    c->record_counter = 0;
3658
46.4k
    c->num_bases = 0;
3659
46.4k
    c->s_num_bases = 0;
3660
3661
46.4k
    c->max_slice = nslice;
3662
46.4k
    c->curr_slice = 0;
3663
3664
46.4k
    c->pos_sorted = 1;
3665
46.4k
    c->max_apos   = 0;
3666
46.4k
    c->multi_seq  = 0;
3667
46.4k
    c->qs_seq_orient = 1;
3668
46.4k
    c->no_ref = 0;
3669
46.4k
    c->embed_ref = -1; // automatic selection
3670
3671
46.4k
    c->bams = NULL;
3672
3673
46.4k
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3674
0
        goto err;
3675
46.4k
    c->slice = NULL;
3676
3677
46.4k
    if (!(c->comp_hdr = cram_new_compression_header()))
3678
0
        goto err;
3679
46.4k
    c->comp_hdr_block = NULL;
3680
3681
1.34M
    for (id = DS_RN; id < DS_TN; id++)
3682
1.30M
        if (!(c->stats[id] = cram_stats_create())) goto err;
3683
3684
    //c->aux_B_stats = cram_stats_create();
3685
3686
46.4k
    if (!(c->tags_used = kh_init(m_tagmap)))
3687
0
        goto err;
3688
46.4k
    c->refs_used = 0;
3689
46.4k
    c->ref_free = 0;
3690
3691
46.4k
    return c;
3692
3693
0
 err:
3694
0
    if (c) {
3695
0
        if (c->slices)
3696
0
            free(c->slices);
3697
0
        free(c);
3698
0
    }
3699
0
    return NULL;
3700
46.4k
}
3701
3702
1.65k
static void free_bam_list(bam_seq_t **bams, int max_rec) {
3703
1.65k
    int i;
3704
16.5M
    for (i = 0; i < max_rec; i++)
3705
16.5M
        bam_free(bams[i]);
3706
3707
1.65k
    free(bams);
3708
1.65k
}
3709
3710
60.6k
void cram_free_container(cram_container *c) {
3711
60.6k
    enum cram_DS_ID id;
3712
60.6k
    int i;
3713
3714
60.6k
    if (!c)
3715
0
        return;
3716
3717
60.6k
    if (c->refs_used)
3718
478
        free(c->refs_used);
3719
3720
60.6k
    if (c->landmark)
3721
46.9k
        free(c->landmark);
3722
3723
60.6k
    if (c->comp_hdr)
3724
46.5k
        cram_free_compression_header(c->comp_hdr);
3725
3726
60.6k
    if (c->comp_hdr_block)
3727
45.0k
        cram_free_block(c->comp_hdr_block);
3728
3729
    // Free the slices; filled out by encoder only
3730
60.6k
    if (c->slices) {
3731
90.5k
        for (i = 0; i < c->max_slice; i++) {
3732
44.1k
            if (c->slices[i])
3733
1.65k
                cram_free_slice(c->slices[i]);
3734
44.1k
            if (c->slices[i] == c->slice)
3735
44.1k
                c->slice = NULL;
3736
44.1k
        }
3737
46.4k
        free(c->slices);
3738
46.4k
    }
3739
3740
    // Free the current slice; set by both encoder & decoder
3741
60.6k
    if (c->slice) {
3742
0
        cram_free_slice(c->slice);
3743
0
        c->slice = NULL;
3744
0
    }
3745
3746
1.75M
    for (id = DS_RN; id < DS_TN; id++)
3747
1.69M
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3748
3749
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3750
3751
60.6k
    if (c->tags_used) {
3752
46.4k
        khint_t k;
3753
3754
357k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3755
311k
            if (!kh_exist(c->tags_used, k))
3756
166k
                continue;
3757
3758
144k
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3759
144k
            if (tm) {
3760
144k
                cram_codec *c = tm->codec;
3761
3762
144k
                if (c) c->free(c);
3763
3764
                // If tm->blk or tm->blk2 is set, then we haven't yet got to
3765
                // cram_encode_container which copies the blocks to s->aux_block
3766
                // and NULLifies tm->blk*.  In this case we failed to complete
3767
                // the container construction, so we have to free up our partially
3768
                // converted CRAM.
3769
144k
                cram_free_block(tm->blk);
3770
144k
                cram_free_block(tm->blk2);
3771
144k
                free(tm);
3772
144k
            }
3773
144k
        }
3774
3775
46.4k
        kh_destroy(m_tagmap, c->tags_used);
3776
46.4k
    }
3777
3778
60.6k
    if (c->ref_free)
3779
21.7k
        free(c->ref);
3780
3781
60.6k
    if (c->bams)
3782
120
        free_bam_list(c->bams, c->max_c_rec);
3783
3784
60.6k
    free(c);
3785
60.6k
}
3786
3787
/*
3788
 * Reads a container header.
3789
 *
3790
 * Returns cram_container on success
3791
 *         NULL on failure or no container left (fd->err == 0).
3792
 */
3793
14.2k
cram_container *cram_read_container(cram_fd *fd) {
3794
14.2k
    cram_container c2, *c;
3795
14.2k
    int i, s;
3796
14.2k
    size_t rd = 0;
3797
14.2k
    uint32_t crc = 0;
3798
3799
14.2k
    fd->err = 0;
3800
14.2k
    fd->eof = 0;
3801
3802
14.2k
    memset(&c2, 0, sizeof(c2));
3803
14.2k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3804
13.1k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3805
0
            fd->eof = fd->empty_container ? 1 : 2;
3806
0
            return NULL;
3807
13.1k
        } else {
3808
13.1k
            rd+=s;
3809
13.1k
        }
3810
13.1k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3811
209
        uint32_t len;
3812
209
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3813
0
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3814
0
                CRAM_MINOR_VERS(fd->version) == 0)
3815
0
                fd->eof = 1; // EOF blocks arrived in v2.1
3816
0
            else
3817
0
                fd->eof = fd->empty_container ? 1 : 2;
3818
0
            return NULL;
3819
209
        } else {
3820
209
            rd+=s;
3821
209
        }
3822
209
        len = le_int4(c2.length);
3823
209
        crc = crc32(0L, (unsigned char *)&len, 4);
3824
832
    } else {
3825
832
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3826
0
            fd->eof = fd->empty_container ? 1 : 2;
3827
0
            return NULL;
3828
832
        } else {
3829
832
            rd+=s;
3830
832
        }
3831
832
    }
3832
14.2k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3833
14.2k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3834
832
        int64_t i64;
3835
832
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3836
829
        c2.ref_seq_start = i64;
3837
829
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3838
829
        c2.ref_seq_span = i64;
3839
13.3k
    } else {
3840
13.3k
        int32_t i32;
3841
13.3k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3842
13.3k
        c2.ref_seq_start = i32;
3843
13.3k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3844
13.3k
        c2.ref_seq_span = i32;
3845
13.3k
    }
3846
14.2k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3847
3848
14.2k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3849
13.1k
        c2.record_counter = 0;
3850
13.1k
        c2.num_bases = 0;
3851
13.1k
    } else {
3852
1.03k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3853
1.03k
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3854
0
                return NULL;
3855
1.03k
            else
3856
1.03k
                rd += s;
3857
1.03k
        } else {
3858
6
            int32_t i32;
3859
6
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3860
0
                return NULL;
3861
6
            else
3862
6
                rd += s;
3863
6
            c2.record_counter = i32;
3864
6
        }
3865
3866
1.03k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3867
0
            return NULL;
3868
1.03k
        else
3869
1.03k
            rd += s;
3870
1.03k
    }
3871
14.2k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3872
0
        return NULL;
3873
14.2k
    else
3874
14.2k
        rd+=s;
3875
14.2k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3876
0
        return NULL;
3877
14.2k
    else
3878
14.2k
        rd+=s;
3879
3880
14.2k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3881
15
        return NULL;
3882
3883
14.1k
    if (!(c = calloc(1, sizeof(*c))))
3884
0
        return NULL;
3885
3886
14.1k
    *c = c2;
3887
14.1k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3888
14.1k
    if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) {
3889
0
        fd->err = errno = ENOMEM;
3890
0
        cram_free_container(c);
3891
0
        return NULL;
3892
0
    }
3893
14.1k
#endif
3894
14.1k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3895
0
        fd->err = errno;
3896
0
        cram_free_container(c);
3897
0
        return NULL;
3898
0
    }
3899
102k
    for (i = 0; i < c->num_landmarks; i++) {
3900
88.1k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3901
6
            cram_free_container(c);
3902
6
            return NULL;
3903
88.1k
        } else {
3904
88.1k
            rd += s;
3905
88.1k
        }
3906
88.1k
    }
3907
3908
14.1k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3909
1.02k
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3910
3
            cram_free_container(c);
3911
3
            return NULL;
3912
1.02k
        } else {
3913
1.02k
            rd+=4;
3914
1.02k
        }
3915
3916
1.02k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3917
        // Pretend the CRC was OK so the fuzzer doesn't have to get it right
3918
1.02k
        crc = c->crc32;
3919
1.02k
#endif
3920
3921
1.02k
        if (crc != c->crc32) {
3922
0
            hts_log_error("Container header CRC32 failure");
3923
0
            cram_free_container(c);
3924
0
            return NULL;
3925
0
        }
3926
1.02k
    }
3927
3928
14.1k
    c->offset = rd;
3929
14.1k
    c->slices = NULL;
3930
14.1k
    c->slice = NULL;
3931
14.1k
    c->curr_slice = 0;
3932
14.1k
    c->max_slice = c->num_landmarks;
3933
14.1k
    c->slice_rec = 0;
3934
14.1k
    c->curr_rec = 0;
3935
14.1k
    c->max_rec = 0;
3936
3937
14.1k
    if (c->ref_seq_id == -2) {
3938
0
        c->multi_seq = 1;
3939
0
        fd->multi_seq = 1;
3940
0
    }
3941
3942
14.1k
    fd->empty_container =
3943
14.1k
        (c->num_records == 0 &&
3944
12.9k
         c->ref_seq_id == -1 &&
3945
14.1k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3946
3947
14.1k
    return c;
3948
14.1k
}
3949
3950
3951
/* MAXIMUM storage size needed for the container. */
3952
0
int cram_container_size(cram_container *c) {
3953
0
    return 55 + 5*c->num_landmarks;
3954
0
}
3955
3956
3957
/*
3958
 * Stores the container structure in dat and returns *size as the
3959
 * number of bytes written to dat[].  The input size of dat is also
3960
 * held in *size and should be initialised to cram_container_size(c).
3961
 *
3962
 * Returns 0 on success;
3963
 *        -1 on failure
3964
 */
3965
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3966
0
{
3967
0
    char *cp = (char *)dat;
3968
0
    int i;
3969
3970
    // Check the input buffer is large enough according to our stated
3971
    // requirements. (NOTE: it may actually take less.)
3972
0
    if (cram_container_size(c) > *size)
3973
0
        return -1;
3974
3975
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3976
0
        cp += itf8_put(cp, c->length);
3977
0
    } else {
3978
0
        *(int32_t *)cp = le_int4(c->length);
3979
0
        cp += 4;
3980
0
    }
3981
0
    if (c->multi_seq) {
3982
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3983
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3984
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3985
0
    } else {
3986
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3987
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3988
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3989
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3990
0
        } else {
3991
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3992
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3993
0
        }
3994
0
    }
3995
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3996
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3997
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3998
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4000
0
    }
4001
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4002
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4003
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4004
0
    for (i = 0; i < c->num_landmarks; i++)
4005
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4006
4007
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4008
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
4009
0
        cp[0] =  c->crc32        & 0xff;
4010
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4011
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4012
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4013
0
        cp += 4;
4014
0
    }
4015
4016
0
    *size = cp-dat; // actual used size
4017
4018
0
    return 0;
4019
0
}
4020
4021
4022
/*
4023
 * Writes a container structure.
4024
 *
4025
 * Returns 0 on success
4026
 *        -1 on failure
4027
 */
4028
48.6k
int cram_write_container(cram_fd *fd, cram_container *c) {
4029
48.6k
    char buf_a[1024], *buf = buf_a, *cp;
4030
48.6k
    int i;
4031
4032
48.6k
    if (61 + c->num_landmarks * 10 >= 1024) {
4033
0
        buf = malloc(61 + c->num_landmarks * 10);
4034
0
        if (!buf)
4035
0
            return -1;
4036
0
    }
4037
48.6k
    cp = buf;
4038
4039
48.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4040
0
        cp += itf8_put(cp, c->length);
4041
48.6k
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
4042
48.6k
        *(int32_t *)cp = le_int4(c->length);
4043
48.6k
        cp += 4;
4044
48.6k
    } else {
4045
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
4046
0
    }
4047
48.6k
    if (c->multi_seq) {
4048
477
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
4049
477
        cp += fd->vv.varint_put32(cp, NULL, 0);
4050
477
        cp += fd->vv.varint_put32(cp, NULL, 0);
4051
48.2k
    } else {
4052
48.2k
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
4053
48.2k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
4054
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
4055
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
4056
48.2k
        } else {
4057
48.2k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
4058
48.2k
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
4059
48.2k
        }
4060
48.2k
    }
4061
48.6k
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
4062
48.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
4063
48.6k
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
4064
0
    else
4065
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
4066
48.6k
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
4067
48.6k
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
4068
48.6k
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
4069
97.2k
    for (i = 0; i < c->num_landmarks; i++)
4070
48.6k
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4071
4072
48.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4073
48.6k
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4074
48.6k
        cp[0] =  c->crc32        & 0xff;
4075
48.6k
        cp[1] = (c->crc32 >>  8) & 0xff;
4076
48.6k
        cp[2] = (c->crc32 >> 16) & 0xff;
4077
48.6k
        cp[3] = (c->crc32 >> 24) & 0xff;
4078
48.6k
        cp += 4;
4079
48.6k
    }
4080
4081
48.6k
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4082
0
        if (buf != buf_a)
4083
0
            free(buf);
4084
0
        return -1;
4085
0
    }
4086
4087
48.6k
    if (buf != buf_a)
4088
0
        free(buf);
4089
4090
48.6k
    return 0;
4091
48.6k
}
4092
4093
// common component shared by cram_flush_container{,_mt}
4094
43.9k
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4095
43.9k
    int i, j;
4096
4097
43.9k
    if (c->curr_slice > 0 && !c->slices)
4098
0
        return -1;
4099
4100
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4101
4102
43.9k
    off_t c_offset = htell(fd->fp); // File offset of container
4103
4104
    /* Write the container struct itself */
4105
43.9k
    if (0 != cram_write_container(fd, c))
4106
0
        return -1;
4107
4108
43.9k
    off_t hdr_size = htell(fd->fp) - c_offset;
4109
4110
    /* And the compression header */
4111
43.9k
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4112
0
        return -1;
4113
4114
    /* Followed by the slice blocks */
4115
43.9k
    off_t file_offset = htell(fd->fp);
4116
87.9k
    for (i = 0; i < c->curr_slice; i++) {
4117
43.9k
        cram_slice *s = c->slices[i];
4118
43.9k
        off_t spos = file_offset - c_offset - hdr_size;
4119
4120
43.9k
        if (0 != cram_write_block(fd, s->hdr_block))
4121
0
            return -1;
4122
4123
342k
        for (j = 0; j < s->hdr->num_blocks; j++) {
4124
298k
            if (0 != cram_write_block(fd, s->block[j]))
4125
0
                return -1;
4126
298k
        }
4127
4128
43.9k
        file_offset = htell(fd->fp);
4129
43.9k
        off_t sz = file_offset - c_offset - hdr_size - spos;
4130
4131
43.9k
        if (fd->idxfp) {
4132
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4133
0
                return -1;
4134
0
        }
4135
43.9k
    }
4136
4137
43.9k
    return 0;
4138
43.9k
}
4139
4140
/*
4141
 * Flushes a completely or partially full container to disk, writing
4142
 * container structure, header and blocks. This also calls the encoder
4143
 * functions.
4144
 *
4145
 * Returns 0 on success
4146
 *        -1 on failure
4147
 */
4148
44.1k
int cram_flush_container(cram_fd *fd, cram_container *c) {
4149
    /* Encode the container blocks and generate compression header */
4150
44.1k
    if (0 != cram_encode_container(fd, c))
4151
180
        return -1;
4152
4153
43.9k
    return cram_flush_container2(fd, c);
4154
44.1k
}
4155
4156
typedef struct {
4157
    cram_fd *fd;
4158
    cram_container *c;
4159
} cram_job;
4160
4161
0
void *cram_flush_thread(void *arg) {
4162
0
    cram_job *j = (cram_job *)arg;
4163
4164
    /* Encode the container blocks and generate compression header */
4165
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4166
0
        hts_log_error("Call to cram_encode_container failed");
4167
0
        return NULL;
4168
0
    }
4169
4170
0
    return arg;
4171
0
}
4172
4173
0
static int cram_flush_result(cram_fd *fd) {
4174
0
    int i, ret = 0;
4175
0
    hts_tpool_result *r;
4176
0
    cram_container *lc = NULL;
4177
4178
    // NB: we can have one result per slice, not per container,
4179
    // so we need to free the container only after all slices
4180
    // within it have been freed.  (Automatic via reference counting.)
4181
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4182
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4183
0
        cram_container *c;
4184
4185
0
        if (!j) {
4186
0
            hts_tpool_delete_result(r, 0);
4187
0
            return -1;
4188
0
        }
4189
4190
0
        fd = j->fd;
4191
0
        c = j->c;
4192
4193
0
        if (fd->mode == 'w')
4194
0
            if (0 != cram_flush_container2(fd, c))
4195
0
                return -1;
4196
4197
        // Free the slices; filled out by encoder only
4198
0
        if (c->slices) {
4199
0
            for (i = 0; i < c->max_slice; i++) {
4200
0
                if (c->slices[i])
4201
0
                    cram_free_slice(c->slices[i]);
4202
0
                if (c->slices[i] == c->slice)
4203
0
                    c->slice = NULL;
4204
0
                c->slices[i] = NULL;
4205
0
            }
4206
0
        }
4207
4208
        // Free the current slice; set by both encoder & decoder
4209
0
        if (c->slice) {
4210
0
            cram_free_slice(c->slice);
4211
0
            c->slice = NULL;
4212
0
        }
4213
0
        c->curr_slice = 0;
4214
4215
        // Our jobs will be in order, so we free the last
4216
        // container when our job has switched to a new one.
4217
0
        if (c != lc) {
4218
0
            if (lc) {
4219
0
                if (fd->ctr == lc)
4220
0
                    fd->ctr = NULL;
4221
0
                if (fd->ctr_mt == lc)
4222
0
                    fd->ctr_mt = NULL;
4223
0
                cram_free_container(lc);
4224
0
            }
4225
0
            lc = c;
4226
0
        }
4227
4228
0
        hts_tpool_delete_result(r, 1);
4229
0
    }
4230
0
    if (lc) {
4231
0
        if (fd->ctr == lc)
4232
0
            fd->ctr = NULL;
4233
0
        if (fd->ctr_mt == lc)
4234
0
            fd->ctr_mt = NULL;
4235
0
        cram_free_container(lc);
4236
0
    }
4237
4238
0
    return ret;
4239
0
}
4240
4241
// Note: called while metrics_lock is held.
4242
// Will be left in this state too, but may temporarily unlock.
4243
3.82k
void reset_metrics(cram_fd *fd) {
4244
3.82k
    int i;
4245
4246
3.82k
    if (fd->pool) {
4247
        // If multi-threaded we have multiple blocks being
4248
        // compressed already and several on the to-do list
4249
        // (fd->rqueue->pending).  It's tricky to reset the
4250
        // metrics exactly the correct point, so instead we
4251
        // just flush the pool, reset, and then continue again.
4252
4253
        // Don't bother starting a new trial before then though.
4254
0
        for (i = 0; i < DS_END; i++) {
4255
0
            cram_metrics *m = fd->m[i];
4256
0
            if (!m)
4257
0
                continue;
4258
0
            m->next_trial = 999;
4259
0
        }
4260
4261
0
        pthread_mutex_unlock(&fd->metrics_lock);
4262
0
        hts_tpool_process_flush(fd->rqueue);
4263
0
        pthread_mutex_lock(&fd->metrics_lock);
4264
0
    }
4265
4266
183k
    for (i = 0; i < DS_END; i++) {
4267
179k
        cram_metrics *m = fd->m[i];
4268
179k
        if (!m)
4269
0
            continue;
4270
4271
179k
        m->trial = NTRIALS;
4272
179k
        m->next_trial = TRIAL_SPAN;
4273
179k
        m->revised_method = 0;
4274
179k
        m->unpackable = 0;
4275
4276
179k
        memset(m->sz, 0, sizeof(m->sz));
4277
179k
    }
4278
3.82k
}
4279
4280
44.1k
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4281
44.1k
    cram_job *j;
4282
4283
    // At the junction of mapped to unmapped data the compression
4284
    // methods may need to change due to very different statistical
4285
    // properties; particularly BA if minhash sorted.
4286
    //
4287
    // However with threading we'll have several in-flight blocks
4288
    // arriving out of order.
4289
    //
4290
    // So we do one trial reset of NThreads to last for NThreads
4291
    // duration to get us over this transition period, followed
4292
    // by another retrial of the usual ntrials & trial span.
4293
44.1k
    pthread_mutex_lock(&fd->metrics_lock);
4294
44.1k
    if (c->n_mapped < 0.3*c->curr_rec &&
4295
23.3k
        fd->last_mapped > 0.7*c->max_rec) {
4296
3.82k
        reset_metrics(fd);
4297
3.82k
    }
4298
44.1k
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4299
44.1k
    pthread_mutex_unlock(&fd->metrics_lock);
4300
4301
44.1k
    if (!fd->pool)
4302
44.1k
        return cram_flush_container(fd, c);
4303
4304
0
    if (!(j = malloc(sizeof(*j))))
4305
0
        return -1;
4306
0
    j->fd = fd;
4307
0
    j->c = c;
4308
4309
    // Flush the job.  Note our encoder queue may be full, so we
4310
    // either have to keep trying in non-blocking mode (what we do) or
4311
    // use a dedicated separate thread for draining the queue.
4312
0
    for (;;) {
4313
0
        errno = 0;
4314
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4315
0
        int pending = (errno == EAGAIN);
4316
0
        if (cram_flush_result(fd) != 0)
4317
0
            return -1;
4318
0
        if (!pending)
4319
0
            break;
4320
4321
0
        hts_usleep(1000);
4322
0
    }
4323
4324
0
    return 0;
4325
0
}
4326
4327
/* ----------------------------------------------------------------------
4328
 * Compression headers; the first part of the container
4329
 */
4330
4331
/*
4332
 * Creates a new blank container compression header
4333
 *
4334
 * Returns header ptr on success
4335
 *         NULL on failure
4336
 */
4337
46.4k
cram_block_compression_hdr *cram_new_compression_header(void) {
4338
46.4k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4339
46.4k
    if (!hdr)
4340
0
        return NULL;
4341
4342
46.4k
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4343
0
        free(hdr);
4344
0
        return NULL;
4345
0
    }
4346
4347
46.4k
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4348
0
        cram_free_block(hdr->TD_blk);
4349
0
        free(hdr);
4350
0
        return NULL;
4351
0
    }
4352
4353
46.4k
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4354
0
        kh_destroy(m_s2i, hdr->TD_hash);
4355
0
        cram_free_block(hdr->TD_blk);
4356
0
        free(hdr);
4357
0
        return NULL;
4358
0
    }
4359
4360
46.4k
    return hdr;
4361
46.4k
}
4362
4363
47.1k
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4364
47.1k
    int i;
4365
4366
47.1k
    if (hdr->landmark)
4367
673
        free(hdr->landmark);
4368
4369
47.1k
    if (hdr->preservation_map)
4370
44.6k
        kh_destroy(map, hdr->preservation_map);
4371
4372
1.55M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4373
1.50M
        cram_map *m, *m2;
4374
1.52M
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4375
20.9k
            m2 = m->next;
4376
20.9k
            if (m->codec)
4377
0
                m->codec->free(m->codec);
4378
20.9k
            free(m);
4379
20.9k
        }
4380
1.50M
    }
4381
4382
1.55M
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4383
1.50M
        cram_map *m, *m2;
4384
1.50M
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4385
903
            m2 = m->next;
4386
903
            if (m->codec)
4387
903
                m->codec->free(m->codec);
4388
903
            free(m);
4389
903
        }
4390
1.50M
    }
4391
4392
2.26M
    for (i = 0; i < DS_END; i++) {
4393
2.21M
        if (hdr->codecs[i])
4394
821k
            hdr->codecs[i]->free(hdr->codecs[i]);
4395
2.21M
    }
4396
4397
47.1k
    if (hdr->TL)
4398
76
        free(hdr->TL);
4399
47.1k
    if (hdr->TD_blk)
4400
46.5k
        cram_free_block(hdr->TD_blk);
4401
47.1k
    if (hdr->TD_hash)
4402
46.4k
        kh_destroy(m_s2i, hdr->TD_hash);
4403
47.1k
    if (hdr->TD_keys)
4404
46.4k
        string_pool_destroy(hdr->TD_keys);
4405
4406
47.1k
    free(hdr);
4407
47.1k
}
4408
4409
4410
/* ----------------------------------------------------------------------
4411
 * Slices and slice headers
4412
 */
4413
4414
44.7k
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4415
44.7k
    if (!hdr)
4416
0
        return;
4417
4418
44.7k
    if (hdr->block_content_ids)
4419
44.5k
        free(hdr->block_content_ids);
4420
4421
44.7k
    free(hdr);
4422
4423
44.7k
    return;
4424
44.7k
}
4425
4426
44.7k
void cram_free_slice(cram_slice *s) {
4427
44.7k
    if (!s)
4428
0
        return;
4429
4430
44.7k
    if (s->hdr_block)
4431
44.5k
        cram_free_block(s->hdr_block);
4432
4433
44.7k
    if (s->block) {
4434
44.5k
        int i;
4435
4436
44.5k
        if (s->hdr) {
4437
345k
            for (i = 0; i < s->hdr->num_blocks; i++) {
4438
301k
                if (i > 0 && s->block[i] == s->block[0])
4439
1.68k
                    continue;
4440
299k
                cram_free_block(s->block[i]);
4441
299k
            }
4442
44.5k
        }
4443
44.5k
        free(s->block);
4444
44.5k
    }
4445
4446
44.7k
    {
4447
        // Normally already copied into s->block[], but potentially still
4448
        // here if we error part way through cram_encode_slice.
4449
44.7k
        int i;
4450
189k
        for (i = 0; i < s->naux_block; i++)
4451
144k
            cram_free_block(s->aux_block[i]);
4452
44.7k
    }
4453
4454
44.7k
    if (s->block_by_id)
4455
570
        free(s->block_by_id);
4456
4457
44.7k
    if (s->hdr)
4458
44.7k
        cram_free_slice_header(s->hdr);
4459
4460
44.7k
    if (s->seqs_blk)
4461
44.7k
        cram_free_block(s->seqs_blk);
4462
4463
44.7k
    if (s->qual_blk)
4464
719
        cram_free_block(s->qual_blk);
4465
4466
44.7k
    if (s->name_blk)
4467
719
        cram_free_block(s->name_blk);
4468
4469
44.7k
    if (s->aux_blk)
4470
44.7k
        cram_free_block(s->aux_blk);
4471
4472
44.7k
    if (s->base_blk)
4473
719
        cram_free_block(s->base_blk);
4474
4475
44.7k
    if (s->soft_blk)
4476
719
        cram_free_block(s->soft_blk);
4477
4478
44.7k
    if (s->cigar)
4479
44.7k
        free(s->cigar);
4480
4481
44.7k
    if (s->crecs)
4482
44.6k
        free(s->crecs);
4483
4484
44.7k
    if (s->features)
4485
20.7k
        free(s->features);
4486
4487
44.7k
    if (s->TN)
4488
0
        free(s->TN);
4489
4490
44.7k
    if (s->pair_keys)
4491
44.1k
        string_pool_destroy(s->pair_keys);
4492
4493
44.7k
    if (s->pair[0])
4494
44.1k
        kh_destroy(m_s2i, s->pair[0]);
4495
44.7k
    if (s->pair[1])
4496
44.1k
        kh_destroy(m_s2i, s->pair[1]);
4497
4498
44.7k
    if (s->aux_block)
4499
39.5k
        free(s->aux_block);
4500
4501
44.7k
    free(s);
4502
44.7k
}
4503
4504
/*
4505
 * Creates a new empty slice in memory, for subsequent writing to
4506
 * disk.
4507
 *
4508
 * Returns cram_slice ptr on success
4509
 *         NULL on failure
4510
 */
4511
44.1k
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4512
44.1k
    cram_slice *s = calloc(1, sizeof(*s));
4513
44.1k
    if (!s)
4514
0
        return NULL;
4515
4516
44.1k
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4517
0
        goto err;
4518
44.1k
    s->hdr->content_type = type;
4519
4520
44.1k
    s->hdr_block = NULL;
4521
44.1k
    s->block = NULL;
4522
44.1k
    s->block_by_id = NULL;
4523
44.1k
    s->last_apos = 0;
4524
44.1k
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4525
44.1k
    s->cigar_alloc = 1024;
4526
44.1k
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4527
44.1k
    s->ncigar = 0;
4528
4529
44.1k
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4530
44.1k
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4531
44.1k
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4532
44.1k
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4533
44.1k
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4534
44.1k
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4535
4536
44.1k
    s->features = NULL;
4537
44.1k
    s->nfeatures = s->afeatures = 0;
4538
4539
44.1k
#ifndef TN_external
4540
44.1k
    s->TN = NULL;
4541
44.1k
    s->nTN = s->aTN = 0;
4542
44.1k
#endif
4543
4544
    // Volatile keys as we do realloc in dstring
4545
44.1k
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4546
44.1k
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4547
44.1k
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4548
4549
#ifdef BA_external
4550
    s->BA_len = 0;
4551
#endif
4552
4553
44.1k
    return s;
4554
4555
0
 err:
4556
0
    if (s)
4557
0
        cram_free_slice(s);
4558
4559
0
    return NULL;
4560
44.1k
}
4561
4562
/*
4563
 * Loads an entire slice.
4564
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4565
 * as slices contain references to objects in other slices.
4566
 * To work around this while keeping the slice oriented outer loop
4567
 * we read all slices and stitch them together into a fake large
4568
 * slice instead.
4569
 *
4570
 * Returns cram_slice ptr on success
4571
 *         NULL on failure
4572
 */
4573
600
cram_slice *cram_read_slice(cram_fd *fd) {
4574
600
    cram_block *b = cram_read_block(fd);
4575
600
    cram_slice *s = calloc(1, sizeof(*s));
4576
600
    int i, n, max_id, min_id;
4577
4578
600
    if (!b || !s)
4579
11
        goto err;
4580
4581
589
    s->hdr_block = b;
4582
589
    switch (b->content_type) {
4583
585
    case MAPPED_SLICE:
4584
585
    case UNMAPPED_SLICE:
4585
585
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4586
8
            goto err;
4587
577
        break;
4588
4589
577
    default:
4590
4
        hts_log_error("Unexpected block of type %s",
4591
4
                      cram_content_type2str(b->content_type));
4592
4
        goto err;
4593
589
    }
4594
4595
577
    if (s->hdr->num_blocks < 1) {
4596
2
        hts_log_error("Slice does not include any data blocks");
4597
2
        goto err;
4598
2
    }
4599
4600
575
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4601
575
    if (!s->block)
4602
0
        goto err;
4603
4604
2.95k
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4605
2.38k
        if (!(s->block[i] = cram_read_block(fd)))
4606
5
            goto err;
4607
4608
2.38k
        if (s->block[i]->content_type == EXTERNAL) {
4609
567
            if (max_id < s->block[i]->content_id)
4610
6
                max_id = s->block[i]->content_id;
4611
567
            if (min_id > s->block[i]->content_id)
4612
567
                min_id = s->block[i]->content_id;
4613
567
        }
4614
2.38k
    }
4615
4616
570
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4617
0
        goto err;
4618
4619
2.85k
    for (i = 0; i < n; i++) {
4620
2.28k
        if (s->block[i]->content_type != EXTERNAL)
4621
1.71k
            continue;
4622
567
        uint32_t v = s->block[i]->content_id;
4623
567
        if (v >= 256)
4624
2
            v = 256 + v % 251;
4625
567
        s->block_by_id[v] = s->block[i];
4626
567
    }
4627
4628
    /* Initialise encoding/decoding tables */
4629
570
    s->cigar_alloc = 1024;
4630
570
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4631
570
    s->ncigar = 0;
4632
4633
570
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4634
570
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4635
570
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4636
570
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4637
570
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4638
570
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4639
4640
570
    s->crecs = NULL;
4641
4642
570
    s->last_apos = s->hdr->ref_seq_start;
4643
570
    s->decode_md = fd->decode_md;
4644
4645
570
    return s;
4646
4647
30
 err:
4648
30
    if (b)
4649
19
        cram_free_block(b);
4650
30
    if (s) {
4651
30
        s->hdr_block = NULL;
4652
30
        cram_free_slice(s);
4653
30
    }
4654
30
    return NULL;
4655
570
}
4656
4657
4658
/* ----------------------------------------------------------------------
4659
 * CRAM file definition (header)
4660
 */
4661
4662
/*
4663
 * Reads a CRAM file definition structure.
4664
 * Returns file_def ptr on success
4665
 *         NULL on failure
4666
 */
4667
1.61k
cram_file_def *cram_read_file_def(cram_fd *fd) {
4668
1.61k
    cram_file_def *def = malloc(sizeof(*def));
4669
1.61k
    if (!def)
4670
0
        return NULL;
4671
4672
1.61k
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4673
0
        free(def);
4674
0
        return NULL;
4675
0
    }
4676
4677
1.61k
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4678
0
        free(def);
4679
0
        return NULL;
4680
0
    }
4681
4682
1.61k
    if (def->major_version > 4) {
4683
0
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4684
0
                      def->major_version, def->minor_version);
4685
0
        free(def);
4686
0
        return NULL;
4687
0
    }
4688
4689
1.61k
    fd->first_container += 26;
4690
1.61k
    fd->curr_position = fd->first_container;
4691
1.61k
    fd->last_slice = 0;
4692
4693
1.61k
    return def;
4694
1.61k
}
4695
4696
/*
4697
 * Writes a cram_file_def structure to cram_fd.
4698
 * Returns 0 on success
4699
 *        -1 on failure
4700
 */
4701
2.32k
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4702
2.32k
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4703
2.32k
}
4704
4705
4.15k
void cram_free_file_def(cram_file_def *def) {
4706
4.15k
    if (def) free(def);
4707
4.15k
}
4708
4709
/* ----------------------------------------------------------------------
4710
 * SAM header I/O
4711
 */
4712
4713
4714
/*
4715
 * Reads the SAM header from the first CRAM data block.
4716
 * Also performs minimal parsing to extract read-group
4717
 * and sample information.
4718
4719
 * Returns SAM hdr ptr on success
4720
 *         NULL on failure
4721
 */
4722
1.61k
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4723
1.61k
    int32_t header_len;
4724
1.61k
    char *header;
4725
1.61k
    sam_hdr_t *hdr;
4726
4727
    /* 1.1 onwards stores the header in the first block of a container */
4728
1.61k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4729
        /* Length */
4730
1.55k
        if (-1 == int32_decode(fd, &header_len))
4731
0
            return NULL;
4732
4733
1.55k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4734
1.55k
        if (header_len > FUZZ_ALLOC_LIMIT)
4735
0
            return NULL;
4736
1.55k
#endif
4737
4738
        /* Alloc and read */
4739
1.55k
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4740
0
            return NULL;
4741
4742
1.55k
        if (header_len != hread(fd->fp, header, header_len)) {
4743
0
            free(header);
4744
0
            return NULL;
4745
0
        }
4746
1.55k
        header[header_len] = '\0';
4747
4748
1.55k
        fd->first_container += 4 + header_len;
4749
1.55k
    } else {
4750
63
        cram_container *c = cram_read_container(fd);
4751
63
        cram_block *b;
4752
63
        int i;
4753
63
        int64_t len;
4754
4755
63
        if (!c)
4756
0
            return NULL;
4757
4758
63
        fd->first_container += c->length + c->offset;
4759
63
        fd->curr_position = fd->first_container;
4760
4761
63
        if (c->num_blocks < 1) {
4762
0
            cram_free_container(c);
4763
0
            return NULL;
4764
0
        }
4765
4766
63
        if (!(b = cram_read_block(fd))) {
4767
2
            cram_free_container(c);
4768
2
            return NULL;
4769
2
        }
4770
61
        if (cram_uncompress_block(b) != 0) {
4771
28
            cram_free_container(c);
4772
28
            cram_free_block(b);
4773
28
            return NULL;
4774
28
        }
4775
4776
33
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4777
33
            fd->vv.varint_size(b->content_id) +
4778
33
            fd->vv.varint_size(b->uncomp_size) +
4779
33
            fd->vv.varint_size(b->comp_size);
4780
4781
        /* Extract header from 1st block */
4782
33
        if (-1 == int32_get_blk(b, &header_len) ||
4783
33
            header_len < 0 || /* Spec. says signed...  why? */
4784
33
            b->uncomp_size - 4 < header_len) {
4785
0
            cram_free_container(c);
4786
0
            cram_free_block(b);
4787
0
            return NULL;
4788
0
        }
4789
33
        if (NULL == (header = malloc((size_t) header_len+1))) {
4790
0
            cram_free_container(c);
4791
0
            cram_free_block(b);
4792
0
            return NULL;
4793
0
        }
4794
33
        memcpy(header, BLOCK_END(b), header_len);
4795
33
        header[header_len] = '\0';
4796
33
        cram_free_block(b);
4797
4798
        /* Consume any remaining blocks */
4799
486
        for (i = 1; i < c->num_blocks; i++) {
4800
454
            if (!(b = cram_read_block(fd))) {
4801
1
                cram_free_container(c);
4802
1
                free(header);
4803
1
                return NULL;
4804
1
            }
4805
453
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4806
453
                fd->vv.varint_size(b->content_id) +
4807
453
                fd->vv.varint_size(b->uncomp_size) +
4808
453
                fd->vv.varint_size(b->comp_size);
4809
453
            cram_free_block(b);
4810
453
        }
4811
4812
32
        if (c->length > 0 && len > 0 && c->length > len) {
4813
            // Consume padding
4814
0
            char *pads = malloc(c->length - len);
4815
0
            if (!pads) {
4816
0
                cram_free_container(c);
4817
0
                free(header);
4818
0
                return NULL;
4819
0
            }
4820
4821
0
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4822
0
                cram_free_container(c);
4823
0
                free(header);
4824
0
                free(pads);
4825
0
                return NULL;
4826
0
            }
4827
0
            free(pads);
4828
0
        }
4829
4830
32
        cram_free_container(c);
4831
32
    }
4832
4833
    /* Parse */
4834
1.58k
    hdr = sam_hdr_init();
4835
1.58k
    if (!hdr) {
4836
0
        free(header);
4837
0
        return NULL;
4838
0
    }
4839
4840
1.58k
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4841
5
        free(header);
4842
5
        sam_hdr_destroy(hdr);
4843
5
        return NULL;
4844
5
    }
4845
4846
1.58k
    hdr->l_text = header_len;
4847
1.58k
    hdr->text = header;
4848
4849
1.58k
    return hdr;
4850
4851
1.58k
}
4852
4853
/*
4854
 * Converts 'in' to a full pathname to store in out.
4855
 * Out must be at least PATH_MAX bytes long.
4856
 */
4857
0
static void full_path(char *out, char *in) {
4858
0
    size_t in_l = strlen(in);
4859
0
    if (hisremote(in)) {
4860
0
        if (in_l > PATH_MAX) {
4861
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4862
0
            return;
4863
0
        }
4864
0
        strncpy(out, in, PATH_MAX-1);
4865
0
        out[PATH_MAX-1] = 0;
4866
0
        return;
4867
0
    }
4868
0
    if (*in == '/' ||
4869
        // Windows paths
4870
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4871
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4872
0
        strncpy(out, in, PATH_MAX-1);
4873
0
        out[PATH_MAX-1] = 0;
4874
0
    } else {
4875
0
        size_t len;
4876
4877
        // unable to get dir or out+in is too long
4878
0
        if (!getcwd(out, PATH_MAX) ||
4879
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4880
0
            strncpy(out, in, PATH_MAX-1);
4881
0
            out[PATH_MAX-1] = 0;
4882
0
            return;
4883
0
        }
4884
4885
0
        snprintf(out+len, PATH_MAX - len, "/%s", in);
4886
4887
        // FIXME: cope with `pwd`/../../../foo.fa ?
4888
0
    }
4889
0
}
4890
4891
/*
4892
 * Writes a CRAM SAM header.
4893
 * Returns 0 on success
4894
 *        -1 on failure
4895
 */
4896
2.32k
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4897
2.32k
    size_t header_len;
4898
2.32k
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4899
4900
    /* Write CRAM MAGIC if not yet written. */
4901
2.32k
    if (fd->file_def->major_version == 0) {
4902
2.32k
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4903
2.32k
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4904
2.32k
        if (0 != cram_write_file_def(fd, fd->file_def))
4905
0
            return -1;
4906
2.32k
    }
4907
4908
    /* 1.0 requires an UNKNOWN read-group */
4909
2.32k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4910
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4911
0
            if (sam_hdr_add_line(hdr, "RG",
4912
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4913
0
                return -1;
4914
0
    }
4915
4916
2.32k
    if (-1 == refs_from_header(fd))
4917
0
        return -1;
4918
2.32k
    if (-1 == refs2id(fd->refs, fd->header))
4919
0
        return -1;
4920
4921
    /* Fix M5 strings */
4922
2.32k
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4923
2.32k
        int i;
4924
2.35k
        for (i = 0; i < hdr->hrecs->nref; i++) {
4925
1.34k
            sam_hrec_type_t *ty;
4926
1.34k
            char *ref;
4927
4928
1.34k
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4929
0
                return -1;
4930
4931
1.34k
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4932
1.32k
                char unsigned buf[16];
4933
1.32k
                char buf2[33];
4934
1.32k
                hts_pos_t rlen;
4935
1.32k
                hts_md5_context *md5;
4936
4937
1.32k
                if (!fd->refs ||
4938
1.32k
                    !fd->refs->ref_id ||
4939
1.32k
                    !fd->refs->ref_id[i]) {
4940
0
                    return -1;
4941
0
                }
4942
1.32k
                rlen = fd->refs->ref_id[i]->length;
4943
1.32k
                ref = cram_get_ref(fd, i, 1, rlen);
4944
1.32k
                if (NULL == ref) {
4945
1.32k
                    if (fd->embed_ref == -1) {
4946
                        // auto embed-ref
4947
1.32k
                        hts_log_warning("No M5 tags present and could not "
4948
1.32k
                                        "find reference");
4949
1.32k
                        hts_log_warning("Enabling embed_ref=2 option");
4950
1.32k
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4951
1.32k
                                        "than using an external reference");
4952
1.32k
                        pthread_mutex_lock(&fd->ref_lock);
4953
                        // Best guess.  It may be unmapped data with broken
4954
                        // headers, in which case this will get ignored.
4955
1.32k
                        fd->embed_ref = 2;
4956
1.32k
                        pthread_mutex_unlock(&fd->ref_lock);
4957
1.32k
                        break;
4958
1.32k
                    }
4959
0
                    return -1;
4960
1.32k
                }
4961
0
                rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */
4962
0
                if (!(md5 = hts_md5_init()))
4963
0
                    return -1;
4964
0
                if (HTS_POS_MAX <= ULONG_MAX) {
4965
                    // Platforms with 64-bit unsigned long update in one go
4966
0
                    hts_md5_update(md5, ref, rlen);
4967
0
                } else {
4968
                    // Those with 32-bit ulong (Windows) may have to loop
4969
                    // over epic references
4970
0
                    hts_pos_t pos = 0;
4971
0
                    while (rlen - pos > ULONG_MAX) {
4972
0
                        hts_md5_update(md5, ref + pos, ULONG_MAX);
4973
0
                        pos += ULONG_MAX;
4974
0
                    }
4975
0
                    hts_md5_update(md5, ref + pos, (unsigned long)(rlen - pos));
4976
0
                }
4977
0
                hts_md5_final(buf, md5);
4978
0
                hts_md5_destroy(md5);
4979
0
                cram_ref_decr(fd->refs, i);
4980
4981
0
                hts_md5_hex(buf2, buf);
4982
0
                fd->refs->ref_id[i]->validated_md5 = 1;
4983
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL))
4984
0
                    return -1;
4985
0
            }
4986
4987
28
            if (fd->ref_fn) {
4988
0
                char ref_fn[PATH_MAX];
4989
0
                full_path(ref_fn, fd->ref_fn);
4990
0
                if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "UR", ref_fn, NULL))
4991
0
                    return -1;
4992
0
            }
4993
28
        }
4994
2.32k
    }
4995
4996
    /* Length */
4997
2.32k
    header_len = sam_hdr_length(hdr);
4998
2.32k
    if (header_len > INT32_MAX) {
4999
0
        hts_log_error("Header is too long for CRAM format");
5000
0
        return -1;
5001
0
    }
5002
2.32k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5003
0
        if (-1 == int32_encode(fd, header_len))
5004
0
            return -1;
5005
5006
        /* Text data */
5007
0
        if (header_len != hwrite(fd->fp, sam_hdr_str(hdr), header_len))
5008
0
            return -1;
5009
2.32k
    } else {
5010
        /* Create block(s) inside a container */
5011
2.32k
        cram_block *b = cram_new_block(FILE_HEADER, 0);
5012
2.32k
        cram_container *c = cram_new_container(0, 0);
5013
2.32k
        int padded_length;
5014
2.32k
        char *pads;
5015
2.32k
        int is_cram_3 = (CRAM_MAJOR_VERS(fd->version) >= 3);
5016
5017
2.32k
        if (!b || !c) {
5018
0
            if (b) cram_free_block(b);
5019
0
            if (c) cram_free_container(c);
5020
0
            return -1;
5021
0
        }
5022
5023
2.32k
        if (int32_put_blk(b, header_len) < 0)
5024
0
            return -1;
5025
2.32k
        if (header_len)
5026
1.50k
            BLOCK_APPEND(b, sam_hdr_str(hdr), header_len);
5027
2.32k
        BLOCK_UPLEN(b);
5028
5029
        // Compress header block if V3.0 and above
5030
2.32k
        if (CRAM_MAJOR_VERS(fd->version) >= 3)
5031
2.32k
            if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
5032
0
                return -1;
5033
5034
2.32k
        if (blank_block) {
5035
2.32k
            c->length = b->comp_size + 2 + 4*is_cram_3 +
5036
2.32k
                fd->vv.varint_size(b->content_id)   +
5037
2.32k
                fd->vv.varint_size(b->uncomp_size)  +
5038
2.32k
                fd->vv.varint_size(b->comp_size);
5039
5040
2.32k
            c->num_blocks = 2;
5041
2.32k
            c->num_landmarks = 2;
5042
2.32k
            if (!(c->landmark = malloc(2*sizeof(*c->landmark)))) {
5043
0
                cram_free_block(b);
5044
0
                cram_free_container(c);
5045
0
                return -1;
5046
0
            }
5047
2.32k
            c->landmark[0] = 0;
5048
2.32k
            c->landmark[1] = c->length;
5049
5050
            // Plus extra storage for uncompressed secondary blank block
5051
2.32k
            padded_length = MIN(c->length*.5, 10000);
5052
2.32k
            c->length += padded_length + 2 + 4*is_cram_3 +
5053
2.32k
                fd->vv.varint_size(b->content_id) +
5054
2.32k
                fd->vv.varint_size(padded_length)*2;
5055
2.32k
        } else {
5056
            // Pad the block instead.
5057
0
            c->num_blocks = 1;
5058
0
            c->num_landmarks = 1;
5059
0
            if (!(c->landmark = malloc(sizeof(*c->landmark))))
5060
0
                return -1;
5061
0
            c->landmark[0] = 0;
5062
5063
0
            padded_length = MAX(c->length*1.5, 10000) - c->length;
5064
5065
0
            c->length = b->comp_size + padded_length +
5066
0
                2 + 4*is_cram_3 +
5067
0
                fd->vv.varint_size(b->content_id)   +
5068
0
                fd->vv.varint_size(b->uncomp_size)  +
5069
0
                fd->vv.varint_size(b->comp_size);
5070
5071
0
            if (NULL == (pads = calloc(1, padded_length))) {
5072
0
                cram_free_block(b);
5073
0
                cram_free_container(c);
5074
0
                return -1;
5075
0
            }
5076
0
            BLOCK_APPEND(b, pads, padded_length);
5077
0
            BLOCK_UPLEN(b);
5078
0
            free(pads);
5079
0
        }
5080
5081
2.32k
        if (-1 == cram_write_container(fd, c)) {
5082
0
            cram_free_block(b);
5083
0
            cram_free_container(c);
5084
0
            return -1;
5085
0
        }
5086
5087
2.32k
        if (-1 == cram_write_block(fd, b)) {
5088
0
            cram_free_block(b);
5089
0
            cram_free_container(c);
5090
0
            return -1;
5091
0
        }
5092
5093
2.32k
        if (blank_block) {
5094
2.32k
            BLOCK_RESIZE(b, padded_length);
5095
2.32k
            memset(BLOCK_DATA(b), 0, padded_length);
5096
2.32k
            BLOCK_SIZE(b) = padded_length;
5097
2.32k
            BLOCK_UPLEN(b);
5098
2.32k
            b->method = RAW;
5099
2.32k
            if (-1 == cram_write_block(fd, b)) {
5100
0
                cram_free_block(b);
5101
0
                cram_free_container(c);
5102
0
                return -1;
5103
0
            }
5104
2.32k
        }
5105
5106
2.32k
        cram_free_block(b);
5107
2.32k
        cram_free_container(c);
5108
2.32k
    }
5109
5110
2.32k
    if (0 != hflush(fd->fp))
5111
0
        return -1;
5112
5113
2.32k
    RP("=== Finishing saving header ===\n");
5114
5115
2.32k
    return 0;
5116
5117
0
 block_err:
5118
0
    return -1;
5119
2.32k
}
5120
5121
/* ----------------------------------------------------------------------
5122
 * The top-level cram opening, closing and option handling
5123
 */
5124
5125
/*
5126
 * Sets CRAM variable sized integer decode function tables.
5127
 * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64.
5128
 * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit
5129
 * (or anything inbetween), but also now supports signed values.
5130
 *
5131
 * Version is the CRAM major version number.
5132
 * vv is the vector table (probably &cram_fd->vv)
5133
 */
5134
4.15k
static void cram_init_varint(varint_vec *vv, int version) {
5135
4.15k
    if (version >= 4) {
5136
25
        vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic
5137
25
        vv->varint_get32s = sint7_get_32;
5138
25
        vv->varint_get64 = uint7_get_64;
5139
25
        vv->varint_get64s = sint7_get_64;
5140
25
        vv->varint_put32 = uint7_put_32;
5141
25
        vv->varint_put32s = sint7_put_32;
5142
25
        vv->varint_put64 = uint7_put_64;
5143
25
        vv->varint_put64s = sint7_put_64;
5144
25
        vv->varint_put32_blk = uint7_put_blk_32;
5145
25
        vv->varint_put32s_blk = sint7_put_blk_32;
5146
25
        vv->varint_put64_blk = uint7_put_blk_64;
5147
25
        vv->varint_put64s_blk = sint7_put_blk_64;
5148
25
        vv->varint_size = uint7_size;
5149
25
        vv->varint_decode32_crc = uint7_decode_crc32;
5150
25
        vv->varint_decode32s_crc = sint7_decode_crc32;
5151
25
        vv->varint_decode64_crc = uint7_decode_crc64;
5152
4.13k
    } else {
5153
4.13k
        vv->varint_get32 = safe_itf8_get;
5154
4.13k
        vv->varint_get32s = safe_itf8_get;
5155
4.13k
        vv->varint_get64 = safe_ltf8_get;
5156
4.13k
        vv->varint_get64s = safe_ltf8_get;
5157
4.13k
        vv->varint_put32 = safe_itf8_put;
5158
4.13k
        vv->varint_put32s = safe_itf8_put;
5159
4.13k
        vv->varint_put64 = safe_ltf8_put;
5160
4.13k
        vv->varint_put64s = safe_ltf8_put;
5161
4.13k
        vv->varint_put32_blk = itf8_put_blk;
5162
4.13k
        vv->varint_put32s_blk = itf8_put_blk;
5163
4.13k
        vv->varint_put64_blk = ltf8_put_blk;
5164
4.13k
        vv->varint_put64s_blk = ltf8_put_blk;
5165
4.13k
        vv->varint_size = itf8_size;
5166
4.13k
        vv->varint_decode32_crc = itf8_decode_crc;
5167
4.13k
        vv->varint_decode32s_crc = itf8_decode_crc;
5168
4.13k
        vv->varint_decode64_crc = ltf8_decode_crc;
5169
4.13k
    }
5170
4.15k
}
5171
5172
/*
5173
 * Initialises the lookup tables. These could be global statics, but they're
5174
 * clumsy to setup in a multi-threaded environment unless we generate
5175
 * verbatim code and include that.
5176
 */
5177
4.15k
static void cram_init_tables(cram_fd *fd) {
5178
4.15k
    int i;
5179
5180
4.15k
    memset(fd->L1, 4, 256);
5181
4.15k
    fd->L1['A'] = 0; fd->L1['a'] = 0;
5182
4.15k
    fd->L1['C'] = 1; fd->L1['c'] = 1;
5183
4.15k
    fd->L1['G'] = 2; fd->L1['g'] = 2;
5184
4.15k
    fd->L1['T'] = 3; fd->L1['t'] = 3;
5185
5186
4.15k
    memset(fd->L2, 5, 256);
5187
4.15k
    fd->L2['A'] = 0; fd->L2['a'] = 0;
5188
4.15k
    fd->L2['C'] = 1; fd->L2['c'] = 1;
5189
4.15k
    fd->L2['G'] = 2; fd->L2['g'] = 2;
5190
4.15k
    fd->L2['T'] = 3; fd->L2['t'] = 3;
5191
4.15k
    fd->L2['N'] = 4; fd->L2['n'] = 4;
5192
5193
4.15k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
5194
796k
        for (i = 0; i < 0x200; i++) {
5195
795k
            int f = 0;
5196
5197
795k
            if (i & CRAM_FPAIRED)      f |= BAM_FPAIRED;
5198
795k
            if (i & CRAM_FPROPER_PAIR) f |= BAM_FPROPER_PAIR;
5199
795k
            if (i & CRAM_FUNMAP)       f |= BAM_FUNMAP;
5200
795k
            if (i & CRAM_FREVERSE)     f |= BAM_FREVERSE;
5201
795k
            if (i & CRAM_FREAD1)       f |= BAM_FREAD1;
5202
795k
            if (i & CRAM_FREAD2)       f |= BAM_FREAD2;
5203
795k
            if (i & CRAM_FSECONDARY)   f |= BAM_FSECONDARY;
5204
795k
            if (i & CRAM_FQCFAIL)      f |= BAM_FQCFAIL;
5205
795k
            if (i & CRAM_FDUP)         f |= BAM_FDUP;
5206
5207
795k
            fd->bam_flag_swap[i]  = f;
5208
795k
        }
5209
5210
6.36M
        for (i = 0; i < 0x1000; i++) {
5211
6.36M
            int g = 0;
5212
5213
6.36M
            if (i & BAM_FPAIRED)           g |= CRAM_FPAIRED;
5214
6.36M
            if (i & BAM_FPROPER_PAIR)  g |= CRAM_FPROPER_PAIR;
5215
6.36M
            if (i & BAM_FUNMAP)        g |= CRAM_FUNMAP;
5216
6.36M
            if (i & BAM_FREVERSE)      g |= CRAM_FREVERSE;
5217
6.36M
            if (i & BAM_FREAD1)        g |= CRAM_FREAD1;
5218
6.36M
            if (i & BAM_FREAD2)        g |= CRAM_FREAD2;
5219
6.36M
            if (i & BAM_FSECONDARY)    g |= CRAM_FSECONDARY;
5220
6.36M
            if (i & BAM_FQCFAIL)       g |= CRAM_FQCFAIL;
5221
6.36M
            if (i & BAM_FDUP)          g |= CRAM_FDUP;
5222
5223
6.36M
            fd->cram_flag_swap[i] = g;
5224
6.36M
        }
5225
2.60k
    } else {
5226
        /* NOP */
5227
10.6M
        for (i = 0; i < 0x1000; i++)
5228
10.6M
            fd->bam_flag_swap[i] = i;
5229
10.6M
        for (i = 0; i < 0x1000; i++)
5230
10.6M
            fd->cram_flag_swap[i] = i;
5231
2.60k
    }
5232
5233
4.15k
    memset(fd->cram_sub_matrix, 4, 32*32);
5234
137k
    for (i = 0; i < 32; i++) {
5235
132k
        fd->cram_sub_matrix[i]['A'&0x1f]=0;
5236
132k
        fd->cram_sub_matrix[i]['C'&0x1f]=1;
5237
132k
        fd->cram_sub_matrix[i]['G'&0x1f]=2;
5238
132k
        fd->cram_sub_matrix[i]['T'&0x1f]=3;
5239
132k
        fd->cram_sub_matrix[i]['N'&0x1f]=4;
5240
132k
    }
5241
24.9k
    for (i = 0; i < 20; i+=4) {
5242
20.7k
        int j;
5243
436k
        for (j = 0; j < 20; j++) {
5244
415k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5245
415k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5246
415k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5247
415k
            fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][j]=3;
5248
415k
        }
5249
20.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+0]&0x1f]=0;
5250
20.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+1]&0x1f]=1;
5251
20.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2;
5252
20.7k
        fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3;
5253
20.7k
    }
5254
5255
4.15k
    cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version));
5256
4.15k
}
5257
5258
// Default version numbers for CRAM
5259
static int major_version = 3;
5260
static int minor_version = 1;
5261
5262
/*
5263
 * Opens a CRAM file for read (mode "rb") or write ("wb").
5264
 * The filename may be "-" to indicate stdin or stdout.
5265
 *
5266
 * Returns file handle on success
5267
 *         NULL on failure.
5268
 */
5269
0
cram_fd *cram_open(const char *filename, const char *mode) {
5270
0
    hFILE *fp;
5271
0
    cram_fd *fd;
5272
0
    char fmode[3]= { mode[0], '\0', '\0' };
5273
5274
0
    if (strlen(mode) > 1 && (mode[1] == 'b' || mode[1] == 'c')) {
5275
0
        fmode[1] = 'b';
5276
0
    }
5277
5278
0
    fp = hopen(filename, fmode);
5279
0
    if (!fp)
5280
0
        return NULL;
5281
5282
0
    fd = cram_dopen(fp, filename, mode);
5283
0
    if (!fd)
5284
0
        hclose_abruptly(fp);
5285
5286
0
    return fd;
5287
0
}
5288
5289
/* Opens an existing stream for reading or writing.
5290
 *
5291
 * Returns file handle on success;
5292
 *         NULL on failure.
5293
 */
5294
4.15k
cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) {
5295
4.15k
    int i;
5296
4.15k
    char *cp;
5297
4.15k
    cram_fd *fd = calloc(1, sizeof(*fd));
5298
4.15k
    if (!fd)
5299
0
        return NULL;
5300
5301
4.15k
    fd->level = CRAM_DEFAULT_LEVEL;
5302
12.4k
    for (i = 0; mode[i]; i++) {
5303
8.31k
        if (mode[i] >= '0' && mode[i] <= '9') {
5304
0
            fd->level = mode[i] - '0';
5305
0
            break;
5306
0
        }
5307
8.31k
    }
5308
5309
4.15k
    fd->fp = fp;
5310
4.15k
    fd->mode = *mode;
5311
4.15k
    fd->first_container = 0;
5312
4.15k
    fd->curr_position = 0;
5313
5314
4.15k
    if (fd->mode == 'r') {
5315
        /* Reader */
5316
5317
1.61k
        if (!(fd->file_def = cram_read_file_def(fd)))
5318
0
            goto err;
5319
5320
1.61k
        fd->version = fd->file_def->major_version * 256 +
5321
1.61k
            fd->file_def->minor_version;
5322
5323
1.61k
        cram_init_tables(fd);
5324
5325
1.61k
        if (!(fd->header = cram_read_SAM_hdr(fd))) {
5326
36
            cram_free_file_def(fd->file_def);
5327
36
            goto err;
5328
36
        }
5329
5330
2.54k
    } else {
5331
        /* Writer */
5332
2.54k
        cram_file_def *def = calloc(1, sizeof(*def));
5333
2.54k
        if (!def)
5334
0
            return NULL;
5335
5336
2.54k
        fd->file_def = def;
5337
5338
2.54k
        def->magic[0] = 'C';
5339
2.54k
        def->magic[1] = 'R';
5340
2.54k
        def->magic[2] = 'A';
5341
2.54k
        def->magic[3] = 'M';
5342
2.54k
        def->major_version = 0; // Indicator to write file def later.
5343
2.54k
        def->minor_version = 0;
5344
2.54k
        memset(def->file_id, 0, 20);
5345
2.54k
        strncpy(def->file_id, filename, 20);
5346
5347
2.54k
        fd->version = major_version * 256 + minor_version;
5348
2.54k
        cram_init_tables(fd);
5349
5350
        /* SAM header written later along with this file_def */
5351
2.54k
    }
5352
5353
4.12k
    fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename);
5354
4.12k
    if (!fd->prefix)
5355
0
        goto err;
5356
4.12k
    fd->first_base = fd->last_base = -1;
5357
4.12k
    fd->record_counter = 0;
5358
5359
4.12k
    fd->ctr = NULL;
5360
4.12k
    fd->ctr_mt = NULL;
5361
4.12k
    fd->refs  = refs_create();
5362
4.12k
    if (!fd->refs)
5363
0
        goto err;
5364
4.12k
    fd->ref_id = -2;
5365
4.12k
    fd->ref = NULL;
5366
5367
4.12k
    fd->decode_md = 0;
5368
4.12k
    fd->seqs_per_slice = SEQS_PER_SLICE;
5369
4.12k
    fd->bases_per_slice = BASES_PER_SLICE;
5370
4.12k
    fd->slices_per_container = SLICE_PER_CNT;
5371
4.12k
    fd->embed_ref = -1; // automatic selection
5372
4.12k
    fd->no_ref = 0;
5373
4.12k
    fd->no_ref_counter = 0;
5374
4.12k
    fd->ap_delta = 0;
5375
4.12k
    fd->ignore_md5 = 0;
5376
4.12k
    fd->lossy_read_names = 0;
5377
4.12k
    fd->use_bz2 = 0;
5378
4.12k
    fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3);
5379
4.12k
    fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1);
5380
4.12k
    fd->use_lzma = 0;
5381
4.12k
    fd->multi_seq = -1;
5382
4.12k
    fd->multi_seq_user = -1;
5383
4.12k
    fd->unsorted   = 0;
5384
4.12k
    fd->shared_ref = 0;
5385
4.12k
    fd->store_md = 0;
5386
4.12k
    fd->store_nm = 0;
5387
4.12k
    fd->last_RI_count = 0;
5388
5389
4.12k
    fd->index       = NULL;
5390
4.12k
    fd->own_pool    = 0;
5391
4.12k
    fd->pool        = NULL;
5392
4.12k
    fd->rqueue      = NULL;
5393
4.12k
    fd->job_pending = NULL;
5394
4.12k
    fd->ooc         = 0;
5395
4.12k
    fd->required_fields = INT_MAX;
5396
5397
4.12k
    pthread_mutex_init(&fd->metrics_lock, NULL);
5398
4.12k
    pthread_mutex_init(&fd->ref_lock, NULL);
5399
4.12k
    pthread_mutex_init(&fd->range_lock, NULL);
5400
4.12k
    pthread_mutex_init(&fd->bam_list_lock, NULL);
5401
5402
197k
    for (i = 0; i < DS_END; i++) {
5403
193k
        fd->m[i] = cram_new_metrics();
5404
193k
        if (!fd->m[i])
5405
0
            goto err;
5406
193k
    }
5407
5408
4.12k
    if (!(fd->tags_used = kh_init(m_metrics)))
5409
0
        goto err;
5410
5411
4.12k
    fd->range.refid = -2; // no ref.
5412
4.12k
    fd->eof = 1;          // See samtools issue #150
5413
4.12k
    fd->ref_fn = NULL;
5414
5415
4.12k
    fd->bl = NULL;
5416
5417
    /* Initialise dummy refs from the @SQ headers */
5418
4.12k
    if (-1 == refs_from_header(fd))
5419
0
        goto err;
5420
5421
4.12k
    return fd;
5422
5423
36
 err:
5424
36
    if (fd)
5425
36
        free(fd);
5426
5427
36
    return NULL;
5428
4.12k
}
5429
5430
/*
5431
 * Seek within a CRAM file.
5432
 *
5433
 * Returns 0 on success
5434
 *        -1 on failure
5435
 */
5436
0
int cram_seek(cram_fd *fd, off_t offset, int whence) {
5437
0
    fd->ooc = 0;
5438
5439
0
    cram_drain_rqueue(fd);
5440
5441
0
    return hseek(fd->fp, offset, whence) >= 0 ? 0 : -1;
5442
0
}
5443
5444
/*
5445
 * Flushes a CRAM file.
5446
 * Useful for when writing to stdout without wishing to close the stream.
5447
 *
5448
 * Returns 0 on success
5449
 *        -1 on failure
5450
 */
5451
0
int cram_flush(cram_fd *fd) {
5452
0
    if (!fd)
5453
0
        return -1;
5454
5455
0
    int ret = 0;
5456
5457
0
    if (fd->mode == 'w' && fd->ctr) {
5458
0
        if(fd->ctr->slice)
5459
0
            cram_update_curr_slice(fd->ctr, fd->version);
5460
5461
0
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5462
0
            ret = -1;
5463
5464
0
        cram_free_container(fd->ctr);
5465
0
        if (fd->ctr_mt == fd->ctr)
5466
0
            fd->ctr_mt = NULL;
5467
0
        fd->ctr = NULL;
5468
0
    }
5469
5470
0
    return ret;
5471
0
}
5472
5473
/*
5474
 * Writes an EOF block to a CRAM file.
5475
 *
5476
 * Returns 0 on success
5477
 *        -1 on failure
5478
 */
5479
2.40k
int cram_write_eof_block(cram_fd *fd) {
5480
    // EOF block is a container with special values to aid detection
5481
2.40k
    if (CRAM_MAJOR_VERS(fd->version) >= 2) {
5482
        // Empty container with
5483
        //   ref_seq_id -1
5484
        //   start pos 0x454f46 ("EOF")
5485
        //   span 0
5486
        //   nrec 0
5487
        //   counter 0
5488
        //   nbases 0
5489
        //   1 block (landmark 0)
5490
        //   (CRC32)
5491
2.40k
        cram_container c;
5492
2.40k
        memset(&c, 0, sizeof(c));
5493
2.40k
        c.ref_seq_id = -1;
5494
2.40k
        c.ref_seq_start = 0x454f46; // "EOF"
5495
2.40k
        c.ref_seq_span = 0;
5496
2.40k
        c.record_counter = 0;
5497
2.40k
        c.num_bases = 0;
5498
2.40k
        c.num_blocks = 1;
5499
2.40k
        int32_t land[1] = {0};
5500
2.40k
        c.landmark = land;
5501
5502
        // An empty compression header block with
5503
        //   method raw (0)
5504
        //   type comp header (1)
5505
        //   content id 0
5506
        //   block contents size 6
5507
        //   raw size 6
5508
        //     empty preservation map (01 00)
5509
        //     empty data series map (01 00)
5510
        //     empty tag map (01 00)
5511
        //   block CRC
5512
2.40k
        cram_block_compression_hdr ch;
5513
2.40k
        memset(&ch, 0, sizeof(ch));
5514
2.40k
        c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0);
5515
5516
2.40k
        c.length = c.comp_hdr_block->byte            // Landmark[0]
5517
2.40k
            + 5                                      // block struct
5518
2.40k
            + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC
5519
2.40k
        if (cram_write_container(fd, &c) < 0 ||
5520
2.40k
            cram_write_block(fd, c.comp_hdr_block) < 0) {
5521
0
            cram_close(fd);
5522
0
            cram_free_block(c.comp_hdr_block);
5523
0
            return -1;
5524
0
        }
5525
2.40k
        if (ch.preservation_map)
5526
0
            kh_destroy(map, ch.preservation_map);
5527
2.40k
        cram_free_block(c.comp_hdr_block);
5528
5529
        // V2.1 bytes
5530
        // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5531
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5532
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5533
        // 00 01 00 06 06             // Comp.HDR blk
5534
        // 01 00 01 00 01 00          // Comp.HDR blk
5535
5536
        // V3.0 bytes:
5537
        // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id
5538
        // e0 45 4f 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5539
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5540
        // 05 bd d9 4f                // CRC32
5541
        // 00 01 00 06 06             // Comp.HDR blk
5542
        // 01 00 01 00 01 00          // Comp.HDR blk
5543
        // ee 63 01 4b                // CRC32
5544
5545
        // V4.0 bytes:
5546
        // 0f 00 00 00 8f ff ff ff    // Cont HDR: size, ref seq id
5547
        // 82 95 9e 46 00 00 00       // Cont HDR: pos, span, nrec, counter
5548
        // 00 01 00                   // Cont HDR: nbase, nblk, landmark
5549
        // ac d6 05 bc                // CRC32
5550
        // 00 01 00 06 06             // Comp.HDR blk
5551
        // 01 00 01 00 01 00          // Comp.HDR blk
5552
        // ee 63 01 4b                // CRC32
5553
2.40k
    }
5554
5555
2.40k
    return 0;
5556
2.40k
}
5557
5558
/*
5559
 * Closes a CRAM file.
5560
 * Returns 0 on success
5561
 *        -1 on failure
5562
 */
5563
4.12k
int cram_close(cram_fd *fd) {
5564
4.12k
    spare_bams *bl, *next;
5565
4.12k
    int i, ret = 0;
5566
5567
4.12k
    if (!fd)
5568
0
        return -1;
5569
5570
4.12k
    if (fd->mode == 'w' && fd->ctr) {
5571
1.61k
        if(fd->ctr->slice)
5572
1.61k
            cram_update_curr_slice(fd->ctr, fd->version);
5573
5574
1.61k
        if (-1 == cram_flush_container_mt(fd, fd->ctr))
5575
137
            ret = -1;
5576
1.61k
    }
5577
5578
4.12k
    if (fd->mode != 'w')
5579
1.58k
        cram_drain_rqueue(fd);
5580
5581
4.12k
    if (fd->pool && fd->eof >= 0 && fd->rqueue) {
5582
0
        hts_tpool_process_flush(fd->rqueue);
5583
5584
0
        if (0 != cram_flush_result(fd))
5585
0
            ret = -1;
5586
5587
0
        if (fd->mode == 'w')
5588
0
            fd->ctr = NULL; // prevent double freeing
5589
5590
        //fprintf(stderr, "CRAM: destroy queue %p\n", fd->rqueue);
5591
5592
0
        hts_tpool_process_destroy(fd->rqueue);
5593
0
    }
5594
5595
4.12k
    pthread_mutex_destroy(&fd->metrics_lock);
5596
4.12k
    pthread_mutex_destroy(&fd->ref_lock);
5597
4.12k
    pthread_mutex_destroy(&fd->range_lock);
5598
4.12k
    pthread_mutex_destroy(&fd->bam_list_lock);
5599
5600
4.12k
    if (ret == 0 && fd->mode == 'w') {
5601
        /* Write EOF block */
5602
2.40k
        if (0 != cram_write_eof_block(fd))
5603
0
            ret = -1;
5604
2.40k
    }
5605
5606
5.65k
    for (bl = fd->bl; bl; bl = next) {
5607
1.53k
        int max_rec = fd->seqs_per_slice * fd->slices_per_container;
5608
5609
1.53k
        next = bl->next;
5610
1.53k
        free_bam_list(bl->bams, max_rec);
5611
1.53k
        free(bl);
5612
1.53k
    }
5613
5614
4.12k
    if (hclose(fd->fp) != 0)
5615
0
        ret = -1;
5616
5617
4.12k
    if (fd->file_def)
5618
4.12k
        cram_free_file_def(fd->file_def);
5619
5620
4.12k
    if (fd->header)
5621
4.01k
        sam_hdr_destroy(fd->header);
5622
5623
4.12k
    free(fd->prefix);
5624
5625
4.12k
    if (fd->ctr)
5626
2.64k
        cram_free_container(fd->ctr);
5627
5628
4.12k
    if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
5629
52
        cram_free_container(fd->ctr_mt);
5630
5631
4.12k
    if (fd->refs)
5632
4.12k
        refs_free(fd->refs);
5633
4.12k
    if (fd->ref_free)
5634
0
        free(fd->ref_free);
5635
5636
197k
    for (i = 0; i < DS_END; i++)
5637
193k
        if (fd->m[i])
5638
193k
            free(fd->m[i]);
5639
5640
4.12k
    if (fd->tags_used) {
5641
4.12k
        khint_t k;
5642
5643
18.6k
        for (k = kh_begin(fd->tags_used); k != kh_end(fd->tags_used); k++) {
5644
14.5k
            if (kh_exist(fd->tags_used, k))
5645
7.25k
                free(kh_val(fd->tags_used, k));
5646
14.5k
        }
5647
5648
4.12k
        kh_destroy(m_metrics, fd->tags_used);
5649
4.12k
    }
5650
5651
4.12k
    if (fd->index)
5652
0
        cram_index_free(fd);
5653
5654
4.12k
    if (fd->own_pool && fd->pool)
5655
0
        hts_tpool_destroy(fd->pool);
5656
5657
4.12k
    if (fd->idxfp)
5658
0
        if (bgzf_close(fd->idxfp) < 0)
5659
0
            ret = -1;
5660
5661
4.12k
    free(fd);
5662
5663
4.12k
    return ret;
5664
4.12k
}
5665
5666
/*
5667
 * Returns 1 if we hit an EOF while reading.
5668
 */
5669
2.65k
int cram_eof(cram_fd *fd) {
5670
2.65k
    return fd->eof;
5671
2.65k
}
5672
5673
5674
/*
5675
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5676
 * Use this immediately after opening.
5677
 *
5678
 * Returns 0 on success
5679
 *        -1 on failure
5680
 */
5681
1.58k
int cram_set_option(cram_fd *fd, enum hts_fmt_option opt, ...) {
5682
1.58k
    int r;
5683
1.58k
    va_list args;
5684
5685
1.58k
    va_start(args, opt);
5686
1.58k
    r = cram_set_voption(fd, opt, args);
5687
1.58k
    va_end(args);
5688
5689
1.58k
    return r;
5690
1.58k
}
5691
5692
/*
5693
 * Sets options on the cram_fd. See CRAM_OPT_* definitions in cram_structs.h.
5694
 * Use this immediately after opening.
5695
 *
5696
 * Returns 0 on success
5697
 *        -1 on failure
5698
 */
5699
1.58k
int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) {
5700
1.58k
    refs_t *refs;
5701
5702
1.58k
    if (!fd) {
5703
0
        errno = EBADF;
5704
0
        return -1;
5705
0
    }
5706
5707
1.58k
    switch (opt) {
5708
1.58k
    case CRAM_OPT_DECODE_MD:
5709
1.58k
        fd->decode_md = va_arg(args, int);
5710
1.58k
        break;
5711
5712
0
    case CRAM_OPT_PREFIX:
5713
0
        if (fd->prefix)
5714
0
            free(fd->prefix);
5715
0
        if (!(fd->prefix = strdup(va_arg(args, char *))))
5716
0
            return -1;
5717
0
        break;
5718
5719
0
    case CRAM_OPT_VERBOSITY:
5720
0
        break;
5721
5722
0
    case CRAM_OPT_SEQS_PER_SLICE:
5723
0
        fd->seqs_per_slice = va_arg(args, int);
5724
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5725
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5726
0
        break;
5727
5728
0
    case CRAM_OPT_BASES_PER_SLICE:
5729
0
        fd->bases_per_slice = va_arg(args, int);
5730
0
        break;
5731
5732
0
    case CRAM_OPT_SLICES_PER_CONTAINER:
5733
0
        fd->slices_per_container = va_arg(args, int);
5734
0
        break;
5735
5736
0
    case CRAM_OPT_EMBED_REF:
5737
0
        fd->embed_ref = va_arg(args, int);
5738
0
        break;
5739
5740
0
    case CRAM_OPT_NO_REF:
5741
0
        fd->no_ref = va_arg(args, int);
5742
0
        break;
5743
5744
0
    case CRAM_OPT_POS_DELTA:
5745
0
        fd->ap_delta = va_arg(args, int);
5746
0
        break;
5747
5748
0
    case CRAM_OPT_IGNORE_MD5:
5749
0
        fd->ignore_md5 = va_arg(args, int);
5750
0
        break;
5751
5752
0
    case CRAM_OPT_LOSSY_NAMES:
5753
0
        fd->lossy_read_names = va_arg(args, int);
5754
        // Currently lossy read names required paired (attached) reads.
5755
        // TLEN 0 or being 1 out causes read pairs to be detached, breaking
5756
        // the lossy read name compression, so we have extra options to
5757
        // slacken the exact TLEN round-trip checks.
5758
0
        fd->tlen_approx = fd->lossy_read_names;
5759
0
        fd->tlen_zero = fd->lossy_read_names;
5760
0
        break;
5761
5762
0
    case CRAM_OPT_USE_BZIP2:
5763
0
        fd->use_bz2 = va_arg(args, int);
5764
0
        break;
5765
5766
0
    case CRAM_OPT_USE_RANS:
5767
0
        fd->use_rans = va_arg(args, int);
5768
0
        break;
5769
5770
0
    case CRAM_OPT_USE_TOK:
5771
0
        fd->use_tok = va_arg(args, int);
5772
0
        break;
5773
5774
0
    case CRAM_OPT_USE_FQZ:
5775
0
        fd->use_fqz = va_arg(args, int);
5776
0
        break;
5777
5778
0
    case CRAM_OPT_USE_ARITH:
5779
0
        fd->use_arith = va_arg(args, int);
5780
0
        break;
5781
5782
0
    case CRAM_OPT_USE_LZMA:
5783
0
        fd->use_lzma = va_arg(args, int);
5784
0
        break;
5785
5786
0
    case CRAM_OPT_SHARED_REF:
5787
0
        fd->shared_ref = 1;
5788
0
        refs = va_arg(args, refs_t *);
5789
0
        if (refs != fd->refs) {
5790
0
            if (fd->refs)
5791
0
                refs_free(fd->refs);
5792
0
            fd->refs = refs;
5793
0
            fd->refs->count++;
5794
0
        }
5795
0
        break;
5796
5797
0
    case CRAM_OPT_RANGE: {
5798
0
        int r = cram_seek_to_refpos(fd, va_arg(args, cram_range *));
5799
0
        pthread_mutex_lock(&fd->range_lock);
5800
//        printf("opt range noseek to %p %d:%ld-%ld\n",
5801
//               fd, fd->range.refid, fd->range.start, fd->range.end);
5802
0
        if (fd->range.refid != -2)
5803
0
            fd->required_fields |= SAM_POS;
5804
0
        pthread_mutex_unlock(&fd->range_lock);
5805
0
        return r;
5806
0
    }
5807
5808
0
    case CRAM_OPT_RANGE_NOSEEK: {
5809
        // As per CRAM_OPT_RANGE, but no seeking
5810
0
        pthread_mutex_lock(&fd->range_lock);
5811
0
        cram_range *r = va_arg(args, cram_range *);
5812
0
        fd->range = *r;
5813
0
        if (r->refid == HTS_IDX_NOCOOR) {
5814
0
            fd->range.refid = -1;
5815
0
            fd->range.start = 0;
5816
0
        } else if (r->refid == HTS_IDX_START || r->refid == HTS_IDX_REST) {
5817
0
            fd->range.refid = -2; // special case in cram_next_slice
5818
0
        }
5819
0
        if (fd->range.refid != -2)
5820
0
            fd->required_fields |= SAM_POS;
5821
0
        fd->ooc = 0;
5822
0
        fd->eof = 0;
5823
0
        pthread_mutex_unlock(&fd->range_lock);
5824
0
        return 0;
5825
0
    }
5826
5827
0
    case CRAM_OPT_REFERENCE:
5828
0
        return cram_load_reference(fd, va_arg(args, char *));
5829
5830
0
    case CRAM_OPT_VERSION: {
5831
0
        int major, minor;
5832
0
        char *s = va_arg(args, char *);
5833
0
        if (2 != sscanf(s, "%d.%d", &major, &minor)) {
5834
0
            hts_log_error("Malformed version string %s", s);
5835
0
            return -1;
5836
0
        }
5837
0
        if (!((major == 1 &&  minor == 0) ||
5838
0
              (major == 2 && (minor == 0 || minor == 1)) ||
5839
0
              (major == 3 && (minor == 0 || minor == 1)) ||
5840
0
              (major == 4 &&  minor == 0))) {
5841
0
            hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0");
5842
0
            errno = EINVAL;
5843
0
            return -1;
5844
0
        }
5845
5846
0
        if (major > 3 || (major == 3 && minor > 1)) {
5847
0
            hts_log_warning(
5848
0
                "CRAM version %s is still a draft and subject to change.\n"
5849
0
                "This is a technology demonstration that should not be "
5850
0
                "used for archival data.", s);
5851
0
        }
5852
5853
0
        fd->version = major*256 + minor;
5854
5855
0
        fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0;
5856
5857
0
        fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 &&
5858
0
                        CRAM_MINOR_VERS(fd->version) >= 1) ||
5859
0
                        CRAM_MAJOR_VERS(fd->version) >= 4) ? 1 : 0;
5860
0
        cram_init_tables(fd);
5861
5862
0
        break;
5863
0
    }
5864
5865
0
    case CRAM_OPT_MULTI_SEQ_PER_SLICE:
5866
0
        fd->multi_seq_user = fd->multi_seq = va_arg(args, int);
5867
0
        break;
5868
5869
0
    case CRAM_OPT_NTHREADS: {
5870
0
        int nthreads =  va_arg(args, int);
5871
0
        if (nthreads >= 1) {
5872
0
            if (!(fd->pool = hts_tpool_init(nthreads)))
5873
0
                return -1;
5874
5875
0
            fd->rqueue = hts_tpool_process_init(fd->pool, nthreads*2, 0);
5876
0
            fd->shared_ref = 1;
5877
0
            fd->own_pool = 1;
5878
0
        }
5879
0
        break;
5880
0
    }
5881
5882
0
    case CRAM_OPT_THREAD_POOL: {
5883
0
        htsThreadPool *p = va_arg(args, htsThreadPool *);
5884
0
        fd->pool = p ? p->pool : NULL;
5885
0
        if (fd->pool) {
5886
0
            fd->rqueue = hts_tpool_process_init(fd->pool,
5887
0
                                                p->qsize ? p->qsize : hts_tpool_size(fd->pool)*2,
5888
0
                                                0);
5889
0
        }
5890
0
        fd->shared_ref = 1; // Needed to avoid clobbering ref between threads
5891
0
        fd->own_pool = 0;
5892
5893
        //fd->qsize = 1;
5894
        //fd->decoded = calloc(fd->qsize, sizeof(cram_container *));
5895
        //hts_tpool_dispatch(fd->pool, cram_decoder_thread, fd);
5896
0
        break;
5897
0
    }
5898
5899
0
    case CRAM_OPT_REQUIRED_FIELDS:
5900
0
        fd->required_fields = va_arg(args, int);
5901
0
        if (fd->range.refid != -2)
5902
0
            fd->required_fields |= SAM_POS;
5903
0
        break;
5904
5905
0
    case CRAM_OPT_STORE_MD:
5906
0
        fd->store_md = va_arg(args, int);
5907
0
        break;
5908
5909
0
    case CRAM_OPT_STORE_NM:
5910
0
        fd->store_nm = va_arg(args, int);
5911
0
        break;
5912
5913
0
    case HTS_OPT_COMPRESSION_LEVEL:
5914
0
        fd->level = va_arg(args, int);
5915
0
        break;
5916
5917
0
    case HTS_OPT_PROFILE: {
5918
0
        enum hts_profile_option prof = va_arg(args, int);
5919
0
        switch (prof) {
5920
0
        case HTS_PROFILE_FAST:
5921
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1;
5922
0
            fd->use_tok = 0;
5923
0
            fd->seqs_per_slice = 10000;
5924
0
            break;
5925
5926
0
        case HTS_PROFILE_NORMAL:
5927
0
            break;
5928
5929
0
        case HTS_PROFILE_SMALL:
5930
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6;
5931
0
            fd->use_bz2 = 1;
5932
0
            fd->use_fqz = 1;
5933
0
            fd->seqs_per_slice = 25000;
5934
0
            break;
5935
5936
0
        case HTS_PROFILE_ARCHIVE:
5937
0
            if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7;
5938
0
            fd->use_bz2 = 1;
5939
0
            fd->use_fqz = 1;
5940
0
            fd->use_arith = 1;
5941
0
            if (fd->level > 7)
5942
0
                fd->use_lzma = 1;
5943
0
            fd->seqs_per_slice = 100000;
5944
0
            break;
5945
0
        }
5946
5947
0
        if (fd->bases_per_slice == BASES_PER_SLICE)
5948
0
            fd->bases_per_slice = fd->seqs_per_slice * 500;
5949
0
        break;
5950
0
    }
5951
5952
0
    default:
5953
0
        hts_log_error("Unknown CRAM option code %d", opt);
5954
0
        errno = EINVAL;
5955
0
        return -1;
5956
1.58k
    }
5957
5958
1.58k
    return 0;
5959
1.58k
}
5960
5961
int cram_check_EOF(cram_fd *fd)
5962
0
{
5963
    // Byte 9 in these templates is & with 0x0f to resolve differences
5964
    // between ITF-8 interpretations between early Java and C
5965
    // implementations of CRAM
5966
0
    static const unsigned char TEMPLATE_2_1[30] = {
5967
0
        0x0b, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5968
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
5969
0
        0x01, 0x00, 0x06, 0x06, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00
5970
0
    };
5971
0
    static const unsigned char TEMPLATE_3[38] = {
5972
0
        0x0f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x0f, 0xe0,
5973
0
        0x45, 0x4f, 0x46, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05,
5974
0
        0xbd, 0xd9, 0x4f, 0x00, 0x01, 0x00, 0x06, 0x06, 0x01, 0x00,
5975
0
        0x01, 0x00, 0x01, 0x00, 0xee, 0x63, 0x01, 0x4b
5976
0
    };
5977
5978
0
    unsigned char buf[38]; // max(sizeof TEMPLATE_*)
5979
5980
0
    uint8_t major = CRAM_MAJOR_VERS(fd->version);
5981
0
    uint8_t minor = CRAM_MINOR_VERS(fd->version);
5982
5983
0
    const unsigned char *template;
5984
0
    ssize_t template_len;
5985
0
    if ((major < 2) ||
5986
0
        (major == 2 && minor == 0)) {
5987
0
        return 3; // No EOF support in cram versions less than 2.1
5988
0
    } else if (major == 2 && minor == 1) {
5989
0
        template = TEMPLATE_2_1;
5990
0
        template_len = sizeof TEMPLATE_2_1;
5991
0
    } else {
5992
0
        template = TEMPLATE_3;
5993
0
        template_len = sizeof TEMPLATE_3;
5994
0
    }
5995
5996
0
    off_t offset = htell(fd->fp);
5997
0
    if (hseek(fd->fp, -template_len, SEEK_END) < 0) {
5998
0
        if (errno == ESPIPE) {
5999
0
            hclearerr(fd->fp);
6000
0
            return 2;
6001
0
        }
6002
0
        else {
6003
0
            return -1;
6004
0
        }
6005
0
    }
6006
0
    if (hread(fd->fp, buf, template_len) != template_len) return -1;
6007
0
    if (hseek(fd->fp, offset, SEEK_SET) < 0) return -1;
6008
0
    buf[8] &= 0x0f;
6009
0
    return (memcmp(template, buf, template_len) == 0)? 1 : 0;
6010
0
}