Coverage Report

Created: 2023-01-17 06:24

/src/htslib/cram/cram_io.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2022 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * CRAM I/O primitives.
33
 *
34
 * - ITF8 encoding and decoding.
35
 * - Block based I/O
36
 * - Zlib inflating and deflating (memory)
37
 * - CRAM basic data structure reading and writing
38
 * - File opening / closing
39
 * - Reference sequence handling
40
 */
41
42
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
43
#include <config.h>
44
45
#include <stdio.h>
46
#include <errno.h>
47
#include <assert.h>
48
#include <stdlib.h>
49
#include <string.h>
50
#include <unistd.h>
51
#include <zlib.h>
52
#ifdef HAVE_LIBBZ2
53
#include <bzlib.h>
54
#endif
55
#ifdef HAVE_LIBLZMA
56
#ifdef HAVE_LZMA_H
57
#include <lzma.h>
58
#else
59
#include "../os/lzma_stub.h"
60
#endif
61
#endif
62
#include <sys/types.h>
63
#include <sys/stat.h>
64
#include <math.h>
65
#include <stdint.h>
66
67
#ifdef HAVE_LIBDEFLATE
68
#include <libdeflate.h>
69
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c))
70
#endif
71
72
#include "cram.h"
73
#include "os.h"
74
#include "../htslib/hts.h"
75
#include "open_trace_file.h"
76
77
#if defined(HAVE_EXTERNAL_LIBHTSCODECS)
78
#include <htscodecs/rANS_static.h>
79
#include <htscodecs/rANS_static4x16.h>
80
#include <htscodecs/arith_dynamic.h>
81
#include <htscodecs/tokenise_name3.h>
82
#include <htscodecs/fqzcomp_qual.h>
83
#include <htscodecs/varint.h> // CRAM v4.0 variable-size integers
84
#else
85
#include "../htscodecs/htscodecs/rANS_static.h"
86
#include "../htscodecs/htscodecs/rANS_static4x16.h"
87
#include "../htscodecs/htscodecs/arith_dynamic.h"
88
#include "../htscodecs/htscodecs/tokenise_name3.h"
89
#include "../htscodecs/htscodecs/fqzcomp_qual.h"
90
#include "../htscodecs/htscodecs/varint.h"
91
#endif
92
93
//#define REF_DEBUG
94
95
#ifdef REF_DEBUG
96
#include <sys/syscall.h>
97
#define gettid() (int)syscall(SYS_gettid)
98
99
#define RP(...) fprintf (stderr, __VA_ARGS__)
100
#else
101
#define RP(...)
102
#endif
103
104
#include "../htslib/hfile.h"
105
#include "../htslib/bgzf.h"
106
#include "../htslib/faidx.h"
107
#include "../hts_internal.h"
108
109
#ifndef PATH_MAX
110
#define PATH_MAX FILENAME_MAX
111
#endif
112
113
11.5k
#define TRIAL_SPAN 70
114
11.5k
#define NTRIALS 3
115
116
275
#define CRAM_DEFAULT_LEVEL 5
117
118
/* ----------------------------------------------------------------------
119
 * ITF8 encoding and decoding.
120
 *
121
 * Also see the itf8_get and itf8_put macros in cram_io.h
122
 */
123
124
/*
125
 * LEGACY: consider using itf8_decode_crc.
126
 *
127
 * Reads an integer in ITF-8 encoding from 'cp' and stores it in
128
 * *val.
129
 *
130
 * Returns the number of bytes read on success
131
 *        -1 on failure
132
 */
133
0
int itf8_decode(cram_fd *fd, int32_t *val_p) {
134
0
    static int nbytes[16] = {
135
0
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
136
0
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
137
0
        2,2,                                            // 1100xxxx - 1101xxxx
138
0
        3,                                              // 1110xxxx
139
0
        4,                                              // 1111xxxx
140
0
    };
141
142
0
    static int nbits[16] = {
143
0
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
144
0
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
145
0
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
146
0
        0x0f,                                           // 1110xxxx
147
0
        0x0f,                                           // 1111xxxx
148
0
    };
149
150
0
    int32_t val = hgetc(fd->fp);
151
0
    if (val == -1)
152
0
        return -1;
153
154
0
    int i = nbytes[val>>4];
155
0
    val &= nbits[val>>4];
156
157
0
    switch(i) {
158
0
    case 0:
159
0
        *val_p = val;
160
0
        return 1;
161
162
0
    case 1:
163
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
164
0
        *val_p = val;
165
0
        return 2;
166
167
0
    case 2:
168
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
169
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
170
0
        *val_p = val;
171
0
        return 3;
172
173
0
    case 3:
174
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
175
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
176
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
177
0
        *val_p = val;
178
0
        return 4;
179
180
0
    case 4: // really 3.5 more, why make it different?
181
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
182
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
183
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
184
0
        val = (val<<4) | (((unsigned char)hgetc(fd->fp)) & 0x0f);
185
0
        *val_p = val;
186
0
    }
187
188
0
    return 5;
189
0
}
190
191
540k
int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
192
540k
    static int nbytes[16] = {
193
540k
        0,0,0,0, 0,0,0,0,                               // 0000xxxx - 0111xxxx
194
540k
        1,1,1,1,                                        // 1000xxxx - 1011xxxx
195
540k
        2,2,                                            // 1100xxxx - 1101xxxx
196
540k
        3,                                              // 1110xxxx
197
540k
        4,                                              // 1111xxxx
198
540k
    };
199
200
540k
    static int nbits[16] = {
201
540k
        0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // 0000xxxx - 0111xxxx
202
540k
        0x3f, 0x3f, 0x3f, 0x3f,                         // 1000xxxx - 1011xxxx
203
540k
        0x1f, 0x1f,                                     // 1100xxxx - 1101xxxx
204
540k
        0x0f,                                           // 1110xxxx
205
540k
        0x0f,                                           // 1111xxxx
206
540k
    };
207
540k
    unsigned char c[5];
208
209
540k
    int32_t val = hgetc(fd->fp);
210
540k
    if (val == -1)
211
23
        return -1;
212
213
540k
    c[0]=val;
214
215
540k
    int i = nbytes[val>>4];
216
540k
    val &= nbits[val>>4];
217
218
540k
    if (i > 0) {
219
130k
        if (hread(fd->fp, &c[1], i) < i)
220
3
            return -1;
221
130k
    }
222
223
540k
    switch(i) {
224
410k
    case 0:
225
410k
        *val_p = val;
226
410k
        *crc = crc32(*crc, c, 1);
227
410k
        return 1;
228
229
64.5k
    case 1:
230
64.5k
        val = (val<<8) | c[1];
231
64.5k
        *val_p = val;
232
64.5k
        *crc = crc32(*crc, c, 2);
233
64.5k
        return 2;
234
235
31.7k
    case 2:
236
31.7k
        val = (val<<8) | c[1];
237
31.7k
        val = (val<<8) | c[2];
238
31.7k
        *val_p = val;
239
31.7k
        *crc = crc32(*crc, c, 3);
240
31.7k
        return 3;
241
242
16.6k
    case 3:
243
16.6k
        val = (val<<8) | c[1];
244
16.6k
        val = (val<<8) | c[2];
245
16.6k
        val = (val<<8) | c[3];
246
16.6k
        *val_p = val;
247
16.6k
        *crc = crc32(*crc, c, 4);
248
16.6k
        return 4;
249
250
17.4k
    case 4: // really 3.5 more, why make it different?
251
17.4k
        {
252
17.4k
            uint32_t uv = val;
253
17.4k
            uv = (uv<<8) |  c[1];
254
17.4k
            uv = (uv<<8) |  c[2];
255
17.4k
            uv = (uv<<8) |  c[3];
256
17.4k
            uv = (uv<<4) | (c[4] & 0x0f);
257
            // Avoid implementation-defined behaviour on negative values
258
17.4k
            *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1;
259
17.4k
            *crc = crc32(*crc, c, 5);
260
17.4k
        }
261
540k
    }
262
263
17.4k
    return 5;
264
540k
}
265
266
/*
267
 * Stores a value to memory in ITF-8 format.
268
 *
269
 * Returns the number of bytes required to store the number.
270
 * This is a maximum of 5 bytes.
271
 */
272
0
static inline int itf8_put(char *cp, int32_t val) {
273
0
    unsigned char *up = (unsigned char *)cp;
274
0
    if        (!(val & ~0x00000007f)) { // 1 byte
275
0
        *up = val;
276
0
        return 1;
277
0
    } else if (!(val & ~0x00003fff)) { // 2 byte
278
0
        *up++ = (val >> 8 ) | 0x80;
279
0
        *up   = val & 0xff;
280
0
        return 2;
281
0
    } else if (!(val & ~0x01fffff)) { // 3 byte
282
0
        *up++ = (val >> 16) | 0xc0;
283
0
        *up++ = (val >> 8 ) & 0xff;
284
0
        *up   = val & 0xff;
285
0
        return 3;
286
0
    } else if (!(val & ~0x0fffffff)) { // 4 byte
287
0
        *up++ = (val >> 24) | 0xe0;
288
0
        *up++ = (val >> 16) & 0xff;
289
0
        *up++ = (val >> 8 ) & 0xff;
290
0
        *up   = val & 0xff;
291
0
        return 4;
292
0
    } else {                           // 5 byte
293
0
        *up++ = 0xf0 | ((val>>28) & 0xff);
294
0
        *up++ = (val >> 20) & 0xff;
295
0
        *up++ = (val >> 12) & 0xff;
296
0
        *up++ = (val >> 4 ) & 0xff;
297
0
        *up = val & 0x0f;
298
0
        return 5;
299
0
    }
300
0
}
301
302
303
/* 64-bit itf8 variant */
304
0
static inline int ltf8_put(char *cp, int64_t val) {
305
0
    unsigned char *up = (unsigned char *)cp;
306
0
    if        (!(val & ~((1LL<<7)-1))) {
307
0
        *up = val;
308
0
        return 1;
309
0
    } else if (!(val & ~((1LL<<(6+8))-1))) {
310
0
        *up++ = (val >> 8 ) | 0x80;
311
0
        *up   = val & 0xff;
312
0
        return 2;
313
0
    } else if (!(val & ~((1LL<<(5+2*8))-1))) {
314
0
        *up++ = (val >> 16) | 0xc0;
315
0
        *up++ = (val >> 8 ) & 0xff;
316
0
        *up   = val & 0xff;
317
0
        return 3;
318
0
    } else if (!(val & ~((1LL<<(4+3*8))-1))) {
319
0
        *up++ = (val >> 24) | 0xe0;
320
0
        *up++ = (val >> 16) & 0xff;
321
0
        *up++ = (val >> 8 ) & 0xff;
322
0
        *up   = val & 0xff;
323
0
        return 4;
324
0
    } else if (!(val & ~((1LL<<(3+4*8))-1))) {
325
0
        *up++ = (val >> 32) | 0xf0;
326
0
        *up++ = (val >> 24) & 0xff;
327
0
        *up++ = (val >> 16) & 0xff;
328
0
        *up++ = (val >> 8 ) & 0xff;
329
0
        *up   = val & 0xff;
330
0
        return 5;
331
0
    } else if (!(val & ~((1LL<<(2+5*8))-1))) {
332
0
        *up++ = (val >> 40) | 0xf8;
333
0
        *up++ = (val >> 32) & 0xff;
334
0
        *up++ = (val >> 24) & 0xff;
335
0
        *up++ = (val >> 16) & 0xff;
336
0
        *up++ = (val >> 8 ) & 0xff;
337
0
        *up   = val & 0xff;
338
0
        return 6;
339
0
    } else if (!(val & ~((1LL<<(1+6*8))-1))) {
340
0
        *up++ = (val >> 48) | 0xfc;
341
0
        *up++ = (val >> 40) & 0xff;
342
0
        *up++ = (val >> 32) & 0xff;
343
0
        *up++ = (val >> 24) & 0xff;
344
0
        *up++ = (val >> 16) & 0xff;
345
0
        *up++ = (val >> 8 ) & 0xff;
346
0
        *up   = val & 0xff;
347
0
        return 7;
348
0
    } else if (!(val & ~((1LL<<(7*8))-1))) {
349
0
        *up++ = (val >> 56) | 0xfe;
350
0
        *up++ = (val >> 48) & 0xff;
351
0
        *up++ = (val >> 40) & 0xff;
352
0
        *up++ = (val >> 32) & 0xff;
353
0
        *up++ = (val >> 24) & 0xff;
354
0
        *up++ = (val >> 16) & 0xff;
355
0
        *up++ = (val >> 8 ) & 0xff;
356
0
        *up   = val & 0xff;
357
0
        return 8;
358
0
    } else {
359
0
        *up++ = 0xff;
360
0
        *up++ = (val >> 56) & 0xff;
361
0
        *up++ = (val >> 48) & 0xff;
362
0
        *up++ = (val >> 40) & 0xff;
363
0
        *up++ = (val >> 32) & 0xff;
364
0
        *up++ = (val >> 24) & 0xff;
365
0
        *up++ = (val >> 16) & 0xff;
366
0
        *up++ = (val >> 8 ) & 0xff;
367
0
        *up   = val & 0xff;
368
0
        return 9;
369
0
    }
370
0
}
371
372
/*
373
 * Encodes and writes a single integer in ITF-8 format.
374
 * Returns 0 on success
375
 *        -1 on failure
376
 */
377
0
int itf8_encode(cram_fd *fd, int32_t val) {
378
0
    char buf[5];
379
0
    int len = itf8_put(buf, val);
380
0
    return hwrite(fd->fp, buf, len) == len ? 0 : -1;
381
0
}
382
383
const int itf8_bytes[16] = {
384
    1, 1, 1, 1,  1, 1, 1, 1,
385
    2, 2, 2, 2,  3, 3, 4, 5
386
};
387
388
const int ltf8_bytes[256] = {
389
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
390
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
391
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
392
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
393
394
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
395
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
396
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
397
    1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,
398
399
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
400
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
401
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
402
    2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,  2, 2, 2, 2,
403
404
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
405
    3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,  3, 3, 3, 3,
406
407
    4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
408
409
    5, 5, 5, 5,  5, 5, 5, 5,  6, 6, 6, 6,  7, 7, 8, 9
410
};
411
412
/*
413
 * LEGACY: consider using ltf8_decode_crc.
414
 */
415
0
int ltf8_decode(cram_fd *fd, int64_t *val_p) {
416
0
    int c = hgetc(fd->fp);
417
0
    int64_t val = (unsigned char)c;
418
0
    if (c == -1)
419
0
        return -1;
420
421
0
    if (val < 0x80) {
422
0
        *val_p =   val;
423
0
        return 1;
424
425
0
    } else if (val < 0xc0) {
426
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
427
0
        *val_p = val & (((1LL<<(6+8)))-1);
428
0
        return 2;
429
430
0
    } else if (val < 0xe0) {
431
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
432
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
433
0
        *val_p = val & ((1LL<<(5+2*8))-1);
434
0
        return 3;
435
436
0
    } else if (val < 0xf0) {
437
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
438
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
439
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
440
0
        *val_p = val & ((1LL<<(4+3*8))-1);
441
0
        return 4;
442
443
0
    } else if (val < 0xf8) {
444
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
445
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
446
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
447
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
448
0
        *val_p = val & ((1LL<<(3+4*8))-1);
449
0
        return 5;
450
451
0
    } else if (val < 0xfc) {
452
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
453
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
454
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
455
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
456
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
457
0
        *val_p = val & ((1LL<<(2+5*8))-1);
458
0
        return 6;
459
460
0
    } else if (val < 0xfe) {
461
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
462
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
463
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
464
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
465
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
466
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
467
0
        *val_p = val & ((1LL<<(1+6*8))-1);
468
0
        return 7;
469
470
0
    } else if (val < 0xff) {
471
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
472
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
473
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
474
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
475
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
476
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
477
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
478
0
        *val_p = val & ((1LL<<(7*8))-1);
479
0
        return 8;
480
481
0
    } else {
482
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
483
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
484
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
485
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
486
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
487
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
488
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
489
0
        val = (val<<8) | (unsigned char)hgetc(fd->fp);
490
0
        *val_p = val;
491
0
    }
492
493
0
    return 9;
494
0
}
495
496
1.31k
int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
497
1.31k
    unsigned char c[9];
498
1.31k
    int64_t val = hgetc(fd->fp);
499
1.31k
    if (val < 0)
500
0
        return -1;
501
502
1.31k
    c[0] = val;
503
504
1.31k
    if (val < 0x80) {
505
1.03k
        *val_p =   val;
506
1.03k
        *crc = crc32(*crc, c, 1);
507
1.03k
        return 1;
508
509
1.03k
    } else if (val < 0xc0) {
510
55
        int v = hgetc(fd->fp);
511
55
        if (v < 0)
512
0
            return -1;
513
55
        val = (val<<8) | (c[1]=v);
514
55
        *val_p = val & (((1LL<<(6+8)))-1);
515
55
        *crc = crc32(*crc, c, 2);
516
55
        return 2;
517
518
224
    } else if (val < 0xe0) {
519
50
        if (hread(fd->fp, &c[1], 2) < 2)
520
0
            return -1;
521
50
        val = (val<<8) | c[1];
522
50
        val = (val<<8) | c[2];
523
50
        *val_p = val & ((1LL<<(5+2*8))-1);
524
50
        *crc = crc32(*crc, c, 3);
525
50
        return 3;
526
527
174
    } else if (val < 0xf0) {
528
20
        if (hread(fd->fp, &c[1], 3) < 3)
529
0
            return -1;
530
20
        val = (val<<8) | c[1];
531
20
        val = (val<<8) | c[2];
532
20
        val = (val<<8) | c[3];
533
20
        *val_p = val & ((1LL<<(4+3*8))-1);
534
20
        *crc = crc32(*crc, c, 4);
535
20
        return 4;
536
537
154
    } else if (val < 0xf8) {
538
1
        if (hread(fd->fp, &c[1], 4) < 4)
539
0
            return -1;
540
1
        val = (val<<8) | c[1];
541
1
        val = (val<<8) | c[2];
542
1
        val = (val<<8) | c[3];
543
1
        val = (val<<8) | c[4];
544
1
        *val_p = val & ((1LL<<(3+4*8))-1);
545
1
        *crc = crc32(*crc, c, 5);
546
1
        return 5;
547
548
153
    } else if (val < 0xfc) {
549
79
        if (hread(fd->fp, &c[1], 5) < 5)
550
0
            return -1;
551
79
        val = (val<<8) | c[1];
552
79
        val = (val<<8) | c[2];
553
79
        val = (val<<8) | c[3];
554
79
        val = (val<<8) | c[4];
555
79
        val = (val<<8) | c[5];
556
79
        *val_p = val & ((1LL<<(2+5*8))-1);
557
79
        *crc = crc32(*crc, c, 6);
558
79
        return 6;
559
560
79
    } else if (val < 0xfe) {
561
32
        if (hread(fd->fp, &c[1], 6) < 6)
562
0
            return -1;
563
32
        val = (val<<8) | c[1];
564
32
        val = (val<<8) | c[2];
565
32
        val = (val<<8) | c[3];
566
32
        val = (val<<8) | c[4];
567
32
        val = (val<<8) | c[5];
568
32
        val = (val<<8) | c[6];
569
32
        *val_p = val & ((1LL<<(1+6*8))-1);
570
32
        *crc = crc32(*crc, c, 7);
571
32
        return 7;
572
573
42
    } else if (val < 0xff) {
574
0
        uint64_t uval = val;
575
0
        if (hread(fd->fp, &c[1], 7) < 7)
576
0
            return -1;
577
0
        uval = (uval<<8) | c[1];
578
0
        uval = (uval<<8) | c[2];
579
0
        uval = (uval<<8) | c[3];
580
0
        uval = (uval<<8) | c[4];
581
0
        uval = (uval<<8) | c[5];
582
0
        uval = (uval<<8) | c[6];
583
0
        uval = (uval<<8) | c[7];
584
0
        *val_p = uval & ((1ULL<<(7*8))-1);
585
0
        *crc = crc32(*crc, c, 8);
586
0
        return 8;
587
588
42
    } else {
589
42
        uint64_t uval;
590
42
        if (hread(fd->fp, &c[1], 8) < 8)
591
0
            return -1;
592
42
        uval =             c[1];
593
42
        uval = (uval<<8) | c[2];
594
42
        uval = (uval<<8) | c[3];
595
42
        uval = (uval<<8) | c[4];
596
42
        uval = (uval<<8) | c[5];
597
42
        uval = (uval<<8) | c[6];
598
42
        uval = (uval<<8) | c[7];
599
42
        uval = (uval<<8) | c[8];
600
42
        *crc = crc32(*crc, c, 9);
601
        // Avoid implementation-defined behaviour on negative values
602
42
        *val_p = c[1] < 0x80 ? (int64_t) uval : -((int64_t) (0xffffffffffffffffULL - uval)) - 1;
603
42
    }
604
605
42
    return 9;
606
1.31k
}
607
608
/*
609
 * Pushes a value in ITF8 format onto the end of a block.
610
 * This shouldn't be used for high-volume data as it is not the fastest
611
 * method.
612
 *
613
 * Returns the number of bytes written
614
 */
615
0
int itf8_put_blk(cram_block *blk, int32_t val) {
616
0
    char buf[5];
617
0
    int sz;
618
619
0
    sz = itf8_put(buf, val);
620
0
    BLOCK_APPEND(blk, buf, sz);
621
0
    return sz;
622
623
0
 block_err:
624
0
    return -1;
625
0
}
626
627
0
int ltf8_put_blk(cram_block *blk, int64_t val) {
628
0
    char buf[9];
629
0
    int sz;
630
631
0
    sz = ltf8_put(buf, val);
632
0
    BLOCK_APPEND(blk, buf, sz);
633
0
    return sz;
634
635
0
 block_err:
636
0
    return -1;
637
0
}
638
639
57.1k
static int64_t safe_itf8_get(char **cp, const char *endp, int *err) {
640
57.1k
    const unsigned char *up = (unsigned char *)*cp;
641
642
57.1k
    if (endp && endp - *cp < 5 &&
643
57.1k
        (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) {
644
337
        if (err) *err = 1;
645
337
        return 0;
646
337
    }
647
648
56.8k
    if (up[0] < 0x80) {
649
53.9k
        (*cp)++;
650
53.9k
        return up[0];
651
53.9k
    } else if (up[0] < 0xc0) {
652
1.59k
        (*cp)+=2;
653
1.59k
        return ((up[0] <<8) |  up[1])                           & 0x3fff;
654
1.59k
    } else if (up[0] < 0xe0) {
655
246
        (*cp)+=3;
656
246
        return ((up[0]<<16) | (up[1]<< 8) |  up[2])             & 0x1fffff;
657
1.08k
    } else if (up[0] < 0xf0) {
658
516
        (*cp)+=4;
659
516
        uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff;
660
516
        return (int32_t)uv;
661
565
    } else {
662
565
        (*cp)+=5;
663
565
        uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f);
664
565
        return (int32_t)uv;
665
565
    }
666
56.8k
}
667
668
409
static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) {
669
409
    unsigned char *up = (unsigned char *)*cp;
670
671
409
    if (endp && endp - *cp < 9 &&
672
409
        (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) {
673
238
        if (err) *err = 1;
674
238
        return 0;
675
238
    }
676
677
171
    if (up[0] < 0x80) {
678
141
        (*cp)++;
679
141
        return up[0];
680
141
    } else if (up[0] < 0xc0) {
681
10
        (*cp)+=2;
682
10
        return (((uint64_t)up[0]<< 8) |
683
10
                 (uint64_t)up[1]) & (((1LL<<(6+8)))-1);
684
20
    } else if (up[0] < 0xe0) {
685
0
        (*cp)+=3;
686
0
        return (((uint64_t)up[0]<<16) |
687
0
                ((uint64_t)up[1]<< 8) |
688
0
                 (uint64_t)up[2]) & ((1LL<<(5+2*8))-1);
689
20
    } else if (up[0] < 0xf0) {
690
0
        (*cp)+=4;
691
0
        return (((uint64_t)up[0]<<24) |
692
0
                ((uint64_t)up[1]<<16) |
693
0
                ((uint64_t)up[2]<< 8) |
694
0
                 (uint64_t)up[3]) & ((1LL<<(4+3*8))-1);
695
20
    } else if (up[0] < 0xf8) {
696
16
        (*cp)+=5;
697
16
        return (((uint64_t)up[0]<<32) |
698
16
                ((uint64_t)up[1]<<24) |
699
16
                ((uint64_t)up[2]<<16) |
700
16
                ((uint64_t)up[3]<< 8) |
701
16
                 (uint64_t)up[4]) & ((1LL<<(3+4*8))-1);
702
16
    } else if (up[0] < 0xfc) {
703
1
        (*cp)+=6;
704
1
        return (((uint64_t)up[0]<<40) |
705
1
                ((uint64_t)up[1]<<32) |
706
1
                ((uint64_t)up[2]<<24) |
707
1
                ((uint64_t)up[3]<<16) |
708
1
                ((uint64_t)up[4]<< 8) |
709
1
                 (uint64_t)up[5]) & ((1LL<<(2+5*8))-1);
710
3
    } else if (up[0] < 0xfe) {
711
0
        (*cp)+=7;
712
0
        return (((uint64_t)up[0]<<48) |
713
0
                ((uint64_t)up[1]<<40) |
714
0
                ((uint64_t)up[2]<<32) |
715
0
                ((uint64_t)up[3]<<24) |
716
0
                ((uint64_t)up[4]<<16) |
717
0
                ((uint64_t)up[5]<< 8) |
718
0
                 (uint64_t)up[6]) & ((1LL<<(1+6*8))-1);
719
3
    } else if (up[0] < 0xff) {
720
0
        (*cp)+=8;
721
0
        return (((uint64_t)up[1]<<48) |
722
0
                ((uint64_t)up[2]<<40) |
723
0
                ((uint64_t)up[3]<<32) |
724
0
                ((uint64_t)up[4]<<24) |
725
0
                ((uint64_t)up[5]<<16) |
726
0
                ((uint64_t)up[6]<< 8) |
727
0
                 (uint64_t)up[7]) & ((1LL<<(7*8))-1);
728
3
    } else {
729
3
        (*cp)+=9;
730
3
        return (((uint64_t)up[1]<<56) |
731
3
                ((uint64_t)up[2]<<48) |
732
3
                ((uint64_t)up[3]<<40) |
733
3
                ((uint64_t)up[4]<<32) |
734
3
                ((uint64_t)up[5]<<24) |
735
3
                ((uint64_t)up[6]<<16) |
736
3
                ((uint64_t)up[7]<< 8) |
737
3
                 (uint64_t)up[8]);
738
3
    }
739
171
}
740
741
// Wrapper for now
742
0
static int safe_itf8_put(char *cp, char *cp_end, int32_t val) {
743
0
    return itf8_put(cp, val);
744
0
}
745
746
0
static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) {
747
0
    return ltf8_put(cp, val);
748
0
}
749
750
5.81k
static int itf8_size(int64_t v) {
751
5.81k
    return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5);
752
5.81k
}
753
754
//-----------------------------------------------------------------------------
755
756
// CRAM v4.0 onwards uses a different variable sized integer encoding
757
// that is size agnostic.
758
759
// Local interface to varint.h inline version, so we can use in func ptr.
760
// Note a lot of these use the unsigned interface but take signed int64_t.
761
// This is because the old CRAM ITF8 inteface had signed -1 as unsigned
762
// 0xffffffff.
763
0
static int uint7_size(int64_t v) {
764
0
    return var_size_u64(v);
765
0
}
766
767
0
static int64_t uint7_get_32(char **cp, const char *endp, int *err) {
768
0
    uint32_t val;
769
0
    int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
770
0
    (*cp) += nb;
771
0
    if (!nb && err) *err = 1;
772
0
    return val;
773
0
}
774
775
0
static int64_t sint7_get_32(char **cp, const char *endp, int *err) {
776
0
    int32_t val;
777
0
    int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val);
778
0
    (*cp) += nb;
779
0
    if (!nb && err) *err = 1;
780
0
    return val;
781
0
}
782
783
0
static int64_t uint7_get_64(char **cp, const char *endp, int *err) {
784
0
    uint64_t val;
785
0
    int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
786
0
    (*cp) += nb;
787
0
    if (!nb && err) *err = 1;
788
0
    return val;
789
0
}
790
791
0
static int64_t sint7_get_64(char **cp, const char *endp, int *err) {
792
0
    int64_t val;
793
0
    int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val);
794
0
    (*cp) += nb;
795
0
    if (!nb && err) *err = 1;
796
0
    return val;
797
0
}
798
799
0
static int uint7_put_32(char *cp, char *endp, int32_t val) {
800
0
    return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val);
801
0
}
802
803
0
static int sint7_put_32(char *cp, char *endp, int32_t val) {
804
0
    return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val);
805
0
}
806
807
0
static int uint7_put_64(char *cp, char *endp, int64_t val) {
808
0
    return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val);
809
0
}
810
811
0
static int sint7_put_64(char *cp, char *endp, int64_t val) {
812
0
    return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val);
813
0
}
814
815
// Put direct to to cram_block
816
0
static int uint7_put_blk_32(cram_block *blk, int32_t v) {
817
0
    uint8_t buf[10];
818
0
    int sz = var_put_u32(buf, buf+10, v);
819
0
    BLOCK_APPEND(blk, buf, sz);
820
0
    return sz;
821
822
0
 block_err:
823
0
    return -1;
824
0
}
825
826
0
static int sint7_put_blk_32(cram_block *blk, int32_t v) {
827
0
    uint8_t buf[10];
828
0
    int sz = var_put_s32(buf, buf+10, v);
829
0
    BLOCK_APPEND(blk, buf, sz);
830
0
    return sz;
831
832
0
 block_err:
833
0
    return -1;
834
0
}
835
836
0
static int uint7_put_blk_64(cram_block *blk, int64_t v) {
837
0
    uint8_t buf[10];
838
0
    int sz = var_put_u64(buf, buf+10, v);
839
0
    BLOCK_APPEND(blk, buf, sz);
840
0
    return sz;
841
842
0
 block_err:
843
0
    return -1;
844
0
}
845
846
0
static int sint7_put_blk_64(cram_block *blk, int64_t v) {
847
0
    uint8_t buf[10];
848
0
    int sz = var_put_s64(buf, buf+10, v);
849
0
    BLOCK_APPEND(blk, buf, sz);
850
0
    return sz;
851
852
0
 block_err:
853
0
    return -1;
854
0
}
855
856
// Decode 32-bits with CRC update from cram_fd
857
5.16k
static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
858
5.16k
    uint8_t b[5], i = 0;
859
5.16k
    int c;
860
5.16k
    uint32_t v = 0;
861
862
#ifdef VARINT2
863
    b[0] = hgetc(fd->fp);
864
    if (b[0] < 177) {
865
    } else if (b[0] < 241) {
866
        b[1] = hgetc(fd->fp);
867
    } else if (b[0] < 249) {
868
        b[1] = hgetc(fd->fp);
869
        b[2] = hgetc(fd->fp);
870
    } else {
871
        int n = b[0]+2, z = 1;
872
        while (n-- >= 249)
873
            b[z++] = hgetc(fd->fp);
874
    }
875
    i = var_get_u32(b, NULL, &v);
876
#else
877
//    // Little endian
878
//    int s = 0;
879
//    do {
880
//        b[i++] = c = hgetc(fd->fp);
881
//        if (c < 0)
882
//            return -1;
883
//        v |= (c & 0x7f) << s;
884
//      s += 7;
885
//    } while (i < 5 && (c & 0x80));
886
887
    // Big endian, see also htscodecs/varint.h
888
10.3k
    do {
889
10.3k
        b[i++] = c = hgetc(fd->fp);
890
10.3k
        if (c < 0)
891
1
            return -1;
892
10.3k
        v = (v<<7) | (c & 0x7f);
893
10.3k
    } while (i < 5 && (c & 0x80));
894
5.16k
#endif
895
5.16k
    *crc = crc32(*crc, b, i);
896
897
5.16k
    *val_p = v;
898
5.16k
    return i;
899
5.16k
}
900
901
// Decode 32-bits with CRC update from cram_fd
902
3
static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) {
903
3
    uint8_t b[5], i = 0;
904
3
    int c;
905
3
    uint32_t v = 0;
906
907
#ifdef VARINT2
908
    b[0] = hgetc(fd->fp);
909
    if (b[0] < 177) {
910
    } else if (b[0] < 241) {
911
        b[1] = hgetc(fd->fp);
912
    } else if (b[0] < 249) {
913
        b[1] = hgetc(fd->fp);
914
        b[2] = hgetc(fd->fp);
915
    } else {
916
        int n = b[0]+2, z = 1;
917
        while (n-- >= 249)
918
            b[z++] = hgetc(fd->fp);
919
    }
920
    i = var_get_u32(b, NULL, &v);
921
#else
922
//    // Little endian
923
//    int s = 0;
924
//    do {
925
//        b[i++] = c = hgetc(fd->fp);
926
//        if (c < 0)
927
//            return -1;
928
//        v |= (c & 0x7f) << s;
929
//      s += 7;
930
//    } while (i < 5 && (c & 0x80));
931
932
    // Big endian, see also htscodecs/varint.h
933
7
    do {
934
7
        b[i++] = c = hgetc(fd->fp);
935
7
        if (c < 0)
936
0
            return -1;
937
7
        v = (v<<7) | (c & 0x7f);
938
7
    } while (i < 5 && (c & 0x80));
939
3
#endif
940
3
    *crc = crc32(*crc, b, i);
941
942
3
    *val_p = (v>>1) ^ -(v&1);
943
3
    return i;
944
3
}
945
946
947
// Decode 64-bits with CRC update from cram_fd
948
12
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) {
949
12
    uint8_t b[10], i = 0;
950
12
    int c;
951
12
    uint64_t v = 0;
952
953
#ifdef VARINT2
954
    b[0] = hgetc(fd->fp);
955
    if (b[0] < 177) {
956
    } else if (b[0] < 241) {
957
        b[1] = hgetc(fd->fp);
958
    } else if (b[0] < 249) {
959
        b[1] = hgetc(fd->fp);
960
        b[2] = hgetc(fd->fp);
961
    } else {
962
        int n = b[0]+2, z = 1;
963
        while (n-- >= 249)
964
            b[z++] = hgetc(fd->fp);
965
    }
966
    i = var_get_u64(b, NULL, &v);
967
#else
968
//    // Little endian
969
//    int s = 0;
970
//    do {
971
//        b[i++] = c = hgetc(fd->fp);
972
//        if (c < 0)
973
//            return -1;
974
//        v |= (c & 0x7f) << s;
975
//      s += 7;
976
//    } while (i < 10 && (c & 0x80));
977
978
    // Big endian, see also htscodecs/varint.h
979
20
    do {
980
20
        b[i++] = c = hgetc(fd->fp);
981
20
        if (c < 0)
982
0
            return -1;
983
20
        v = (v<<7) | (c & 0x7f);
984
20
    } while (i < 5 && (c & 0x80));
985
12
#endif
986
12
    *crc = crc32(*crc, b, i);
987
988
12
    *val_p = v;
989
12
    return i;
990
12
}
991
992
//-----------------------------------------------------------------------------
993
994
/*
995
 * Decodes a 32-bit little endian value from fd and stores in val.
996
 *
997
 * Returns the number of bytes read on success
998
 *         -1 on failure
999
 */
1000
1.56k
static int int32_decode(cram_fd *fd, int32_t *val) {
1001
1.56k
    int32_t i;
1002
1.56k
    if (4 != hread(fd->fp, &i, 4))
1003
0
        return -1;
1004
1005
1.56k
    *val = le_int4(i);
1006
1.56k
    return 4;
1007
1.56k
}
1008
1009
/*
1010
 * Encodes a 32-bit little endian value 'val' and writes to fd.
1011
 *
1012
 * Returns the number of bytes written on success
1013
 *         -1 on failure
1014
 */
1015
0
static int int32_encode(cram_fd *fd, int32_t val) {
1016
0
    uint32_t v = le_int4(val);
1017
0
    if (4 != hwrite(fd->fp, &v, 4))
1018
0
        return -1;
1019
1020
0
    return 4;
1021
0
}
1022
1023
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1024
15
int int32_get_blk(cram_block *b, int32_t *val) {
1025
15
    if (b->uncomp_size - BLOCK_SIZE(b) < 4)
1026
1
        return -1;
1027
1028
14
    uint32_t v =
1029
14
         ((uint32_t) b->data[b->byte  ])        |
1030
14
        (((uint32_t) b->data[b->byte+1]) <<  8) |
1031
14
        (((uint32_t) b->data[b->byte+2]) << 16) |
1032
14
        (((uint32_t) b->data[b->byte+3]) << 24);
1033
    // Avoid implementation-defined behaviour on negative values
1034
14
    *val = v < 0x80000000U ? (int32_t) v : -((int32_t) (0xffffffffU - v)) - 1;
1035
14
    BLOCK_SIZE(b) += 4;
1036
14
    return 4;
1037
15
}
1038
1039
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
1040
0
int int32_put_blk(cram_block *b, int32_t val) {
1041
0
    unsigned char cp[4];
1042
0
    uint32_t v = val;
1043
0
    cp[0] = ( v      & 0xff);
1044
0
    cp[1] = ((v>>8)  & 0xff);
1045
0
    cp[2] = ((v>>16) & 0xff);
1046
0
    cp[3] = ((v>>24) & 0xff);
1047
1048
0
    BLOCK_APPEND(b, cp, 4);
1049
0
    return 0;
1050
1051
0
 block_err:
1052
0
    return -1;
1053
0
}
1054
1055
#ifdef HAVE_LIBDEFLATE
1056
/* ----------------------------------------------------------------------
1057
 * libdeflate compression code, with interface to match
1058
 * zlib_mem_{in,de}flate for simplicity elsewhere.
1059
 */
1060
1061
// Named the same as the version that uses zlib as we always use libdeflate for
1062
// decompression when available.
1063
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1064
    struct libdeflate_decompressor *z = libdeflate_alloc_decompressor();
1065
    if (!z) {
1066
        hts_log_error("Call to libdeflate_alloc_decompressor failed");
1067
        return NULL;
1068
    }
1069
1070
    uint8_t *data = NULL, *new_data;
1071
    if (!*size)
1072
        *size = csize*2;
1073
    for(;;) {
1074
        new_data = realloc(data, *size);
1075
        if (!new_data) {
1076
            hts_log_error("Memory allocation failure");
1077
            goto fail;
1078
        }
1079
        data = new_data;
1080
1081
        int ret = libdeflate_gzip_decompress(z, cdata, csize, data, *size, size);
1082
1083
        // Auto grow output buffer size if needed and try again.
1084
        // Fortunately for all bar one call of this we know the size already.
1085
        if (ret == LIBDEFLATE_INSUFFICIENT_SPACE) {
1086
            (*size) *= 1.5;
1087
            continue;
1088
        }
1089
1090
        if (ret != LIBDEFLATE_SUCCESS) {
1091
            hts_log_error("Inflate operation failed: %d", ret);
1092
            goto fail;
1093
        } else {
1094
            break;
1095
        }
1096
    }
1097
1098
    libdeflate_free_decompressor(z);
1099
    return (char *)data;
1100
1101
 fail:
1102
    libdeflate_free_decompressor(z);
1103
    free(data);
1104
    return NULL;
1105
}
1106
1107
// Named differently as we use both zlib/libdeflate for compression.
1108
static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size,
1109
                                int level, int strat) {
1110
    level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default
1111
    level *= 1.23;     // NB levels go up to 12 here; 5 onwards is +1
1112
    level += level>=8; // 5,6,7->6,7,8  8->10  9->12
1113
    if (level > 12) level = 12;
1114
1115
    if (strat == Z_RLE) // not supported by libdeflate
1116
        level = 1;
1117
1118
    struct libdeflate_compressor *z = libdeflate_alloc_compressor(level);
1119
    if (!z) {
1120
        hts_log_error("Call to libdeflate_alloc_compressor failed");
1121
        return NULL;
1122
    }
1123
1124
    unsigned char *cdata = NULL; /* Compressed output */
1125
    size_t cdata_alloc;
1126
    cdata = malloc(cdata_alloc = size*1.05+100);
1127
    if (!cdata) {
1128
        hts_log_error("Memory allocation failure");
1129
        libdeflate_free_compressor(z);
1130
        return NULL;
1131
    }
1132
1133
    *cdata_size = libdeflate_gzip_compress(z, data, size, cdata, cdata_alloc);
1134
    libdeflate_free_compressor(z);
1135
1136
    if (*cdata_size == 0) {
1137
        hts_log_error("Call to libdeflate_gzip_compress failed");
1138
        free(cdata);
1139
        return NULL;
1140
    }
1141
1142
    return (char *)cdata;
1143
}
1144
1145
#else
1146
1147
/* ----------------------------------------------------------------------
1148
 * zlib compression code - from Gap5's tg_iface_g.c
1149
 * They're static here as they're only used within the cram_compress_block
1150
 * and cram_uncompress_block functions, which are the external interface.
1151
 */
1152
0
char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) {
1153
0
    z_stream s;
1154
0
    unsigned char *data = NULL; /* Uncompressed output */
1155
0
    int data_alloc = 0;
1156
0
    int err;
1157
1158
    /* Starting point at uncompressed size, and scale after that */
1159
0
    data = malloc(data_alloc = csize*1.2+100);
1160
0
    if (!data)
1161
0
        return NULL;
1162
1163
    /* Initialise zlib stream */
1164
0
    s.zalloc = Z_NULL; /* use default allocation functions */
1165
0
    s.zfree  = Z_NULL;
1166
0
    s.opaque = Z_NULL;
1167
0
    s.next_in  = (unsigned char *)cdata;
1168
0
    s.avail_in = csize;
1169
0
    s.total_in = 0;
1170
0
    s.next_out  = data;
1171
0
    s.avail_out = data_alloc;
1172
0
    s.total_out = 0;
1173
1174
    //err = inflateInit(&s);
1175
0
    err = inflateInit2(&s, 15 + 32);
1176
0
    if (err != Z_OK) {
1177
0
        hts_log_error("Call to zlib inflateInit failed: %s", s.msg);
1178
0
        free(data);
1179
0
        return NULL;
1180
0
    }
1181
1182
    /* Decode to 'data' array */
1183
0
    for (;s.avail_in;) {
1184
0
        unsigned char *data_tmp;
1185
0
        int alloc_inc;
1186
1187
0
        s.next_out = &data[s.total_out];
1188
0
        err = inflate(&s, Z_NO_FLUSH);
1189
0
        if (err == Z_STREAM_END)
1190
0
            break;
1191
1192
0
        if (err != Z_OK) {
1193
0
            hts_log_error("Call to zlib inflate failed: %s", s.msg);
1194
0
            free(data);
1195
0
            inflateEnd(&s);
1196
0
            return NULL;
1197
0
        }
1198
1199
        /* More to come, so realloc based on growth so far */
1200
0
        alloc_inc = (double)s.avail_in/s.total_in * s.total_out + 100;
1201
0
        data = realloc((data_tmp = data), data_alloc += alloc_inc);
1202
0
        if (!data) {
1203
0
            free(data_tmp);
1204
0
            inflateEnd(&s);
1205
0
            return NULL;
1206
0
        }
1207
0
        s.avail_out += alloc_inc;
1208
0
    }
1209
0
    inflateEnd(&s);
1210
1211
0
    *size = s.total_out;
1212
0
    return (char *)data;
1213
0
}
1214
#endif
1215
1216
#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR ==  1 && LIBDEFLATE_VERSION_MINOR <= 8)
1217
static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size,
1218
0
                              int level, int strat) {
1219
0
    z_stream s;
1220
0
    unsigned char *cdata = NULL; /* Compressed output */
1221
0
    int cdata_alloc = 0;
1222
0
    int cdata_pos = 0;
1223
0
    int err;
1224
1225
0
    cdata = malloc(cdata_alloc = size*1.05+100);
1226
0
    if (!cdata)
1227
0
        return NULL;
1228
0
    cdata_pos = 0;
1229
1230
    /* Initialise zlib stream */
1231
0
    s.zalloc = Z_NULL; /* use default allocation functions */
1232
0
    s.zfree  = Z_NULL;
1233
0
    s.opaque = Z_NULL;
1234
0
    s.next_in  = (unsigned char *)data;
1235
0
    s.avail_in = size;
1236
0
    s.total_in = 0;
1237
0
    s.next_out  = cdata;
1238
0
    s.avail_out = cdata_alloc;
1239
0
    s.total_out = 0;
1240
0
    s.data_type = Z_BINARY;
1241
1242
0
    err = deflateInit2(&s, level, Z_DEFLATED, 15|16, 9, strat);
1243
0
    if (err != Z_OK) {
1244
0
        hts_log_error("Call to zlib deflateInit2 failed: %s", s.msg);
1245
0
        return NULL;
1246
0
    }
1247
1248
    /* Encode to 'cdata' array */
1249
0
    for (;s.avail_in;) {
1250
0
        s.next_out = &cdata[cdata_pos];
1251
0
        s.avail_out = cdata_alloc - cdata_pos;
1252
0
        if (cdata_alloc - cdata_pos <= 0) {
1253
0
            hts_log_error("Deflate produced larger output than expected");
1254
0
            return NULL;
1255
0
        }
1256
0
        err = deflate(&s, Z_NO_FLUSH);
1257
0
        cdata_pos = cdata_alloc - s.avail_out;
1258
0
        if (err != Z_OK) {
1259
0
            hts_log_error("Call to zlib deflate failed: %s", s.msg);
1260
0
            break;
1261
0
        }
1262
0
    }
1263
0
    if (deflate(&s, Z_FINISH) != Z_STREAM_END) {
1264
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1265
0
    }
1266
0
    *cdata_size = s.total_out;
1267
1268
0
    if (deflateEnd(&s) != Z_OK) {
1269
0
        hts_log_error("Call to zlib deflate failed: %s", s.msg);
1270
0
    }
1271
0
    return (char *)cdata;
1272
0
}
1273
#endif
1274
1275
#ifdef HAVE_LIBLZMA
1276
/* ------------------------------------------------------------------------ */
1277
/*
1278
 * Data compression routines using liblzma (xz)
1279
 *
1280
 * On a test set this shrunk the main db from 136157104 bytes to 114796168, but
1281
 * caused tg_index to grow from 2m43.707s to 15m3.961s. Exporting as bfastq
1282
 * went from 18.3s to 36.3s. So decompression suffers too, but not as bad
1283
 * as compression times.
1284
 *
1285
 * For now we disable this functionality. If it's to be reenabled make sure you
1286
 * improve the mem_inflate implementation as it's just a test hack at the
1287
 * moment.
1288
 */
1289
1290
static char *lzma_mem_deflate(char *data, size_t size, size_t *cdata_size,
1291
0
                              int level) {
1292
0
    char *out;
1293
0
    size_t out_size = lzma_stream_buffer_bound(size);
1294
0
    *cdata_size = 0;
1295
1296
0
    out = malloc(out_size);
1297
1298
    /* Single call compression */
1299
0
    if (LZMA_OK != lzma_easy_buffer_encode(level, LZMA_CHECK_CRC32, NULL,
1300
0
                                           (uint8_t *)data, size,
1301
0
                                           (uint8_t *)out, cdata_size,
1302
0
                                           out_size))
1303
0
        return NULL;
1304
1305
0
    return out;
1306
0
}
1307
1308
0
static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) {
1309
0
    lzma_stream strm = LZMA_STREAM_INIT;
1310
0
    size_t out_size = 0, out_pos = 0;
1311
0
    char *out = NULL, *new_out;
1312
0
    int r;
1313
1314
    /* Initiate the decoder */
1315
0
    if (LZMA_OK != lzma_stream_decoder(&strm, lzma_easy_decoder_memusage(9), 0))
1316
0
        return NULL;
1317
1318
    /* Decode loop */
1319
0
    strm.avail_in = csize;
1320
0
    strm.next_in = (uint8_t *)cdata;
1321
1322
0
    for (;strm.avail_in;) {
1323
0
        if (strm.avail_in > out_size - out_pos) {
1324
0
            out_size += strm.avail_in * 4 + 32768;
1325
0
            new_out = realloc(out, out_size);
1326
0
            if (!new_out)
1327
0
                goto fail;
1328
0
            out = new_out;
1329
0
        }
1330
0
        strm.avail_out = out_size - out_pos;
1331
0
        strm.next_out = (uint8_t *)&out[out_pos];
1332
1333
0
        r = lzma_code(&strm, LZMA_RUN);
1334
0
        if (LZMA_OK != r && LZMA_STREAM_END != r) {
1335
0
            hts_log_error("LZMA decode failure (error %d)", r);
1336
0
            goto fail;
1337
0
        }
1338
1339
0
        out_pos = strm.total_out;
1340
1341
0
        if (r == LZMA_STREAM_END)
1342
0
            break;
1343
0
    }
1344
1345
    /* finish up any unflushed data; necessary? */
1346
0
    r = lzma_code(&strm, LZMA_FINISH);
1347
0
    if (r != LZMA_OK && r != LZMA_STREAM_END) {
1348
0
        hts_log_error("Call to lzma_code failed with error %d", r);
1349
0
        goto fail;
1350
0
    }
1351
1352
0
    new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1);
1353
0
    if (new_out)
1354
0
        out = new_out;
1355
0
    *size = strm.total_out;
1356
1357
0
    lzma_end(&strm);
1358
1359
0
    return out;
1360
1361
0
 fail:
1362
0
    lzma_end(&strm);
1363
0
    free(out);
1364
0
    return NULL;
1365
0
}
1366
#endif
1367
1368
/* ----------------------------------------------------------------------
1369
 * CRAM blocks - the dynamically growable data block. We have code to
1370
 * create, update, (un)compress and read/write.
1371
 *
1372
 * These are derived from the deflate_interlaced.c blocks, but with the
1373
 * CRAM extension of content types and IDs.
1374
 */
1375
1376
/*
1377
 * Allocates a new cram_block structure with a specified content_type and
1378
 * id.
1379
 *
1380
 * Returns block pointer on success
1381
 *         NULL on failure
1382
 */
1383
cram_block *cram_new_block(enum cram_content_type content_type,
1384
633
                           int content_id) {
1385
633
    cram_block *b = malloc(sizeof(*b));
1386
633
    if (!b)
1387
0
        return NULL;
1388
633
    b->method = b->orig_method = RAW;
1389
633
    b->content_type = content_type;
1390
633
    b->content_id = content_id;
1391
633
    b->comp_size = 0;
1392
633
    b->uncomp_size = 0;
1393
633
    b->data = NULL;
1394
633
    b->alloc = 0;
1395
633
    b->byte = 0;
1396
633
    b->bit = 7; // MSB
1397
633
    b->crc32 = 0;
1398
633
    b->idx = 0;
1399
633
    b->m = NULL;
1400
1401
633
    return b;
1402
633
}
1403
1404
/*
1405
 * Reads a block from a cram file.
1406
 * Returns cram_block pointer on success.
1407
 *         NULL on failure
1408
 */
1409
3.33k
cram_block *cram_read_block(cram_fd *fd) {
1410
3.33k
    cram_block *b = malloc(sizeof(*b));
1411
3.33k
    unsigned char c;
1412
3.33k
    uint32_t crc = 0;
1413
3.33k
    if (!b)
1414
0
        return NULL;
1415
1416
    //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp));
1417
1418
3.33k
    if (-1 == (b->method      = hgetc(fd->fp))) { free(b); return NULL; }
1419
3.33k
    c = b->method; crc = crc32(crc, &c, 1);
1420
3.33k
    if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; }
1421
3.33k
    c = b->content_type; crc = crc32(crc, &c, 1);
1422
3.33k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc))  { free(b); return NULL; }
1423
3.33k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc))   { free(b); return NULL; }
1424
3.33k
    if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; }
1425
1426
    //fprintf(stderr, "  method %d, ctype %d, cid %d, csize %d, ucsize %d\n",
1427
    //      b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size);
1428
1429
3.33k
    if (b->method == RAW) {
1430
673
        if (b->uncomp_size < 0 || b->comp_size != b->uncomp_size) {
1431
5
            free(b);
1432
5
            return NULL;
1433
5
        }
1434
668
        b->alloc = b->uncomp_size;
1435
668
        if (!(b->data = malloc(b->uncomp_size))){ free(b); return NULL; }
1436
668
        if (b->uncomp_size != hread(fd->fp, b->data, b->uncomp_size)) {
1437
4
            free(b->data);
1438
4
            free(b);
1439
4
            return NULL;
1440
4
        }
1441
2.66k
    } else {
1442
2.66k
        if (b->comp_size < 0 || b->uncomp_size < 0) {
1443
3
            free(b);
1444
3
            return NULL;
1445
3
        }
1446
2.65k
        b->alloc = b->comp_size;
1447
2.65k
        if (!(b->data = malloc(b->comp_size)))  { free(b); return NULL; }
1448
2.65k
        if (b->comp_size != hread(fd->fp, b->data, b->comp_size)) {
1449
11
            free(b->data);
1450
11
            free(b);
1451
11
            return NULL;
1452
11
        }
1453
2.65k
    }
1454
1455
3.31k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1456
0
        if (-1 == int32_decode(fd, (int32_t *)&b->crc32)) {
1457
0
            free(b->data);
1458
0
            free(b);
1459
0
            return NULL;
1460
0
        }
1461
1462
0
        b->crc32_checked = fd->ignore_md5;
1463
0
        b->crc_part = crc;
1464
3.31k
    } else {
1465
3.31k
        b->crc32_checked = 1; // CRC not present
1466
3.31k
    }
1467
1468
3.31k
    b->orig_method = b->method;
1469
3.31k
    b->idx = 0;
1470
3.31k
    b->byte = 0;
1471
3.31k
    b->bit = 7; // MSB
1472
1473
3.31k
    return b;
1474
3.31k
}
1475
1476
1477
/*
1478
 * Computes the size of a cram block, including the block
1479
 * header itself.
1480
 */
1481
0
uint32_t cram_block_size(cram_block *b) {
1482
0
    unsigned char dat[100], *cp = dat;;
1483
0
    uint32_t sz;
1484
1485
0
    *cp++ = b->method;
1486
0
    *cp++ = b->content_type;
1487
0
    cp += itf8_put((char*)cp, b->content_id);
1488
0
    cp += itf8_put((char*)cp, b->comp_size);
1489
0
    cp += itf8_put((char*)cp, b->uncomp_size);
1490
1491
0
    sz = cp-dat + 4;
1492
0
    sz += b->method == RAW ? b->uncomp_size : b->comp_size;
1493
1494
0
    return sz;
1495
0
}
1496
1497
/*
1498
 * Writes a CRAM block.
1499
 * Returns 0 on success
1500
 *        -1 on failure
1501
 */
1502
0
int cram_write_block(cram_fd *fd, cram_block *b) {
1503
0
    char vardata[100];
1504
0
    int vardata_o = 0;
1505
1506
0
    assert(b->method != RAW || (b->comp_size == b->uncomp_size));
1507
1508
0
    if (hputc(b->method,       fd->fp)  == EOF) return -1;
1509
0
    if (hputc(b->content_type, fd->fp)  == EOF) return -1;
1510
0
    vardata_o += fd->vv.varint_put32(vardata          , vardata+100, b->content_id);
1511
0
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size);
1512
0
    vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size);
1513
0
    if (vardata_o != hwrite(fd->fp, vardata, vardata_o))
1514
0
        return -1;
1515
1516
0
    if (b->data) {
1517
0
        if (b->method == RAW) {
1518
0
            if (b->uncomp_size != hwrite(fd->fp, b->data, b->uncomp_size))
1519
0
                return -1;
1520
0
        } else {
1521
0
            if (b->comp_size != hwrite(fd->fp, b->data, b->comp_size))
1522
0
                return -1;
1523
0
        }
1524
0
    } else {
1525
        // Absent blocks should be size 0
1526
0
        assert(b->method == RAW && b->uncomp_size == 0);
1527
0
    }
1528
1529
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
1530
0
        char dat[100], *cp = (char *)dat;
1531
0
        uint32_t crc;
1532
1533
0
        *cp++ = b->method;
1534
0
        *cp++ = b->content_type;
1535
0
        cp += fd->vv.varint_put32(cp, dat+100, b->content_id);
1536
0
        cp += fd->vv.varint_put32(cp, dat+100, b->comp_size);
1537
0
        cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size);
1538
0
        crc = crc32(0L, (uc *)dat, cp-dat);
1539
1540
0
        if (b->method == RAW) {
1541
0
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size);
1542
0
        } else {
1543
0
            b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->comp_size);
1544
0
        }
1545
1546
0
        if (-1 == int32_encode(fd, b->crc32))
1547
0
            return -1;
1548
0
    }
1549
1550
0
    return 0;
1551
0
}
1552
1553
/*
1554
 * Frees a CRAM block, deallocating internal data too.
1555
 */
1556
4.12k
void cram_free_block(cram_block *b) {
1557
4.12k
    if (!b)
1558
180
        return;
1559
3.94k
    if (b->data)
1560
3.72k
        free(b->data);
1561
3.94k
    free(b);
1562
3.94k
}
1563
1564
/*
1565
 * Uncompresses a CRAM block, if compressed.
1566
 */
1567
1.02k
int cram_uncompress_block(cram_block *b) {
1568
1.02k
    char *uncomp;
1569
1.02k
    size_t uncomp_size = 0;
1570
1571
1.02k
    if (b->crc32_checked == 0) {
1572
0
        uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc);
1573
0
        b->crc32_checked = 1;
1574
0
        if (crc != b->crc32) {
1575
0
            hts_log_error("Block CRC32 failure");
1576
0
            return -1;
1577
0
        }
1578
0
    }
1579
1580
1.02k
    if (b->uncomp_size == 0) {
1581
        // blank block
1582
432
        b->method = RAW;
1583
432
        return 0;
1584
432
    }
1585
592
    assert(b->uncomp_size >= 0); // cram_read_block should ensure this
1586
1587
592
    switch (b->method) {
1588
10
    case RAW:
1589
10
        return 0;
1590
1591
0
    case GZIP:
1592
0
        uncomp_size = b->uncomp_size;
1593
0
        uncomp = zlib_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1594
1595
0
        if (!uncomp)
1596
0
            return -1;
1597
0
        if (uncomp_size != b->uncomp_size) {
1598
0
            free(uncomp);
1599
0
            return -1;
1600
0
        }
1601
0
        free(b->data);
1602
0
        b->data = (unsigned char *)uncomp;
1603
0
        b->alloc = uncomp_size;
1604
0
        b->method = RAW;
1605
0
        break;
1606
1607
0
#ifdef HAVE_LIBBZ2
1608
0
    case BZIP2: {
1609
0
        unsigned int usize = b->uncomp_size;
1610
0
        if (!(uncomp = malloc(usize)))
1611
0
            return -1;
1612
0
        if (BZ_OK != BZ2_bzBuffToBuffDecompress(uncomp, &usize,
1613
0
                                                (char *)b->data, b->comp_size,
1614
0
                                                0, 0)) {
1615
0
            free(uncomp);
1616
0
            return -1;
1617
0
        }
1618
0
        free(b->data);
1619
0
        b->data = (unsigned char *)uncomp;
1620
0
        b->alloc = usize;
1621
0
        b->method = RAW;
1622
0
        b->uncomp_size = usize; // Just in case it differs
1623
0
        break;
1624
0
    }
1625
#else
1626
    case BZIP2:
1627
        hts_log_error("Bzip2 compression is not compiled into this version. Please rebuild and try again");
1628
        return -1;
1629
#endif
1630
1631
0
#ifdef HAVE_LIBLZMA
1632
0
    case LZMA:
1633
0
        uncomp = lzma_mem_inflate((char *)b->data, b->comp_size, &uncomp_size);
1634
0
        if (!uncomp)
1635
0
            return -1;
1636
0
        if (uncomp_size != b->uncomp_size) {
1637
0
            free(uncomp);
1638
0
            return -1;
1639
0
        }
1640
0
        free(b->data);
1641
0
        b->data = (unsigned char *)uncomp;
1642
0
        b->alloc = uncomp_size;
1643
0
        b->method = RAW;
1644
0
        break;
1645
#else
1646
    case LZMA:
1647
        hts_log_error("Lzma compression is not compiled into this version. Please rebuild and try again");
1648
        return -1;
1649
        break;
1650
#endif
1651
1652
1
    case RANS: {
1653
1
        unsigned int usize = b->uncomp_size, usize2;
1654
1
        uncomp = (char *)rans_uncompress(b->data, b->comp_size, &usize2);
1655
1
        if (!uncomp)
1656
1
            return -1;
1657
0
        if (usize != usize2) {
1658
0
            free(uncomp);
1659
0
            return -1;
1660
0
        }
1661
0
        free(b->data);
1662
0
        b->data = (unsigned char *)uncomp;
1663
0
        b->alloc = usize2;
1664
0
        b->method = RAW;
1665
0
        b->uncomp_size = usize2; // Just in case it differs
1666
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1667
0
        break;
1668
0
    }
1669
1670
82
    case FQZ: {
1671
82
        uncomp_size = b->uncomp_size;
1672
82
        uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0);
1673
82
        if (!uncomp)
1674
18
            return -1;
1675
64
        free(b->data);
1676
64
        b->data = (unsigned char *)uncomp;
1677
64
        b->alloc = uncomp_size;
1678
64
        b->method = RAW;
1679
64
        b->uncomp_size = uncomp_size;
1680
64
        break;
1681
82
    }
1682
1683
127
    case RANS_PR0: {
1684
127
        unsigned int usize = b->uncomp_size, usize2;
1685
127
        uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2);
1686
127
        if (!uncomp)
1687
2
            return -1;
1688
125
        if (usize != usize2) {
1689
0
            free(uncomp);
1690
0
            return -1;
1691
0
        }
1692
125
        b->orig_method = RANS_PR0 + (b->data[0]&1)
1693
125
            + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1694
125
        free(b->data);
1695
125
        b->data = (unsigned char *)uncomp;
1696
125
        b->alloc = usize2;
1697
125
        b->method = RAW;
1698
125
        b->uncomp_size = usize2; // Just incase it differs
1699
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1700
125
        break;
1701
125
    }
1702
1703
245
    case ARITH_PR0: {
1704
245
        unsigned int usize = b->uncomp_size, usize2;
1705
245
        uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2);
1706
245
        if (!uncomp)
1707
0
            return -1;
1708
245
        if (usize != usize2) {
1709
0
            free(uncomp);
1710
0
            return -1;
1711
0
        }
1712
245
        b->orig_method = ARITH_PR0 + (b->data[0]&1)
1713
245
            + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0);
1714
245
        free(b->data);
1715
245
        b->data = (unsigned char *)uncomp;
1716
245
        b->alloc = usize2;
1717
245
        b->method = RAW;
1718
245
        b->uncomp_size = usize2; // Just incase it differs
1719
        //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size);
1720
245
        break;
1721
245
    }
1722
1723
126
    case TOK3: {
1724
126
        uint32_t out_len;
1725
126
        uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len);
1726
126
        if (!cp)
1727
124
            return -1;
1728
2
        b->orig_method = TOK3;
1729
2
        b->method = RAW;
1730
2
        free(b->data);
1731
2
        b->data = cp;
1732
2
        b->alloc = out_len;
1733
2
        b->uncomp_size = out_len;
1734
2
        break;
1735
126
    }
1736
1737
1
    default:
1738
1
        return -1;
1739
592
    }
1740
1741
436
    return 0;
1742
592
}
1743
1744
static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size,
1745
                                     int content_id, size_t *out_size,
1746
                                     enum cram_block_method_int method,
1747
0
                                     int level, int strat) {
1748
0
    switch (method) {
1749
0
    case GZIP:
1750
0
    case GZIP_RLE:
1751
0
    case GZIP_1:
1752
        // Read names bizarrely benefit from zlib over libdeflate for
1753
        // mid-range compression levels.  Focusing purely of ratio or
1754
        // speed, libdeflate still wins.  It also seems to win for
1755
        // other data series too.
1756
        //
1757
        // Eg RN at level 5;  libdeflate=55.9MB  zlib=51.6MB
1758
#ifdef HAVE_LIBDEFLATE
1759
#  if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8))
1760
        if (content_id == DS_RN && level >= 4 && level <= 7)
1761
            return zlib_mem_deflate(in, in_size, out_size, level, strat);
1762
        else
1763
#  endif
1764
            return libdeflate_deflate(in, in_size, out_size, level, strat);
1765
#else
1766
0
        return zlib_mem_deflate(in, in_size, out_size, level, strat);
1767
0
#endif
1768
1769
0
    case BZIP2: {
1770
0
#ifdef HAVE_LIBBZ2
1771
0
        unsigned int comp_size = in_size*1.01 + 600;
1772
0
        char *comp = malloc(comp_size);
1773
0
        if (!comp)
1774
0
            return NULL;
1775
1776
0
        if (BZ_OK != BZ2_bzBuffToBuffCompress(comp, &comp_size,
1777
0
                                              in, in_size,
1778
0
                                              level, 0, 30)) {
1779
0
            free(comp);
1780
0
            return NULL;
1781
0
        }
1782
0
        *out_size = comp_size;
1783
0
        return comp;
1784
#else
1785
        return NULL;
1786
#endif
1787
0
    }
1788
1789
0
    case FQZ:
1790
0
    case FQZ_b:
1791
0
    case FQZ_c:
1792
0
    case FQZ_d: {
1793
        // Extract the necessary portion of the slice into an fqz_slice struct.
1794
        // These previously were the same thing, but this permits us to detach
1795
        // the codec from the rest of this CRAM implementation.
1796
0
        fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice));
1797
0
        if (!f)
1798
0
            return NULL;
1799
0
        f->num_records = s->hdr->num_records;
1800
0
        f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice));
1801
0
        f->flags = f->len + s->hdr->num_records;
1802
0
        int i;
1803
0
        for (i = 0; i < s->hdr->num_records; i++) {
1804
0
            f->flags[i] = s->crecs[i].flags;
1805
0
            f->len[i] = (i+1 < s->hdr->num_records
1806
0
                         ? s->crecs[i+1].qual - s->crecs[i].qual
1807
0
                         : s->block[DS_QS]->uncomp_size - s->crecs[i].qual);
1808
0
        }
1809
0
        char *comp = fqz_compress(strat & 0xff /* cram vers */, f,
1810
0
                                  in, in_size, out_size, strat >> 8, NULL);
1811
0
        free(f);
1812
0
        return comp;
1813
0
    }
1814
1815
0
    case LZMA:
1816
0
#ifdef HAVE_LIBLZMA
1817
0
        return lzma_mem_deflate(in, in_size, out_size, level);
1818
#else
1819
        return NULL;
1820
#endif
1821
1822
0
    case RANS0:
1823
0
    case RANS1: {
1824
0
        unsigned int out_size_i;
1825
0
        unsigned char *cp;
1826
0
        cp = rans_compress((unsigned char *)in, in_size, &out_size_i,
1827
0
                           method == RANS0 ? 0 : 1);
1828
0
        *out_size = out_size_i;
1829
0
        return (char *)cp;
1830
0
    }
1831
1832
0
    case RANS_PR0:
1833
0
    case RANS_PR1:
1834
0
    case RANS_PR64:
1835
0
    case RANS_PR9:
1836
0
    case RANS_PR128:
1837
0
    case RANS_PR129:
1838
0
    case RANS_PR192:
1839
0
    case RANS_PR193: {
1840
0
        unsigned int out_size_i;
1841
0
        unsigned char *cp;
1842
1843
        // see enum cram_block. We map RANS_* methods to order bit-fields
1844
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1845
1846
0
        cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i,
1847
0
                                method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]);
1848
0
        *out_size = out_size_i;
1849
0
        return (char *)cp;
1850
0
    }
1851
1852
0
    case ARITH_PR0:
1853
0
    case ARITH_PR1:
1854
0
    case ARITH_PR64:
1855
0
    case ARITH_PR9:
1856
0
    case ARITH_PR128:
1857
0
    case ARITH_PR129:
1858
0
    case ARITH_PR192:
1859
0
    case ARITH_PR193: {
1860
0
        unsigned int out_size_i;
1861
0
        unsigned char *cp;
1862
1863
        // see enum cram_block. We map ARITH_* methods to order bit-fields
1864
0
        static int methmap[] = { 1, 64,9, 128,129, 192,193 };
1865
1866
0
        cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i,
1867
0
                               method == ARITH_PR0 ? 0 : methmap[method - ARITH_PR1]);
1868
0
        *out_size = out_size_i;
1869
0
        return (char *)cp;
1870
0
    }
1871
1872
0
    case TOK3:
1873
0
    case TOKA: {
1874
0
        int out_len;
1875
0
        int lev = level;
1876
0
        if (method == TOK3 && lev > 3)
1877
0
            lev = 3;
1878
0
        uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL);
1879
0
        *out_size = out_len;
1880
0
        return (char *)cp;
1881
0
    }
1882
1883
0
    case RAW:
1884
0
        break;
1885
1886
0
    default:
1887
0
        return NULL;
1888
0
    }
1889
1890
0
    return NULL;
1891
0
}
1892
1893
1894
/*
1895
 * Compresses a block using one of two different zlib strategies. If we only
1896
 * want one choice set strat2 to be -1.
1897
 *
1898
 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
1899
 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
1900
 * significantly faster.
1901
 *
1902
 * Method and level -1 implies defaults, as specified in cram_fd.
1903
 */
1904
int cram_compress_block2(cram_fd *fd, cram_slice *s,
1905
                         cram_block *b, cram_metrics *metrics,
1906
0
                         int method, int level) {
1907
1908
0
    if (!b)
1909
0
        return 0;
1910
1911
0
    char *comp = NULL;
1912
0
    size_t comp_size = 0;
1913
0
    int strat;
1914
1915
    // Internally we have parameterised methods that externally map
1916
    // to the same CRAM method value.
1917
    // See enum_cram_block_method_int in cram_structs.h.
1918
0
    int methmap[] = {
1919
        // Externally defined values
1920
0
        RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3,
1921
1922
        // Reserved for possible expansion
1923
0
        0, 0,
1924
1925
        // Internally parameterised versions matching back to above
1926
        // external values
1927
0
        GZIP, GZIP,
1928
0
        FQZ, FQZ, FQZ,
1929
0
        RANS,
1930
0
        RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR,
1931
0
        TOK3,
1932
0
        ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,  ARITH,
1933
0
    };
1934
1935
0
    if (b->method != RAW) {
1936
        // Maybe already compressed if s->block[0] was compressed and
1937
        // we have e.g. s->block[DS_BA] set to s->block[0] due to only
1938
        // one base type present and hence using E_HUFFMAN on block 0.
1939
        // A second explicit attempt to compress the same block then
1940
        // occurs.
1941
0
        return 0;
1942
0
    }
1943
1944
0
    if (method == -1) {
1945
0
        method = 1<<GZIP;
1946
0
        if (fd->use_bz2)
1947
0
            method |= 1<<BZIP2;
1948
0
        if (fd->use_lzma)
1949
0
            method |= 1<<LZMA;
1950
0
    }
1951
1952
0
    if (level == -1)
1953
0
        level = fd->level;
1954
1955
    //fprintf(stderr, "IN: block %d, sz %d\n", b->content_id, b->uncomp_size);
1956
1957
0
    if (method == RAW || level == 0 || b->uncomp_size == 0) {
1958
0
        b->method = RAW;
1959
0
        b->comp_size = b->uncomp_size;
1960
        //fprintf(stderr, "Skip block id %d\n", b->content_id);
1961
0
        return 0;
1962
0
    }
1963
1964
0
#ifndef ABS
1965
0
#    define ABS(a) ((a)>=0?(a):-(a))
1966
0
#endif
1967
1968
0
    if (metrics) {
1969
0
        pthread_mutex_lock(&fd->metrics_lock);
1970
        // Sudden changes in size trigger a retrial.  These are mainly
1971
        // triggered when switching to sorted / unsorted, where the number
1972
        // of elements in a slice radically changes.
1973
        //
1974
        // We also get large fluctuations based on genome coordinate for
1975
        // e.g. SA:Z and SC series, but we consider the typical scale of
1976
        // delta between blocks and use this to look for abnormality.
1977
0
        if (metrics->input_avg_sz &&
1978
0
            (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) ||
1979
0
             b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) &&
1980
0
            ABS(b->uncomp_size-metrics->input_avg_sz)
1981
0
                > 10*metrics->input_avg_delta) {
1982
0
            metrics->next_trial = 0;
1983
0
        }
1984
1985
0
        if (metrics->trial > 0 || --metrics->next_trial <= 0) {
1986
0
            int m, unpackable = metrics->unpackable;
1987
0
            size_t sz_best = b->uncomp_size;
1988
0
            size_t sz[CRAM_MAX_METHOD] = {0};
1989
0
            int method_best = 0; // RAW
1990
0
            char *c_best = NULL, *c = NULL;
1991
1992
0
            metrics->input_avg_delta =
1993
0
                0.9 * (metrics->input_avg_delta +
1994
0
                       ABS(b->uncomp_size - metrics->input_avg_sz));
1995
1996
0
            metrics->input_avg_sz += b->uncomp_size*.2;
1997
0
            metrics->input_avg_sz *= 0.8;
1998
1999
0
            if (metrics->revised_method)
2000
0
                method = metrics->revised_method;
2001
0
            else
2002
0
                metrics->revised_method = method;
2003
2004
0
            if (metrics->next_trial <= 0) {
2005
0
                metrics->next_trial = TRIAL_SPAN;
2006
0
                metrics->trial = NTRIALS;
2007
0
                for (m = 0; m < CRAM_MAX_METHOD; m++)
2008
0
                    metrics->sz[m] /= 2;
2009
0
                metrics->unpackable = 0;
2010
0
            }
2011
2012
            // Compress this block using the best method
2013
0
            if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) {
2014
                // No point trying bit-pack if 17+ symbols.
2015
0
                if (method & (1<<RANS_PR128))
2016
0
                    method = (method|(1<<RANS_PR0))&~(1<<RANS_PR128);
2017
0
                if (method & (1<<RANS_PR129))
2018
0
                    method = (method|(1<<RANS_PR1))&~(1<<RANS_PR129);
2019
0
                if (method & (1<<RANS_PR192))
2020
0
                    method = (method|(1<<RANS_PR64))&~(1<<RANS_PR192);
2021
0
                if (method & (1<<RANS_PR193))
2022
0
                    method = (method|(1<<RANS_PR64)|(1<<RANS_PR1))&~(1<<RANS_PR193);
2023
2024
0
                if (method & (1<<ARITH_PR128))
2025
0
                    method = (method|(1<<ARITH_PR0))&~(1<<ARITH_PR128);
2026
0
                if (method & (1<<ARITH_PR129))
2027
0
                    method = (method|(1<<ARITH_PR1))&~(1<<ARITH_PR129);
2028
0
                if (method & (1<<ARITH_PR192))
2029
0
                    method = (method|(1<<ARITH_PR64))&~(1<<ARITH_PR192);
2030
0
                if (method & (1u<<ARITH_PR193))
2031
0
                    method = (method|(1<<ARITH_PR64)|(1<<ARITH_PR1))&~(1u<<ARITH_PR193);
2032
0
            }
2033
2034
            // Libdeflate doesn't have a Z_RLE strategy.
2035
            // We treat it as level 1, but iff we haven't also
2036
            // explicitly listed that in the method list.
2037
#ifdef HAVE_LIBDEFLATE
2038
            if ((method & (1<<GZIP_RLE)) && (method & (1<<GZIP_1)))
2039
                method &= ~(1<<GZIP_RLE);
2040
#endif
2041
2042
0
            pthread_mutex_unlock(&fd->metrics_lock);
2043
2044
0
            for (m = 0; m < CRAM_MAX_METHOD; m++) {
2045
0
                if (method & (1u<<m)) {
2046
0
                    int lvl = level;
2047
0
                    switch (m) {
2048
0
                    case GZIP:     strat = Z_FILTERED; break;
2049
0
                    case GZIP_1:   strat = Z_DEFAULT_STRATEGY; lvl = 1; break;
2050
0
                    case GZIP_RLE: strat = Z_RLE; break;
2051
0
                    case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2052
0
                    case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2053
0
                    case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2054
0
                    case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2055
0
                    case TOK3:     strat = 0; break;
2056
0
                    case TOKA:     strat = 1; break;
2057
0
                    default:       strat = 0;
2058
0
                    }
2059
2060
0
                    c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2061
0
                                                b->content_id, &sz[m], m, lvl, strat);
2062
2063
0
                    if (c && sz_best > sz[m]) {
2064
0
                        sz_best = sz[m];
2065
0
                        method_best = m;
2066
0
                        if (c_best)
2067
0
                            free(c_best);
2068
0
                        c_best = c;
2069
0
                    } else if (c) {
2070
0
                        free(c);
2071
0
                    } else {
2072
0
                        sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2073
0
                    }
2074
0
                } else {
2075
0
                    sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw
2076
0
                }
2077
0
            }
2078
2079
0
            if (c_best) {
2080
0
                free(b->data);
2081
0
                b->data = (unsigned char *)c_best;
2082
0
                b->method = method_best; // adjusted to methmap[method_best] later
2083
0
                b->comp_size = sz_best;
2084
0
            }
2085
2086
            // Accumulate stats for all methods tried
2087
0
            pthread_mutex_lock(&fd->metrics_lock);
2088
0
            for (m = 0; m < CRAM_MAX_METHOD; m++)
2089
                // don't be overly sure on small blocks.
2090
                // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1
2091
                // needs to be at least 60 bytes smaller to overcome the
2092
                // fixed size addition.
2093
0
                metrics->sz[m] += sz[m]+2000;
2094
2095
            // When enough trials performed, find the best on average
2096
0
            if (--metrics->trial == 0) {
2097
0
                int best_method = RAW;
2098
0
                int best_sz = INT_MAX;
2099
2100
                // Relative costs of methods. See enum_cram_block_method_int
2101
                // and methmap
2102
0
                double meth_cost[32] = {
2103
                    // Externally defined methods
2104
0
                    1,    // 0  raw
2105
0
                    1.04, // 1  gzip (Z_FILTERED)
2106
0
                    1.07, // 2  bzip2
2107
0
                    1.08, // 3  lzma
2108
0
                    1.00, // 4  rans    (O0)
2109
0
                    1.00, // 5  ranspr  (O0)
2110
0
                    1.04, // 6  arithpr (O0)
2111
0
                    1.05, // 7  fqz
2112
0
                    1.05, // 8  tok3 (rans)
2113
0
                    1.00, 1.00, // 9,10 reserved
2114
2115
                    // Paramterised versions of above
2116
0
                    1.01, // gzip rle
2117
0
                    1.01, // gzip -1
2118
2119
0
                    1.05, 1.05, 1.05, // FQZ_b,c,d
2120
2121
0
                    1.01, // rans O1
2122
2123
0
                    1.01, // rans_pr1
2124
0
                    1.00, // rans_pr64; if smaller, usually fast
2125
0
                    1.03, // rans_pr65/9
2126
0
                    1.00, // rans_pr128
2127
0
                    1.01, // rans_pr129
2128
0
                    1.00, // rans_pr192
2129
0
                    1.01, // rans_pr193
2130
2131
0
                    1.07, // tok3 arith
2132
2133
0
                    1.04, // arith_pr1
2134
0
                    1.04, // arith_pr64
2135
0
                    1.04, // arith_pr9
2136
0
                    1.03, // arith_pr128
2137
0
                    1.04, // arith_pr129
2138
0
                    1.04, // arith_pr192
2139
0
                    1.04, // arith_pr193
2140
0
                };
2141
2142
                // Scale methods by cost based on compression level
2143
0
                if (fd->level <= 1) {
2144
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2145
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)*4;
2146
0
                } else if (fd->level <= 3) {
2147
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2148
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1);
2149
0
                } else if (fd->level <= 6) {
2150
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2151
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/2;
2152
0
                } else if (fd->level <= 7) {
2153
0
                    for (m = 0; m < CRAM_MAX_METHOD; m++)
2154
0
                        metrics->sz[m] *= 1+(meth_cost[m]-1)/3;
2155
0
                } // else cost is ignored
2156
2157
                // Ensure these are never used; BSC and ZSTD
2158
0
                metrics->sz[9] = metrics->sz[10] = INT_MAX;
2159
2160
0
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2161
0
                    if ((!metrics->sz[m]) || (!(method & (1u<<m))))
2162
0
                        continue;
2163
2164
0
                    if (best_sz > metrics->sz[m])
2165
0
                        best_sz = metrics->sz[m], best_method = m;
2166
0
                }
2167
2168
0
                if (best_method != metrics->method) {
2169
                    //metrics->trial = (NTRIALS+1)/2; // be sure
2170
                    //metrics->next_trial /= 1.5;
2171
0
                    metrics->consistency = 0;
2172
0
                } else {
2173
0
                    metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0);
2174
0
                    metrics->consistency++;
2175
0
                }
2176
2177
0
                metrics->method = best_method;
2178
0
                switch (best_method) {
2179
0
                case GZIP:     strat = Z_FILTERED; break;
2180
0
                case GZIP_1:   strat = Z_DEFAULT_STRATEGY; break;
2181
0
                case GZIP_RLE: strat = Z_RLE; break;
2182
0
                case FQZ:      strat = CRAM_MAJOR_VERS(fd->version); break;
2183
0
                case FQZ_b:    strat = CRAM_MAJOR_VERS(fd->version)+256; break;
2184
0
                case FQZ_c:    strat = CRAM_MAJOR_VERS(fd->version)+2*256; break;
2185
0
                case FQZ_d:    strat = CRAM_MAJOR_VERS(fd->version)+3*256; break;
2186
0
                default:       strat = 0;
2187
0
                }
2188
0
                metrics->strat  = strat;
2189
2190
                // If we see at least MAXFAIL trials in a row for a specific
2191
                // compression method with more than MAXDELTA aggregate
2192
                // size then we drop this from the list of methods used
2193
                // for this block type.
2194
0
#define MAXDELTA 0.20
2195
0
#define MAXFAILS 4
2196
0
                for (m = 0; m < CRAM_MAX_METHOD; m++) {
2197
0
                    if (best_method == m) {
2198
0
                        metrics->cnt[m] = 0;
2199
0
                        metrics->extra[m] = 0;
2200
0
                    } else if (best_sz < metrics->sz[m]) {
2201
0
                        double r = (double)metrics->sz[m] / best_sz - 1;
2202
0
                        int mul = 1+(fd->level>=7);
2203
0
                        if (++metrics->cnt[m] >= MAXFAILS*mul &&
2204
0
                            (metrics->extra[m] += r) >= MAXDELTA*mul)
2205
0
                            method &= ~(1u<<m);
2206
2207
                        // Special case for fqzcomp as it rarely changes
2208
0
                        if (m == FQZ || m == FQZ_b || m == FQZ_c || m == FQZ_d) {
2209
0
                            if (metrics->sz[m] > best_sz)
2210
0
                                method &= ~(1u<<m);
2211
0
                        }
2212
0
                    }
2213
0
                }
2214
2215
                //if (fd->verbose > 1 && method != metrics->revised_method)
2216
                //    fprintf(stderr, "%d: revising method from %x to %x\n",
2217
                //          b->content_id, metrics->revised_method, method);
2218
0
                metrics->revised_method = method;
2219
0
            }
2220
0
            pthread_mutex_unlock(&fd->metrics_lock);
2221
0
        } else {
2222
0
            metrics->input_avg_delta =
2223
0
                0.9 * (metrics->input_avg_delta +
2224
0
                       ABS(b->uncomp_size - metrics->input_avg_sz));
2225
2226
0
            metrics->input_avg_sz += b->uncomp_size*.2;
2227
0
            metrics->input_avg_sz *= 0.8;
2228
2229
0
            strat = metrics->strat;
2230
0
            method = metrics->method;
2231
2232
0
            pthread_mutex_unlock(&fd->metrics_lock);
2233
0
            comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2234
0
                                           b->content_id, &comp_size, method,
2235
0
                                           method == GZIP_1 ? 1 : level,
2236
0
                                           strat);
2237
0
            if (!comp)
2238
0
                return -1;
2239
2240
0
            if (comp_size < b->uncomp_size) {
2241
0
                free(b->data);
2242
0
                b->data = (unsigned char *)comp;
2243
0
                b->comp_size = comp_size;
2244
0
                b->method = method;
2245
0
            } else {
2246
0
                free(comp);
2247
0
            }
2248
0
        }
2249
2250
0
    } else {
2251
        // no cached metrics, so just do zlib?
2252
0
        comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size,
2253
0
                                       b->content_id, &comp_size, GZIP, level, Z_FILTERED);
2254
0
        if (!comp) {
2255
0
            hts_log_error("Compression failed!");
2256
0
            return -1;
2257
0
        }
2258
2259
0
        if (comp_size < b->uncomp_size) {
2260
0
            free(b->data);
2261
0
            b->data = (unsigned char *)comp;
2262
0
            b->comp_size = comp_size;
2263
0
            b->method = GZIP;
2264
0
        } else {
2265
0
            free(comp);
2266
0
        }
2267
0
        strat = Z_FILTERED;
2268
0
    }
2269
2270
0
    hts_log_info("Compressed block ID %d from %d to %d by method %s",
2271
0
                 b->content_id, b->uncomp_size, b->comp_size,
2272
0
                 cram_block_method2str(b->method));
2273
2274
0
    b->method = methmap[b->method];
2275
2276
0
    return 0;
2277
0
}
2278
int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics,
2279
0
                        int method, int level) {
2280
0
    return cram_compress_block2(fd, NULL, b, metrics, method, level);
2281
0
}
2282
2283
11.5k
cram_metrics *cram_new_metrics(void) {
2284
11.5k
    cram_metrics *m = calloc(1, sizeof(*m));
2285
11.5k
    if (!m)
2286
0
        return NULL;
2287
11.5k
    m->trial = NTRIALS-1;
2288
11.5k
    m->next_trial = TRIAL_SPAN/2; // learn quicker at start
2289
11.5k
    m->method = RAW;
2290
11.5k
    m->strat = 0;
2291
11.5k
    m->revised_method = 0;
2292
11.5k
    m->unpackable = 0;
2293
2294
11.5k
    return m;
2295
11.5k
}
2296
2297
0
char *cram_block_method2str(enum cram_block_method_int m) {
2298
0
    switch(m) {
2299
0
    case RAW:         return "RAW";
2300
0
    case GZIP:        return "GZIP";
2301
0
    case BZIP2:       return "BZIP2";
2302
0
    case LZMA:        return "LZMA";
2303
0
    case RANS0:       return "RANS0";
2304
0
    case RANS1:       return "RANS1";
2305
0
    case GZIP_RLE:    return "GZIP_RLE";
2306
0
    case GZIP_1:      return "GZIP_1";
2307
0
    case FQZ:         return "FQZ";
2308
0
    case FQZ_b:       return "FQZ_b";
2309
0
    case FQZ_c:       return "FQZ_c";
2310
0
    case FQZ_d:       return "FQZ_d";
2311
0
    case RANS_PR0:    return "RANS_PR0";
2312
0
    case RANS_PR1:    return "RANS_PR1";
2313
0
    case RANS_PR64:   return "RANS_PR64";
2314
0
    case RANS_PR9:    return "RANS_PR9";
2315
0
    case RANS_PR128:  return "RANS_PR128";
2316
0
    case RANS_PR129:  return "RANS_PR129";
2317
0
    case RANS_PR192:  return "RANS_PR192";
2318
0
    case RANS_PR193:  return "RANS_PR193";
2319
0
    case TOK3:        return "TOK3_R";
2320
0
    case TOKA:        return "TOK3_A";
2321
0
    case ARITH_PR0:   return "ARITH_PR0";
2322
0
    case ARITH_PR1:   return "ARITH_PR1";
2323
0
    case ARITH_PR64:  return "ARITH_PR64";
2324
0
    case ARITH_PR9:   return "ARITH_PR9";
2325
0
    case ARITH_PR128: return "ARITH_PR128";
2326
0
    case ARITH_PR129: return "ARITH_PR129";
2327
0
    case ARITH_PR192: return "ARITH_PR192";
2328
0
    case ARITH_PR193: return "ARITH_PR193";
2329
0
    case BM_ERROR: break;
2330
0
    }
2331
0
    return "?";
2332
0
}
2333
2334
1
char *cram_content_type2str(enum cram_content_type t) {
2335
1
    switch (t) {
2336
1
    case FILE_HEADER:         return "FILE_HEADER";
2337
0
    case COMPRESSION_HEADER:  return "COMPRESSION_HEADER";
2338
0
    case MAPPED_SLICE:        return "MAPPED_SLICE";
2339
0
    case UNMAPPED_SLICE:      return "UNMAPPED_SLICE";
2340
0
    case EXTERNAL:            return "EXTERNAL";
2341
0
    case CORE:                return "CORE";
2342
0
    case CT_ERROR:            break;
2343
1
    }
2344
0
    return "?";
2345
1
}
2346
2347
/* ----------------------------------------------------------------------
2348
 * Reference sequence handling
2349
 *
2350
 * These revolve around the refs_t structure, which may potentially be
2351
 * shared between multiple cram_fd.
2352
 *
2353
 * We start with refs_create() to allocate an empty refs_t and then
2354
 * populate it with @SQ line data using refs_from_header(). This is done on
2355
 * cram_open().  Also at start up we can call cram_load_reference() which
2356
 * is used with "scramble -r foo.fa". This replaces the fd->refs with the
2357
 * new one specified. In either case refs2id() is then called which
2358
 * maps ref_entry names to @SQ ids (refs_t->ref_id[]).
2359
 *
2360
 * Later, possibly within a thread, we will want to know the actual ref
2361
 * seq itself, obtained by calling cram_get_ref().  This may use the
2362
 * UR: or M5: fields or the filename specified in the original
2363
 * cram_load_reference() call.
2364
 *
2365
 * Given the potential for multi-threaded reference usage, we have
2366
 * reference counting (sorry for the confusing double use of "ref") to
2367
 * track the number of callers interested in any specific reference.
2368
 */
2369
2370
/*
2371
 * Frees/unmaps a reference sequence and associated file handles.
2372
 */
2373
216
static void ref_entry_free_seq(ref_entry *e) {
2374
216
    if (e->mf)
2375
0
        mfclose(e->mf);
2376
216
    if (e->seq && !e->mf)
2377
0
        free(e->seq);
2378
2379
216
    e->seq = NULL;
2380
216
    e->mf = NULL;
2381
216
}
2382
2383
245
void refs_free(refs_t *r) {
2384
245
    RP("refs_free()\n");
2385
2386
245
    if (--r->count > 0)
2387
0
        return;
2388
2389
245
    if (!r)
2390
0
        return;
2391
2392
245
    if (r->pool)
2393
245
        string_pool_destroy(r->pool);
2394
2395
245
    if (r->h_meta) {
2396
245
        khint_t k;
2397
2398
653
        for (k = kh_begin(r->h_meta); k != kh_end(r->h_meta); k++) {
2399
408
            ref_entry *e;
2400
2401
408
            if (!kh_exist(r->h_meta, k))
2402
192
                continue;
2403
216
            if (!(e = kh_val(r->h_meta, k)))
2404
0
                continue;
2405
216
            ref_entry_free_seq(e);
2406
216
            free(e);
2407
216
        }
2408
2409
245
        kh_destroy(refs, r->h_meta);
2410
245
    }
2411
2412
245
    if (r->ref_id)
2413
18
        free(r->ref_id);
2414
2415
245
    if (r->fp)
2416
0
        bgzf_close(r->fp);
2417
2418
245
    pthread_mutex_destroy(&r->lock);
2419
2420
245
    free(r);
2421
245
}
2422
2423
245
static refs_t *refs_create(void) {
2424
245
    refs_t *r = calloc(1, sizeof(*r));
2425
2426
245
    RP("refs_create()\n");
2427
2428
245
    if (!r)
2429
0
        return NULL;
2430
2431
245
    if (!(r->pool = string_pool_create(8192)))
2432
0
        goto err;
2433
2434
245
    r->ref_id = NULL; // see refs2id() to populate.
2435
245
    r->count = 1;
2436
245
    r->last = NULL;
2437
245
    r->last_id = -1;
2438
2439
245
    if (!(r->h_meta = kh_init(refs)))
2440
0
        goto err;
2441
2442
245
    pthread_mutex_init(&r->lock, NULL);
2443
2444
245
    return r;
2445
2446
0
 err:
2447
0
    refs_free(r);
2448
0
    return NULL;
2449
245
}
2450
2451
/*
2452
 * Opens a reference fasta file as a BGZF stream, allowing for
2453
 * compressed files.  It automatically builds a .fai file if
2454
 * required and if compressed a .gzi bgzf index too.
2455
 *
2456
 * Returns a BGZF handle on success;
2457
 *         NULL on failure.
2458
 */
2459
0
static BGZF *bgzf_open_ref(char *fn, char *mode, int is_md5) {
2460
0
    BGZF *fp;
2461
2462
0
    if (!is_md5 && !hisremote(fn)) {
2463
0
        char fai_file[PATH_MAX];
2464
2465
0
        snprintf(fai_file, PATH_MAX, "%s.fai", fn);
2466
0
        if (access(fai_file, R_OK) != 0)
2467
0
            if (fai_build(fn) != 0)
2468
0
                return NULL;
2469
0
    }
2470
2471
0
    if (!(fp = bgzf_open(fn, mode))) {
2472
0
        perror(fn);
2473
0
        return NULL;
2474
0
    }
2475
2476
0
    if (fp->is_compressed == 1 && bgzf_index_load(fp, fn, ".gzi") < 0) {
2477
0
        hts_log_error("Unable to load .gzi index '%s.gzi'", fn);
2478
0
        bgzf_close(fp);
2479
0
        return NULL;
2480
0
    }
2481
2482
0
    return fp;
2483
0
}
2484
2485
/*
2486
 * Loads a FAI file for a reference.fasta.
2487
 * "is_err" indicates whether failure to load is worthy of emitting an
2488
 * error message. In some cases (eg with embedded references) we
2489
 * speculatively load, just in case, and silently ignore errors.
2490
 *
2491
 * Returns the refs_t struct on success (maybe newly allocated);
2492
 *         NULL on failure
2493
 */
2494
0
static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) {
2495
0
    hFILE *fp = NULL;
2496
0
    char fai_fn[PATH_MAX];
2497
0
    char line[8192];
2498
0
    refs_t *r = r_orig;
2499
0
    size_t fn_l = strlen(fn);
2500
0
    int id = 0, id_alloc = 0;
2501
2502
0
    RP("refs_load_fai %s\n", fn);
2503
2504
0
    if (!r)
2505
0
        if (!(r = refs_create()))
2506
0
            goto err;
2507
2508
0
    if (r->fp)
2509
0
        if (bgzf_close(r->fp) != 0)
2510
0
            goto err;
2511
0
    r->fp = NULL;
2512
2513
    /* Look for a FASTA##idx##FAI format */
2514
0
    char *fn_delim = strstr(fn, HTS_IDX_DELIM);
2515
0
    if (fn_delim) {
2516
0
        if (!(r->fn = string_ndup(r->pool, fn, fn_delim - fn)))
2517
0
            goto err;
2518
0
        fn_delim += strlen(HTS_IDX_DELIM);
2519
0
        snprintf(fai_fn, PATH_MAX, "%s", fn_delim);
2520
0
    } else {
2521
        /* An index file was provided, instead of the actual reference file */
2522
0
        if (fn_l > 4 && strcmp(&fn[fn_l-4], ".fai") == 0) {
2523
0
            if (!r->fn) {
2524
0
                if (!(r->fn = string_ndup(r->pool, fn, fn_l-4)))
2525
0
                    goto err;
2526
0
            }
2527
0
            snprintf(fai_fn, PATH_MAX, "%s", fn);
2528
0
        } else {
2529
        /* Only the reference file provided. Get the index file name from it */
2530
0
            if (!(r->fn = string_dup(r->pool, fn)))
2531
0
                goto err;
2532
0
            sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn);
2533
0
        }
2534
0
    }
2535
2536
0
    if (!(r->fp = bgzf_open_ref(r->fn, "r", 0))) {
2537
0
        hts_log_error("Failed to open reference file '%s'", r->fn);
2538
0
        goto err;
2539
0
    }
2540
2541
0
    if (!(fp = hopen(fai_fn, "r"))) {
2542
0
        hts_log_error("Failed to open index file '%s'", fai_fn);
2543
0
        if (is_err)
2544
0
            perror(fai_fn);
2545
0
        goto err;
2546
0
    }
2547
0
    while (hgets(line, 8192, fp) != NULL) {
2548
0
        ref_entry *e = malloc(sizeof(*e));
2549
0
        char *cp;
2550
0
        int n;
2551
0
        khint_t k;
2552
2553
0
        if (!e)
2554
0
            return NULL;
2555
2556
        // id
2557
0
        for (cp = line; *cp && !isspace_c(*cp); cp++)
2558
0
            ;
2559
0
        *cp++ = 0;
2560
0
        e->name = string_dup(r->pool, line);
2561
2562
        // length
2563
0
        while (*cp && isspace_c(*cp))
2564
0
            cp++;
2565
0
        e->length = strtoll(cp, &cp, 10);
2566
2567
        // offset
2568
0
        while (*cp && isspace_c(*cp))
2569
0
            cp++;
2570
0
        e->offset = strtoll(cp, &cp, 10);
2571
2572
        // bases per line
2573
0
        while (*cp && isspace_c(*cp))
2574
0
            cp++;
2575
0
        e->bases_per_line = strtol(cp, &cp, 10);
2576
2577
        // line length
2578
0
        while (*cp && isspace_c(*cp))
2579
0
            cp++;
2580
0
        e->line_length = strtol(cp, &cp, 10);
2581
2582
        // filename
2583
0
        e->fn = r->fn;
2584
2585
0
        e->count = 0;
2586
0
        e->seq = NULL;
2587
0
        e->mf = NULL;
2588
0
        e->is_md5 = 0;
2589
0
        e->validated_md5 = 0;
2590
2591
0
        k = kh_put(refs, r->h_meta, e->name, &n);
2592
0
        if (-1 == n)  {
2593
0
            free(e);
2594
0
            return NULL;
2595
0
        }
2596
2597
0
        if (n) {
2598
0
            kh_val(r->h_meta, k) = e;
2599
0
        } else {
2600
0
            ref_entry *re = kh_val(r->h_meta, k);
2601
0
            if (re && (re->count != 0 || re->length != 0)) {
2602
                /* Keep old */
2603
0
                free(e);
2604
0
            } else {
2605
                /* Replace old */
2606
0
                if (re)
2607
0
                    free(re);
2608
0
                kh_val(r->h_meta, k) = e;
2609
0
            }
2610
0
        }
2611
2612
0
        if (id >= id_alloc) {
2613
0
            ref_entry **new_refs;
2614
0
            int x;
2615
2616
0
            id_alloc = id_alloc ?id_alloc*2 : 16;
2617
0
            new_refs = realloc(r->ref_id, id_alloc * sizeof(*r->ref_id));
2618
0
            if (!new_refs)
2619
0
                goto err;
2620
0
            r->ref_id = new_refs;
2621
2622
0
            for (x = id; x < id_alloc; x++)
2623
0
                r->ref_id[x] = NULL;
2624
0
        }
2625
0
        r->ref_id[id] = e;
2626
0
        r->nref = ++id;
2627
0
    }
2628
2629
0
    if(hclose(fp) < 0)
2630
0
        goto err;
2631
0
    return r;
2632
2633
0
 err:
2634
0
    if (fp)
2635
0
        hclose_abruptly(fp);
2636
2637
0
    if (!r_orig)
2638
0
        refs_free(r);
2639
2640
0
    return NULL;
2641
0
}
2642
2643
/*
2644
 * Verifies that the CRAM @SQ lines and .fai files match.
2645
 */
2646
0
static void sanitise_SQ_lines(cram_fd *fd) {
2647
0
    int i;
2648
2649
0
    if (!fd->header || !fd->header->hrecs)
2650
0
        return;
2651
2652
0
    if (!fd->refs || !fd->refs->h_meta)
2653
0
        return;
2654
2655
0
    for (i = 0; i < fd->header->hrecs->nref; i++) {
2656
0
        const char *name = fd->header->hrecs->ref[i].name;
2657
0
        khint_t k = kh_get(refs, fd->refs->h_meta, name);
2658
0
        ref_entry *r;
2659
2660
        // We may have @SQ lines which have no known .fai, but do not
2661
        // in themselves pose a problem because they are unused in the file.
2662
0
        if (k == kh_end(fd->refs->h_meta))
2663
0
            continue;
2664
2665
0
        if (!(r = (ref_entry *)kh_val(fd->refs->h_meta, k)))
2666
0
            continue;
2667
2668
0
        if (r->length && r->length != fd->header->hrecs->ref[i].len) {
2669
0
            assert(strcmp(r->name, fd->header->hrecs->ref[i].name) == 0);
2670
2671
            // Should we also check MD5sums here to ensure the correct
2672
            // reference was given?
2673
0
            hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
2674
0
                            r->name, fd->header->hrecs->ref[i].len, (int)r->length);
2675
2676
            // Fixing the parsed @SQ header will make MD:Z: strings work
2677
            // and also stop it producing N for the sequence.
2678
0
            fd->header->hrecs->ref[i].len = r->length;
2679
0
        }
2680
0
    }
2681
0
}
2682
2683
/*
2684
 * Indexes references by the order they appear in a BAM file. This may not
2685
 * necessarily be the same order they appear in the fasta reference file.
2686
 *
2687
 * Returns 0 on success
2688
 *        -1 on failure
2689
 */
2690
0
int refs2id(refs_t *r, sam_hdr_t *hdr) {
2691
0
    int i;
2692
0
    sam_hrecs_t *h = hdr->hrecs;
2693
2694
0
    if (r->ref_id)
2695
0
        free(r->ref_id);
2696
0
    if (r->last)
2697
0
        r->last = NULL;
2698
2699
0
    r->ref_id = calloc(h->nref, sizeof(*r->ref_id));
2700
0
    if (!r->ref_id)
2701
0
        return -1;
2702
2703
0
    r->nref = h->nref;
2704
0
    for (i = 0; i < h->nref; i++) {
2705
0
        khint_t k = kh_get(refs, r->h_meta, h->ref[i].name);
2706
0
        if (k != kh_end(r->h_meta)) {
2707
0
            r->ref_id[i] = kh_val(r->h_meta, k);
2708
0
        } else {
2709
0
            hts_log_warning("Unable to find ref name '%s'", h->ref[i].name);
2710
0
        }
2711
0
    }
2712
2713
0
    return 0;
2714
0
}
2715
2716
/*
2717
 * Generates refs_t entries based on @SQ lines in the header.
2718
 * Returns 0 on success
2719
 *         -1 on failure
2720
 */
2721
245
static int refs_from_header(cram_fd *fd) {
2722
245
    if (!fd)
2723
0
        return -1;
2724
2725
245
    refs_t *r = fd->refs;
2726
245
    if (!r)
2727
0
        return -1;
2728
2729
245
    sam_hdr_t *h = fd->header;
2730
245
    if (!h)
2731
0
        return 0;
2732
2733
245
    if (!h->hrecs) {
2734
197
        if (-1 == sam_hdr_fill_hrecs(h))
2735
0
            return -1;
2736
197
    }
2737
2738
245
    if (h->hrecs->nref == 0)
2739
227
        return 0;
2740
2741
    //fprintf(stderr, "refs_from_header for %p mode %c\n", fd, fd->mode);
2742
2743
    /* Existing refs are fine, as long as they're compatible with the hdr. */
2744
18
    ref_entry **new_ref_id = realloc(r->ref_id, (r->nref + h->hrecs->nref) * sizeof(*r->ref_id));
2745
18
    if (!new_ref_id)
2746
0
        return -1;
2747
18
    r->ref_id = new_ref_id;
2748
2749
18
    int i, j;
2750
    /* Copy info from h->ref[i] over to r */
2751
234
    for (i = 0, j = r->nref; i < h->hrecs->nref; i++) {
2752
216
        sam_hrec_type_t *ty;
2753
216
        sam_hrec_tag_t *tag;
2754
216
        khint_t k;
2755
216
        int n;
2756
2757
216
        k = kh_get(refs, r->h_meta, h->hrecs->ref[i].name);
2758
216
        if (k != kh_end(r->h_meta))
2759
            // Ref already known about
2760
0
            continue;
2761
2762
216
        if (!(r->ref_id[j] = calloc(1, sizeof(ref_entry))))
2763
0
            return -1;
2764
2765
216
        if (!h->hrecs->ref[i].name)
2766
0
            return -1;
2767
2768
216
        r->ref_id[j]->name = string_dup(r->pool, h->hrecs->ref[i].name);
2769
216
        if (!r->ref_id[j]->name) return -1;
2770
216
        r->ref_id[j]->length = 0; // marker for not yet loaded
2771
2772
        /* Initialise likely filename if known */
2773
216
        if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) {
2774
216
            if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) {
2775
0
                r->ref_id[j]->fn = string_dup(r->pool, tag->str+3);
2776
                //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn);
2777
0
            }
2778
216
        }
2779
2780
216
        k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n);
2781
216
        if (n <= 0) // already exists or error
2782
0
            return -1;
2783
216
        kh_val(r->h_meta, k) = r->ref_id[j];
2784
2785
216
        j++;
2786
216
    }
2787
18
    r->nref = j;
2788
2789
18
    return 0;
2790
18
}
2791
2792
/*
2793
 * Attaches a header to a cram_fd.
2794
 *
2795
 * This should be used when creating a new cram_fd for writing where
2796
 * we have a header already constructed (eg from a file we've read
2797
 * in).
2798
 */
2799
0
int cram_set_header2(cram_fd *fd, const sam_hdr_t *hdr) {
2800
0
    if (!fd || !hdr )
2801
0
        return -1;
2802
2803
0
    if (fd->header != hdr) {
2804
0
        if (fd->header)
2805
0
            sam_hdr_destroy(fd->header);
2806
0
        fd->header = sam_hdr_dup(hdr);
2807
0
        if (!fd->header)
2808
0
            return -1;
2809
0
    }
2810
0
    return refs_from_header(fd);
2811
0
}
2812
2813
0
int cram_set_header(cram_fd *fd, sam_hdr_t *hdr) {
2814
0
    return cram_set_header2(fd, hdr);
2815
0
}
2816
2817
/*
2818
 * Returns whether the path refers to a directory.
2819
 */
2820
0
static int is_directory(char *fn) {
2821
0
    struct stat buf;
2822
0
    if ( stat(fn,&buf) ) return 0;
2823
0
    return S_ISDIR(buf.st_mode);
2824
0
}
2825
2826
/*
2827
 * Converts a directory and a filename into an expanded path, replacing %s
2828
 * in directory with the filename and %[0-9]+s with portions of the filename
2829
 * Any remaining parts of filename are added to the end with /%s.
2830
 */
2831
0
static int expand_cache_path(char *path, char *dir, const char *fn) {
2832
0
    char *cp, *start = path;
2833
0
    size_t len;
2834
0
    size_t sz = PATH_MAX;
2835
2836
0
    while ((cp = strchr(dir, '%'))) {
2837
0
        if (cp-dir >= sz) return -1;
2838
0
        strncpy(path, dir, cp-dir);
2839
0
        path += cp-dir;
2840
0
        sz -= cp-dir;
2841
2842
0
        if (*++cp == 's') {
2843
0
            len = strlen(fn);
2844
0
            if (len >= sz) return -1;
2845
0
            strcpy(path, fn);
2846
0
            path += len;
2847
0
            sz -= len;
2848
0
            fn += len;
2849
0
            cp++;
2850
0
        } else if (*cp >= '0' && *cp <= '9') {
2851
0
            char *endp;
2852
0
            long l;
2853
2854
0
            l = strtol(cp, &endp, 10);
2855
0
            l = MIN(l, strlen(fn));
2856
0
            if (*endp == 's') {
2857
0
                if (l >= sz) return -1;
2858
0
                strncpy(path, fn, l);
2859
0
                path += l;
2860
0
                fn += l;
2861
0
                sz -= l;
2862
0
                *path = 0;
2863
0
                cp = endp+1;
2864
0
            } else {
2865
0
                if (sz < 3) return -1;
2866
0
                *path++ = '%';
2867
0
                *path++ = *cp++;
2868
0
            }
2869
0
        } else {
2870
0
            if (sz < 3) return -1;
2871
0
            *path++ = '%';
2872
0
            *path++ = *cp++;
2873
0
        }
2874
0
        dir = cp;
2875
0
    }
2876
2877
0
    len = strlen(dir);
2878
0
    if (len >= sz) return -1;
2879
0
    strcpy(path, dir);
2880
0
    path += len;
2881
0
    sz -= len;
2882
2883
0
    len = strlen(fn) + ((*fn && path > start && path[-1] != '/') ? 1 : 0);
2884
0
    if (len >= sz) return -1;
2885
0
    if (*fn && path > start && path[-1] != '/')
2886
0
        *path++ = '/';
2887
0
    strcpy(path, fn);
2888
0
    return 0;
2889
0
}
2890
2891
/*
2892
 * Make the directory containing path and any prefix directories.
2893
 */
2894
0
static void mkdir_prefix(char *path, int mode) {
2895
0
    char *cp = strrchr(path, '/');
2896
0
    if (!cp)
2897
0
        return;
2898
2899
0
    *cp = 0;
2900
0
    if (is_directory(path)) {
2901
0
        *cp = '/';
2902
0
        return;
2903
0
    }
2904
2905
0
    if (mkdir(path, mode) == 0) {
2906
0
        chmod(path, mode);
2907
0
        *cp = '/';
2908
0
        return;
2909
0
    }
2910
2911
0
    mkdir_prefix(path, mode);
2912
0
    mkdir(path, mode);
2913
0
    chmod(path, mode);
2914
0
    *cp = '/';
2915
0
}
2916
2917
/*
2918
 * Return the cache directory to use, based on the first of these
2919
 * environment variables to be set to a non-empty value.
2920
 */
2921
0
static const char *get_cache_basedir(const char **extra) {
2922
0
    char *base;
2923
2924
0
    *extra = "";
2925
2926
0
    base = getenv("XDG_CACHE_HOME");
2927
0
    if (base && *base) return base;
2928
2929
0
    base = getenv("HOME");
2930
0
    if (base && *base) { *extra = "/.cache"; return base; }
2931
2932
0
    base = getenv("TMPDIR");
2933
0
    if (base && *base) return base;
2934
2935
0
    base = getenv("TEMP");
2936
0
    if (base && *base) return base;
2937
2938
0
    return "/tmp";
2939
0
}
2940
2941
/*
2942
 * Queries the M5 string from the header and attempts to populate the
2943
 * reference from this using the REF_PATH environment.
2944
 *
2945
 * Returns 0 on success
2946
 *        -1 on failure
2947
 */
2948
0
static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) {
2949
0
    char *ref_path = getenv("REF_PATH");
2950
0
    sam_hrec_type_t *ty;
2951
0
    sam_hrec_tag_t *tag;
2952
0
    char path[PATH_MAX];
2953
0
    kstring_t path_tmp = KS_INITIALIZE;
2954
0
    char cache[PATH_MAX], cache_root[PATH_MAX];
2955
0
    char *local_cache = getenv("REF_CACHE");
2956
0
    mFILE *mf;
2957
0
    int local_path = 0;
2958
2959
0
    hts_log_info("Running cram_populate_ref on fd %p, id %d", (void *)fd, id);
2960
2961
0
    cache_root[0] = '\0';
2962
2963
0
    if (!ref_path || *ref_path == '\0') {
2964
        /*
2965
         * If we have no ref path, we use the EBI server.
2966
         * However to avoid spamming it we require a local ref cache too.
2967
         */
2968
0
        ref_path = "https://www.ebi.ac.uk/ena/cram/md5/%s";
2969
0
        if (!local_cache || *local_cache == '\0') {
2970
0
            const char *extra;
2971
0
            const char *base = get_cache_basedir(&extra);
2972
0
            snprintf(cache_root, PATH_MAX, "%s%s/hts-ref", base, extra);
2973
0
            snprintf(cache,PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, extra);
2974
0
            local_cache = cache;
2975
0
            hts_log_info("Populating local cache: %s", local_cache);
2976
0
        }
2977
0
    }
2978
2979
0
    if (!r->name)
2980
0
        return -1;
2981
2982
0
    if (!(ty = sam_hrecs_find_type_id(fd->header->hrecs, "SQ", "SN", r->name)))
2983
0
        return -1;
2984
2985
0
    if (!(tag = sam_hrecs_find_key(ty, "M5", NULL)))
2986
0
        goto no_M5;
2987
2988
0
    hts_log_info("Querying ref %s", tag->str+3);
2989
2990
    /* Use cache if available */
2991
0
    if (local_cache && *local_cache) {
2992
0
        if (expand_cache_path(path, local_cache, tag->str+3) == 0)
2993
0
            local_path = 1;
2994
0
    }
2995
2996
#ifndef HAVE_MMAP
2997
    char *path2;
2998
    /* Search local files in REF_PATH; we can open them and return as above */
2999
    if (!local_path && (path2 = find_path(tag->str+3, ref_path))) {
3000
        int len = snprintf(path, PATH_MAX, "%s", path2);
3001
        free(path2);
3002
        if (len > 0 && len < PATH_MAX) // in case it's too long
3003
            local_path = 1;
3004
    }
3005
#endif
3006
3007
    /* Found via REF_CACHE or local REF_PATH file */
3008
0
    if (local_path) {
3009
0
        struct stat sb;
3010
0
        BGZF *fp;
3011
3012
0
        if (0 == stat(path, &sb)
3013
0
            && S_ISREG(sb.st_mode)
3014
0
            && (fp = bgzf_open(path, "r"))) {
3015
0
            r->length = sb.st_size;
3016
0
            r->offset = r->line_length = r->bases_per_line = 0;
3017
3018
0
            r->fn = string_dup(fd->refs->pool, path);
3019
3020
0
            if (fd->refs->fp)
3021
0
                if (bgzf_close(fd->refs->fp) != 0)
3022
0
                    return -1;
3023
0
            fd->refs->fp = fp;
3024
0
            fd->refs->fn = r->fn;
3025
0
            r->is_md5 = 1;
3026
0
            r->validated_md5 = 1;
3027
3028
            // Fall back to cram_get_ref() where it'll do the actual
3029
            // reading of the file.
3030
0
            return 0;
3031
0
        }
3032
0
    }
3033
3034
3035
    /* Otherwise search full REF_PATH; slower as loads entire file */
3036
0
    if ((mf = open_path_mfile(tag->str+3, ref_path, NULL))) {
3037
0
        size_t sz;
3038
0
        r->seq = mfsteal(mf, &sz);
3039
0
        if (r->seq) {
3040
0
            r->mf = NULL;
3041
0
        } else {
3042
            // keep mf around as we couldn't detach
3043
0
            r->seq = mf->data;
3044
0
            r->mf = mf;
3045
0
        }
3046
0
        r->length = sz;
3047
0
        r->is_md5 = 1;
3048
0
        r->validated_md5 = 1;
3049
0
    } else {
3050
0
        refs_t *refs;
3051
0
        const char *fn;
3052
3053
0
    no_M5:
3054
        /* Failed to find in search path or M5 cache, see if @SQ UR: tag? */
3055
0
        if (!(tag = sam_hrecs_find_key(ty, "UR", NULL)))
3056
0
            return -1;
3057
3058
0
        fn = (strncmp(tag->str+3, "file:", 5) == 0)
3059
0
            ? tag->str+8
3060
0
            : tag->str+3;
3061
3062
0
        if (fd->refs->fp) {
3063
0
            if (bgzf_close(fd->refs->fp) != 0)
3064
0
                return -1;
3065
0
            fd->refs->fp = NULL;
3066
0
        }
3067
0
        if (!(refs = refs_load_fai(fd->refs, fn, 0)))
3068
0
            return -1;
3069
0
        sanitise_SQ_lines(fd);
3070
3071
0
        fd->refs = refs;
3072
0
        if (fd->refs->fp) {
3073
0
            if (bgzf_close(fd->refs->fp) != 0)
3074
0
                return -1;
3075
0
            fd->refs->fp = NULL;
3076
0
        }
3077
3078
0
        if (!fd->refs->fn)
3079
0
            return -1;
3080
3081
0
        if (-1 == refs2id(fd->refs, fd->header))
3082
0
            return -1;
3083
0
        if (!fd->refs->ref_id || !fd->refs->ref_id[id])
3084
0
            return -1;
3085
3086
        // Local copy already, so fall back to cram_get_ref().
3087
0
        return 0;
3088
0
    }
3089
3090
    /* Populate the local disk cache if required */
3091
0
    if (local_cache && *local_cache) {
3092
0
        hFILE *fp;
3093
3094
0
        if (*cache_root && !is_directory(cache_root)) {
3095
0
            hts_log_warning("Creating reference cache directory %s\n"
3096
0
                            "This may become large; see the samtools(1) manual page REF_CACHE discussion",
3097
0
                            cache_root);
3098
0
        }
3099
3100
0
        if (expand_cache_path(path, local_cache, tag->str+3) < 0) {
3101
0
            return 0; // Not fatal - we have the data already so keep going.
3102
0
        }
3103
0
        hts_log_info("Writing cache file '%s'", path);
3104
0
        mkdir_prefix(path, 01777);
3105
3106
0
        fp = hts_open_tmpfile(path, "wx", &path_tmp);
3107
0
        if (!fp) {
3108
0
            perror(path_tmp.s);
3109
0
            free(path_tmp.s);
3110
3111
            // Not fatal - we have the data already so keep going.
3112
0
            return 0;
3113
0
        }
3114
3115
        // Check md5sum
3116
0
        hts_md5_context *md5;
3117
0
        char unsigned md5_buf1[16];
3118
0
        char md5_buf2[33];
3119
3120
0
        if (!(md5 = hts_md5_init())) {
3121
0
            hclose_abruptly(fp);
3122
0
            unlink(path_tmp.s);
3123
0
            free(path_tmp.s);
3124
0
            return -1;
3125
0
        }
3126
0
        hts_md5_update(md5, r->seq, r->length);
3127
0
        hts_md5_final(md5_buf1, md5);
3128
0
        hts_md5_destroy(md5);
3129
0
        hts_md5_hex(md5_buf2, md5_buf1);
3130
3131
0
        if (strncmp(tag->str+3, md5_buf2, 32) != 0) {
3132
0
            hts_log_error("Mismatching md5sum for downloaded reference");
3133
0
            hclose_abruptly(fp);
3134
0
            unlink(path_tmp.s);
3135
0
            free(path_tmp.s);
3136
0
            return -1;
3137
0
        }
3138
3139
0
        ssize_t length_written = hwrite(fp, r->seq, r->length);
3140
0
        if (hclose(fp) < 0 || length_written != r->length ||
3141
0
            chmod(path_tmp.s, 0444) < 0 ||
3142
0
            rename(path_tmp.s, path) < 0) {
3143
0
            hts_log_error("Creating reference at %s failed: %s",
3144
0
                          path, strerror(errno));
3145
0
            unlink(path_tmp.s);
3146
0
        }
3147
0
    }
3148
3149
0
    free(path_tmp.s);
3150
0
    return 0;
3151
0
}
3152
3153
0
static void cram_ref_incr_locked(refs_t *r, int id) {
3154
0
    RP("%d INC REF %d, %d %p\n", gettid(), id,
3155
0
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count+1:-999),
3156
0
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3157
3158
0
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq)
3159
0
        return;
3160
3161
0
    if (r->last_id == id)
3162
0
        r->last_id = -1;
3163
3164
0
    ++r->ref_id[id]->count;
3165
0
}
3166
3167
0
void cram_ref_incr(refs_t *r, int id) {
3168
0
    pthread_mutex_lock(&r->lock);
3169
0
    cram_ref_incr_locked(r, id);
3170
0
    pthread_mutex_unlock(&r->lock);
3171
0
}
3172
3173
0
static void cram_ref_decr_locked(refs_t *r, int id) {
3174
0
    RP("%d DEC REF %d, %d %p\n", gettid(), id,
3175
0
       (int)(id>=0 && r->ref_id[id]?r->ref_id[id]->count-1:-999),
3176
0
       id>=0 && r->ref_id[id]?r->ref_id[id]->seq:(char *)1);
3177
3178
0
    if (id < 0 || !r->ref_id[id] || !r->ref_id[id]->seq) {
3179
0
        return;
3180
0
    }
3181
3182
0
    if (--r->ref_id[id]->count <= 0) {
3183
0
        assert(r->ref_id[id]->count == 0);
3184
0
        if (r->last_id >= 0) {
3185
0
            if (r->ref_id[r->last_id]->count <= 0 &&
3186
0
                r->ref_id[r->last_id]->seq) {
3187
0
                RP("%d FREE REF %d (%p)\n", gettid(),
3188
0
                   r->last_id, r->ref_id[r->last_id]->seq);
3189
0
                ref_entry_free_seq(r->ref_id[r->last_id]);
3190
0
                if (r->ref_id[r->last_id]->is_md5) r->ref_id[r->last_id]->length = 0;
3191
0
            }
3192
0
        }
3193
0
        r->last_id = id;
3194
0
    }
3195
0
}
3196
3197
0
void cram_ref_decr(refs_t *r, int id) {
3198
0
    pthread_mutex_lock(&r->lock);
3199
0
    cram_ref_decr_locked(r, id);
3200
0
    pthread_mutex_unlock(&r->lock);
3201
0
}
3202
3203
/*
3204
 * Used by cram_ref_load and cram_ref_get. The file handle will have
3205
 * already been opened, so we can catch it. The ref_entry *e informs us
3206
 * of whether this is a multi-line fasta file or a raw MD5 style file.
3207
 * Either way we create a single contiguous sequence.
3208
 *
3209
 * Returns all or part of a reference sequence on success (malloced);
3210
 *         NULL on failure.
3211
 */
3212
0
static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) {
3213
0
    off_t offset, len;
3214
0
    char *seq;
3215
3216
0
    if (end < start)
3217
0
        end = start;
3218
3219
    /*
3220
     * Compute locations in file. This is trivial for the MD5 files, but
3221
     * is still necessary for the fasta variants.
3222
     */
3223
0
    offset = e->line_length
3224
0
        ? e->offset + (start-1)/e->bases_per_line * e->line_length +
3225
0
          (start-1) % e->bases_per_line
3226
0
        : start-1;
3227
3228
0
    len = (e->line_length
3229
0
           ? e->offset + (end-1)/e->bases_per_line * e->line_length +
3230
0
             (end-1) % e->bases_per_line
3231
0
           : end-1) - offset + 1;
3232
3233
0
    if (bgzf_useek(fp, offset, SEEK_SET) < 0) {
3234
0
        perror("bgzf_useek() on reference file");
3235
0
        return NULL;
3236
0
    }
3237
3238
0
    if (len == 0 || !(seq = malloc(len))) {
3239
0
        return NULL;
3240
0
    }
3241
3242
0
    if (len != bgzf_read(fp, seq, len)) {
3243
0
        perror("bgzf_read() on reference file");
3244
0
        free(seq);
3245
0
        return NULL;
3246
0
    }
3247
3248
    /* Strip white-space if required. */
3249
0
    if (len != end-start+1) {
3250
0
        int i, j;
3251
0
        char *cp = seq;
3252
0
        char *cp_to;
3253
3254
0
        for (i = j = 0; i < len; i++) {
3255
0
            if (cp[i] >= '!' && cp[i] <= '~')
3256
0
                cp[j++] = toupper_c(cp[i]);
3257
0
        }
3258
0
        cp_to = cp+j;
3259
3260
0
        if (cp_to - seq != end-start+1) {
3261
0
            hts_log_error("Malformed reference file");
3262
0
            free(seq);
3263
0
            return NULL;
3264
0
        }
3265
0
    } else {
3266
0
        int i;
3267
0
        for (i = 0; i < len; i++) {
3268
0
            seq[i] = toupper_c(seq[i]);
3269
0
        }
3270
0
    }
3271
3272
0
    return seq;
3273
0
}
3274
3275
/*
3276
 * Load the entire reference 'id'.
3277
 * This also increments the reference count by 1.
3278
 *
3279
 * Returns ref_entry on success;
3280
 *         NULL on failure
3281
 */
3282
0
ref_entry *cram_ref_load(refs_t *r, int id, int is_md5) {
3283
0
    ref_entry *e = r->ref_id[id];
3284
0
    int start = 1, end = e->length;
3285
0
    char *seq;
3286
3287
0
    if (e->seq) {
3288
0
        return e;
3289
0
    }
3290
3291
0
    assert(e->count == 0);
3292
3293
0
    if (r->last) {
3294
#ifdef REF_DEBUG
3295
        int idx = 0;
3296
        for (idx = 0; idx < r->nref; idx++)
3297
            if (r->last == r->ref_id[idx])
3298
                break;
3299
        RP("%d cram_ref_load DECR %d\n", gettid(), idx);
3300
#endif
3301
0
        assert(r->last->count > 0);
3302
0
        if (--r->last->count <= 0) {
3303
0
            RP("%d FREE REF %d (%p)\n", gettid(), id, r->ref_id[id]->seq);
3304
0
            if (r->last->seq)
3305
0
                ref_entry_free_seq(r->last);
3306
0
        }
3307
0
    }
3308
3309
0
    if (!r->fn)
3310
0
        return NULL;
3311
3312
    /* Open file if it's not already the current open reference */
3313
0
    if (strcmp(r->fn, e->fn) || r->fp == NULL) {
3314
0
        if (r->fp)
3315
0
            if (bgzf_close(r->fp) != 0)
3316
0
                return NULL;
3317
0
        r->fn = e->fn;
3318
0
        if (!(r->fp = bgzf_open_ref(r->fn, "r", is_md5)))
3319
0
            return NULL;
3320
0
    }
3321
3322
0
    RP("%d Loading ref %d (%d..%d)\n", gettid(), id, start, end);
3323
3324
0
    if (!(seq = load_ref_portion(r->fp, e, start, end))) {
3325
0
        return NULL;
3326
0
    }
3327
3328
0
    RP("%d Loaded ref %d (%d..%d) = %p\n", gettid(), id, start, end, seq);
3329
3330
0
    RP("%d INC REF %d, %"PRId64"\n", gettid(), id, (e->count+1));
3331
0
    e->seq = seq;
3332
0
    e->mf = NULL;
3333
0
    e->count++;
3334
3335
    /*
3336
     * Also keep track of last used ref so incr/decr loops on the same
3337
     * sequence don't cause load/free loops.
3338
     */
3339
0
    RP("%d cram_ref_load INCR %d => %"PRId64"\n", gettid(), id, e->count+1);
3340
0
    r->last = e;
3341
0
    e->count++;
3342
3343
0
    return e;
3344
0
}
3345
3346
/*
3347
 * Returns a portion of a reference sequence from start to end inclusive.
3348
 * The returned pointer is owned by either the cram_file fd or by the
3349
 * internal refs_t structure and should not be freed  by the caller.
3350
 *
3351
 * The difference is whether or not this refs_t is in use by just the one
3352
 * cram_fd or by multiples, or whether we have multiple threads accessing
3353
 * references. In either case fd->shared will be true and we start using
3354
 * reference counting to track the number of users of a specific reference
3355
 * sequence.
3356
 *
3357
 * Otherwise the ref seq returned is allocated as part of cram_fd itself
3358
 * and will be freed up on the next call to cram_get_ref or cram_close.
3359
 *
3360
 * To return the entire reference sequence, specify start as 1 and end
3361
 * as 0.
3362
 *
3363
 * To cease using a reference, call cram_ref_decr().
3364
 *
3365
 * Returns reference on success,
3366
 *         NULL on failure
3367
 */
3368
0
char *cram_get_ref(cram_fd *fd, int id, int start, int end) {
3369
0
    ref_entry *r;
3370
0
    char *seq;
3371
0
    int ostart = start;
3372
3373
0
    if (id == -1 || start < 1)
3374
0
        return NULL;
3375
3376
    /* FIXME: axiomatic query of r->seq being true?
3377
     * Or shortcut for unsorted data where we load once and never free?
3378
     */
3379
3380
    //fd->shared_ref = 1; // hard code for now to simplify things
3381
3382
0
    pthread_mutex_lock(&fd->ref_lock);
3383
3384
0
    RP("%d cram_get_ref on fd %p, id %d, range %d..%d\n", gettid(), fd, id, start, end);
3385
3386
    /*
3387
     * Unsorted data implies we want to fetch an entire reference at a time.
3388
     * We just deal with this at the moment by claiming we're sharing
3389
     * references instead, which has the same requirement.
3390
     */
3391
0
    if (fd->unsorted)
3392
0
        fd->shared_ref = 1;
3393
3394
3395
    /* Sanity checking: does this ID exist? */
3396
0
    if (id >= fd->refs->nref) {
3397
0
        hts_log_error("No reference found for id %d", id);
3398
0
        pthread_mutex_unlock(&fd->ref_lock);
3399
0
        return NULL;
3400
0
    }
3401
3402
0
    if (!fd->refs || !fd->refs->ref_id[id]) {
3403
0
        hts_log_error("No reference found for id %d", id);
3404
0
        pthread_mutex_unlock(&fd->ref_lock);
3405
0
        return NULL;
3406
0
    }
3407
3408
0
    if (!(r = fd->refs->ref_id[id])) {
3409
0
        hts_log_error("No reference found for id %d", id);
3410
0
        pthread_mutex_unlock(&fd->ref_lock);
3411
0
        return NULL;
3412
0
    }
3413
3414
3415
    /*
3416
     * It has an entry, but may not have been populated yet.
3417
     * Any manually loaded .fai files have their lengths known.
3418
     * A ref entry computed from @SQ lines (M5 or UR field) will have
3419
     * r->length == 0 unless it's been loaded once and verified that we have
3420
     * an on-disk filename for it.
3421
     *
3422
     * 19 Sep 2013: Moved the lock here as the cram_populate_ref code calls
3423
     * open_path_mfile and libcurl, which isn't multi-thread safe unless I
3424
     * rewrite my code to have one curl handle per thread.
3425
     */
3426
0
    pthread_mutex_lock(&fd->refs->lock);
3427
0
    if (r->length == 0) {
3428
0
        if (fd->ref_fn)
3429
0
            hts_log_warning("Reference file given, but ref '%s' not present",
3430
0
                            r->name);
3431
0
        if (cram_populate_ref(fd, id, r) == -1) {
3432
0
            hts_log_error("Failed to populate reference for id %d", id);
3433
0
            pthread_mutex_unlock(&fd->refs->lock);
3434
0
            pthread_mutex_unlock(&fd->ref_lock);
3435
0
            return NULL;
3436
0
        }
3437
0
        r = fd->refs->ref_id[id];
3438
0
        if (fd->unsorted)
3439
0
            cram_ref_incr_locked(fd->refs, id);
3440
0
    }
3441
3442
3443
    /*
3444
     * We now know that we the filename containing the reference, so check
3445
     * for limits. If it's over half the reference we'll load all of it in
3446
     * memory as this will speed up subsequent calls.
3447
     */
3448
0
    if (end < 1)
3449
0
        end = r->length;
3450
0
    if (end >= r->length)
3451
0
        end  = r->length;
3452
3453
0
    if (end - start >= 0.5*r->length || fd->shared_ref) {
3454
0
        start = 1;
3455
0
        end = r->length;
3456
0
    }
3457
3458
    /*
3459
     * Maybe we have it cached already? If so use it.
3460
     *
3461
     * Alternatively if we don't have the sequence but we're sharing
3462
     * references and/or are asking for the entire length of it, then
3463
     * load the full reference into the refs structure and return
3464
     * a pointer to that one instead.
3465
     */
3466
0
    if (fd->shared_ref || r->seq || (start == 1 && end == r->length)) {
3467
0
        char *cp;
3468
3469
0
        if (id >= 0) {
3470
0
            if (r->seq) {
3471
0
                cram_ref_incr_locked(fd->refs, id);
3472
0
            } else {
3473
0
                ref_entry *e;
3474
0
                if (!(e = cram_ref_load(fd->refs, id, r->is_md5))) {
3475
0
                    pthread_mutex_unlock(&fd->refs->lock);
3476
0
                    pthread_mutex_unlock(&fd->ref_lock);
3477
0
                    return NULL;
3478
0
                }
3479
3480
                /* unsorted data implies cache ref indefinitely, to avoid
3481
                 * continually loading and unloading.
3482
                 */
3483
0
                if (fd->unsorted)
3484
0
                    cram_ref_incr_locked(fd->refs, id);
3485
0
            }
3486
3487
0
            fd->ref = NULL; /* We never access it directly */
3488
0
            fd->ref_start = 1;
3489
0
            fd->ref_end   = r->length;
3490
0
            fd->ref_id    = id;
3491
3492
0
            cp = fd->refs->ref_id[id]->seq + ostart-1;
3493
0
        } else {
3494
0
            fd->ref = NULL;
3495
0
            cp = NULL;
3496
0
        }
3497
3498
0
        RP("%d cram_get_ref returning for id %d, count %d\n", gettid(), id, (int)r->count);
3499
3500
0
        pthread_mutex_unlock(&fd->refs->lock);
3501
0
        pthread_mutex_unlock(&fd->ref_lock);
3502
0
        return cp;
3503
0
    }
3504
3505
    /*
3506
     * Otherwise we're not sharing, we don't have a copy of it already and
3507
     * we're only asking for a small portion of it.
3508
     *
3509
     * In this case load up just that segment ourselves, freeing any old
3510
     * small segments in the process.
3511
     */
3512
3513
    /* Unmapped ref ID */
3514
0
    if (id < 0 || !fd->refs->fn) {
3515
0
        if (fd->ref_free) {
3516
0
            free(fd->ref_free);
3517
0
            fd->ref_free = NULL;
3518
0
        }
3519
0
        fd->ref = NULL;
3520
0
        fd->ref_id = id;
3521
0
        pthread_mutex_unlock(&fd->refs->lock);
3522
0
        pthread_mutex_unlock(&fd->ref_lock);
3523
0
        return NULL;
3524
0
    }
3525
3526
    /* Open file if it's not already the current open reference */
3527
0
    if (strcmp(fd->refs->fn, r->fn) || fd->refs->fp == NULL) {
3528
0
        if (fd->refs->fp)
3529
0
            if (bgzf_close(fd->refs->fp) != 0)
3530
0
                return NULL;
3531
0
        fd->refs->fn = r->fn;
3532
0
        if (!(fd->refs->fp = bgzf_open_ref(fd->refs->fn, "r", r->is_md5))) {
3533
0
            pthread_mutex_unlock(&fd->refs->lock);
3534
0
            pthread_mutex_unlock(&fd->ref_lock);
3535
0
            return NULL;
3536
0
        }
3537
0
    }
3538
3539
0
    if (!(fd->ref = load_ref_portion(fd->refs->fp, r, start, end))) {
3540
0
        pthread_mutex_unlock(&fd->refs->lock);
3541
0
        pthread_mutex_unlock(&fd->ref_lock);
3542
0
        return NULL;
3543
0
    }
3544
3545
0
    if (fd->ref_free)
3546
0
        free(fd->ref_free);
3547
3548
0
    fd->ref_id    = id;
3549
0
    fd->ref_start = start;
3550
0
    fd->ref_end   = end;
3551
0
    fd->ref_free = fd->ref;
3552
0
    seq = fd->ref;
3553
3554
0
    pthread_mutex_unlock(&fd->refs->lock);
3555
0
    pthread_mutex_unlock(&fd->ref_lock);
3556
3557
0
    return seq ? seq + ostart - start : NULL;
3558
0
}
3559
3560
/*
3561
 * If fd has been opened for reading, it may be permitted to specify 'fn'
3562
 * as NULL and let the code auto-detect the reference by parsing the
3563
 * SAM header @SQ lines.
3564
 */
3565
0
int cram_load_reference(cram_fd *fd, char *fn) {
3566
0
    int ret = 0;
3567
3568
0
    if (fn) {
3569
0
        fd->refs = refs_load_fai(fd->refs, fn,
3570
0
                                 !(fd->embed_ref>0 && fd->mode == 'r'));
3571
0
        fn = fd->refs ? fd->refs->fn : NULL;
3572
0
        if (!fn)
3573
0
            ret = -1;
3574
0
        sanitise_SQ_lines(fd);
3575
0
    }
3576
0
    fd->ref_fn = fn;
3577
3578
0
    if ((!fd->refs || (fd->refs->nref == 0 && !fn)) && fd->header) {
3579
0
        if (fd->refs)
3580
0
            refs_free(fd->refs);
3581
0
        if (!(fd->refs = refs_create()))
3582
0
            return -1;
3583
0
        if (-1 == refs_from_header(fd))
3584
0
            return -1;
3585
0
    }
3586
3587
0
    if (fd->header)
3588
0
        if (-1 == refs2id(fd->refs, fd->header))
3589
0
            return -1;
3590
3591
0
    return ret;
3592
0
}
3593
3594
/* ----------------------------------------------------------------------
3595
 * Containers
3596
 */
3597
3598
/*
3599
 * Creates a new container, specifying the maximum number of slices
3600
 * and records permitted.
3601
 *
3602
 * Returns cram_container ptr on success
3603
 *         NULL on failure
3604
 */
3605
0
cram_container *cram_new_container(int nrec, int nslice) {
3606
0
    cram_container *c = calloc(1, sizeof(*c));
3607
0
    enum cram_DS_ID id;
3608
3609
0
    if (!c)
3610
0
        return NULL;
3611
3612
0
    c->curr_ref = -2;
3613
3614
0
    c->max_c_rec = nrec * nslice;
3615
0
    c->curr_c_rec = 0;
3616
3617
0
    c->max_rec = nrec;
3618
0
    c->record_counter = 0;
3619
0
    c->num_bases = 0;
3620
0
    c->s_num_bases = 0;
3621
3622
0
    c->max_slice = nslice;
3623
0
    c->curr_slice = 0;
3624
3625
0
    c->pos_sorted = 1;
3626
0
    c->max_apos   = 0;
3627
0
    c->multi_seq  = 0;
3628
0
    c->qs_seq_orient = 1;
3629
3630
0
    c->bams = NULL;
3631
3632
0
    if (!(c->slices = calloc(nslice != 0 ? nslice : 1, sizeof(cram_slice *))))
3633
0
        goto err;
3634
0
    c->slice = NULL;
3635
3636
0
    if (!(c->comp_hdr = cram_new_compression_header()))
3637
0
        goto err;
3638
0
    c->comp_hdr_block = NULL;
3639
3640
0
    for (id = DS_RN; id < DS_TN; id++)
3641
0
        if (!(c->stats[id] = cram_stats_create())) goto err;
3642
3643
    //c->aux_B_stats = cram_stats_create();
3644
3645
0
    if (!(c->tags_used = kh_init(m_tagmap)))
3646
0
        goto err;
3647
0
    c->refs_used = 0;
3648
0
    c->ref_free = 0;
3649
3650
0
    return c;
3651
3652
0
 err:
3653
0
    if (c) {
3654
0
        if (c->slices)
3655
0
            free(c->slices);
3656
0
        free(c);
3657
0
    }
3658
0
    return NULL;
3659
0
}
3660
3661
3.94k
void cram_free_container(cram_container *c) {
3662
3.94k
    enum cram_DS_ID id;
3663
3.94k
    int i;
3664
3665
3.94k
    if (!c)
3666
0
        return;
3667
3668
3.94k
    if (c->refs_used)
3669
0
        free(c->refs_used);
3670
3671
3.94k
    if (c->landmark)
3672
222
        free(c->landmark);
3673
3674
3.94k
    if (c->comp_hdr)
3675
441
        cram_free_compression_header(c->comp_hdr);
3676
3677
3.94k
    if (c->comp_hdr_block)
3678
640
        cram_free_block(c->comp_hdr_block);
3679
3680
    // Free the slices; filled out by encoder only
3681
3.94k
    if (c->slices) {
3682
0
        for (i = 0; i < c->max_slice; i++) {
3683
0
            if (c->slices[i])
3684
0
                cram_free_slice(c->slices[i]);
3685
0
            if (c->slices[i] == c->slice)
3686
0
                c->slice = NULL;
3687
0
        }
3688
0
        free(c->slices);
3689
0
    }
3690
3691
    // Free the current slice; set by both encoder & decoder
3692
3.94k
    if (c->slice) {
3693
0
        cram_free_slice(c->slice);
3694
0
        c->slice = NULL;
3695
0
    }
3696
3697
114k
    for (id = DS_RN; id < DS_TN; id++)
3698
110k
        if (c->stats[id]) cram_stats_free(c->stats[id]);
3699
3700
    //if (c->aux_B_stats) cram_stats_free(c->aux_B_stats);
3701
3702
3.94k
    if (c->tags_used) {
3703
0
        khint_t k;
3704
3705
0
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
3706
0
            if (!kh_exist(c->tags_used, k))
3707
0
                continue;
3708
3709
0
            cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3710
0
            if (tm) {
3711
0
                cram_codec *c = tm->codec;
3712
3713
0
                if (c) c->free(c);
3714
0
                free(tm);
3715
0
            }
3716
0
        }
3717
3718
0
        kh_destroy(m_tagmap, c->tags_used);
3719
0
    }
3720
3721
3.94k
    if (c->ref_free)
3722
0
        free(c->ref);
3723
3724
3.94k
    free(c);
3725
3.94k
}
3726
3727
/*
3728
 * Reads a container header.
3729
 *
3730
 * Returns cram_container on success
3731
 *         NULL on failure or no container left (fd->err == 0).
3732
 */
3733
3.95k
cram_container *cram_read_container(cram_fd *fd) {
3734
3.95k
    cram_container c2, *c;
3735
3.95k
    int i, s;
3736
3.95k
    size_t rd = 0;
3737
3.95k
    uint32_t crc = 0;
3738
3739
3.95k
    fd->err = 0;
3740
3.95k
    fd->eof = 0;
3741
3742
3.95k
    memset(&c2, 0, sizeof(c2));
3743
3.95k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3744
2.63k
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) {
3745
8
            fd->eof = fd->empty_container ? 1 : 2;
3746
8
            return NULL;
3747
2.62k
        } else {
3748
2.62k
            rd+=s;
3749
2.62k
        }
3750
2.63k
    } else if (CRAM_MAJOR_VERS(fd->version) < 4) {
3751
1.31k
        uint32_t len;
3752
1.31k
        if ((s = int32_decode(fd, &c2.length)) == -1) {
3753
0
            if (CRAM_MAJOR_VERS(fd->version) == 2 &&
3754
0
                CRAM_MINOR_VERS(fd->version) == 0)
3755
0
                fd->eof = 1; // EOF blocks arrived in v2.1
3756
0
            else
3757
0
                fd->eof = fd->empty_container ? 1 : 2;
3758
0
            return NULL;
3759
1.31k
        } else {
3760
1.31k
            rd+=s;
3761
1.31k
        }
3762
1.31k
        len = le_int4(c2.length);
3763
1.31k
        crc = crc32(0L, (unsigned char *)&len, 4);
3764
1.31k
    } else {
3765
3
        if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc))   == -1) {
3766
0
            fd->eof = fd->empty_container ? 1 : 2;
3767
0
            return NULL;
3768
3
        } else {
3769
3
            rd+=s;
3770
3
        }
3771
3
    }
3772
3.94k
    if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc))   == -1) return NULL; else rd+=s;
3773
3.94k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3774
3
        int64_t i64;
3775
3
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s;
3776
3
        c2.ref_seq_start = i64;
3777
3
        if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s;
3778
3
        c2.ref_seq_span = i64;
3779
3.94k
    } else {
3780
3.94k
        int32_t i32;
3781
3.94k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s;
3782
3.94k
        c2.ref_seq_start = i32;
3783
3.94k
        if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s;
3784
3.94k
        c2.ref_seq_span = i32;
3785
3.94k
    }
3786
3.94k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc))  == -1) return NULL; else rd+=s;
3787
3788
3.94k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3789
2.62k
        c2.record_counter = 0;
3790
2.62k
        c2.num_bases = 0;
3791
2.62k
    } else {
3792
1.32k
        if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3793
3
            if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1)
3794
0
                return NULL;
3795
3
            else
3796
3
                rd += s;
3797
1.31k
        } else {
3798
1.31k
            int32_t i32;
3799
1.31k
            if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1)
3800
0
                return NULL;
3801
1.31k
            else
3802
1.31k
                rd += s;
3803
1.31k
            c2.record_counter = i32;
3804
1.31k
        }
3805
3806
1.32k
        if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1)
3807
0
            return NULL;
3808
1.32k
        else
3809
1.32k
            rd += s;
3810
1.32k
    }
3811
3.94k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc))   == -1)
3812
2
        return NULL;
3813
3.94k
    else
3814
3.94k
        rd+=s;
3815
3.94k
    if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1)
3816
0
        return NULL;
3817
3.94k
    else
3818
3.94k
        rd+=s;
3819
3820
3.94k
    if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t))
3821
1
        return NULL;
3822
3823
3.94k
    if (!(c = calloc(1, sizeof(*c))))
3824
0
        return NULL;
3825
3826
3.94k
    *c = c2;
3827
3828
3.94k
    if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) {
3829
0
        fd->err = errno;
3830
0
        cram_free_container(c);
3831
0
        return NULL;
3832
0
    }
3833
511k
    for (i = 0; i < c->num_landmarks; i++) {
3834
508k
        if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) {
3835
12
            cram_free_container(c);
3836
12
            return NULL;
3837
508k
        } else {
3838
508k
            rd += s;
3839
508k
        }
3840
508k
    }
3841
3842
3.93k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3843
2
        if (-1 == int32_decode(fd, (int32_t *)&c->crc32)) {
3844
0
            cram_free_container(c);
3845
0
            return NULL;
3846
2
        } else {
3847
2
            rd+=4;
3848
2
        }
3849
3850
2
        if (crc != c->crc32) {
3851
2
            hts_log_error("Container header CRC32 failure");
3852
2
            cram_free_container(c);
3853
2
            return NULL;
3854
2
        }
3855
2
    }
3856
3857
3.92k
    c->offset = rd;
3858
3.92k
    c->slices = NULL;
3859
3.92k
    c->slice = NULL;
3860
3.92k
    c->curr_slice = 0;
3861
3.92k
    c->max_slice = c->num_landmarks;
3862
3.92k
    c->slice_rec = 0;
3863
3.92k
    c->curr_rec = 0;
3864
3.92k
    c->max_rec = 0;
3865
3866
3.92k
    if (c->ref_seq_id == -2) {
3867
0
        c->multi_seq = 1;
3868
0
        fd->multi_seq = 1;
3869
0
    }
3870
3871
3.92k
    fd->empty_container =
3872
3.92k
        (c->num_records == 0 &&
3873
3.92k
         c->ref_seq_id == -1 &&
3874
3.92k
         c->ref_seq_start == 0x454f46 /* EOF */) ? 1 : 0;
3875
3876
3.92k
    return c;
3877
3.93k
}
3878
3879
3880
/* MAXIMUM storage size needed for the container. */
3881
0
int cram_container_size(cram_container *c) {
3882
0
    return 55 + 5*c->num_landmarks;
3883
0
}
3884
3885
3886
/*
3887
 * Stores the container structure in dat and returns *size as the
3888
 * number of bytes written to dat[].  The input size of dat is also
3889
 * held in *size and should be initialised to cram_container_size(c).
3890
 *
3891
 * Returns 0 on success;
3892
 *        -1 on failure
3893
 */
3894
int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size)
3895
0
{
3896
0
    char *cp = (char *)dat;
3897
0
    int i;
3898
3899
    // Check the input buffer is large enough according to our stated
3900
    // requirements. (NOTE: it may actually take less.)
3901
0
    if (cram_container_size(c) > *size)
3902
0
        return -1;
3903
3904
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3905
0
        cp += itf8_put(cp, c->length);
3906
0
    } else {
3907
0
        *(int32_t *)cp = le_int4(c->length);
3908
0
        cp += 4;
3909
0
    }
3910
0
    if (c->multi_seq) {
3911
0
        cp += fd->vv.varint_put32(cp, NULL, -2);
3912
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3913
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3914
0
    } else {
3915
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3916
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3917
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3918
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3919
0
        } else {
3920
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3921
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3922
0
        }
3923
0
    }
3924
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3925
0
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
3926
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3927
0
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3928
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3929
0
    }
3930
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3931
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3932
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3933
0
    for (i = 0; i < c->num_landmarks; i++)
3934
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
3935
3936
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
3937
0
        c->crc32 = crc32(0L, (uc *)dat, cp-dat);
3938
0
        cp[0] =  c->crc32        & 0xff;
3939
0
        cp[1] = (c->crc32 >>  8) & 0xff;
3940
0
        cp[2] = (c->crc32 >> 16) & 0xff;
3941
0
        cp[3] = (c->crc32 >> 24) & 0xff;
3942
0
        cp += 4;
3943
0
    }
3944
3945
0
    *size = cp-dat; // actual used size
3946
3947
0
    return 0;
3948
0
}
3949
3950
3951
/*
3952
 * Writes a container structure.
3953
 *
3954
 * Returns 0 on success
3955
 *        -1 on failure
3956
 */
3957
0
int cram_write_container(cram_fd *fd, cram_container *c) {
3958
0
    char buf_a[1024], *buf = buf_a, *cp;
3959
0
    int i;
3960
3961
0
    if (61 + c->num_landmarks * 10 >= 1024) {
3962
0
        buf = malloc(61 + c->num_landmarks * 10);
3963
0
        if (!buf)
3964
0
            return -1;
3965
0
    }
3966
0
    cp = buf;
3967
3968
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
3969
0
        cp += itf8_put(cp, c->length);
3970
0
    } else if (CRAM_MAJOR_VERS(fd->version) <= 3) {
3971
0
        *(int32_t *)cp = le_int4(c->length);
3972
0
        cp += 4;
3973
0
    } else {
3974
0
        cp += fd->vv.varint_put32(cp, NULL, c->length);
3975
0
    }
3976
0
    if (c->multi_seq) {
3977
0
        cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2);
3978
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3979
0
        cp += fd->vv.varint_put32(cp, NULL, 0);
3980
0
    } else {
3981
0
        cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id);
3982
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3983
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start);
3984
0
            cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span);
3985
0
        } else {
3986
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start);
3987
0
            cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span);
3988
0
        }
3989
0
    }
3990
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_records);
3991
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3)
3992
0
        cp += fd->vv.varint_put64(cp, NULL, c->record_counter);
3993
0
    else
3994
0
        cp += fd->vv.varint_put32(cp, NULL, c->record_counter);
3995
0
    cp += fd->vv.varint_put64(cp, NULL, c->num_bases);
3996
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_blocks);
3997
0
    cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks);
3998
0
    for (i = 0; i < c->num_landmarks; i++)
3999
0
        cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]);
4000
4001
0
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
4002
0
        c->crc32 = crc32(0L, (uc *)buf, cp-buf);
4003
0
        cp[0] =  c->crc32        & 0xff;
4004
0
        cp[1] = (c->crc32 >>  8) & 0xff;
4005
0
        cp[2] = (c->crc32 >> 16) & 0xff;
4006
0
        cp[3] = (c->crc32 >> 24) & 0xff;
4007
0
        cp += 4;
4008
0
    }
4009
4010
0
    if (cp-buf != hwrite(fd->fp, buf, cp-buf)) {
4011
0
        if (buf != buf_a)
4012
0
            free(buf);
4013
0
        return -1;
4014
0
    }
4015
4016
0
    if (buf != buf_a)
4017
0
        free(buf);
4018
4019
0
    return 0;
4020
0
}
4021
4022
// common component shared by cram_flush_container{,_mt}
4023
0
static int cram_flush_container2(cram_fd *fd, cram_container *c) {
4024
0
    int i, j;
4025
4026
0
    if (c->curr_slice > 0 && !c->slices)
4027
0
        return -1;
4028
4029
    //fprintf(stderr, "Writing container %d, sum %u\n", c->record_counter, sum);
4030
4031
0
    off_t c_offset = htell(fd->fp); // File offset of container
4032
4033
    /* Write the container struct itself */
4034
0
    if (0 != cram_write_container(fd, c))
4035
0
        return -1;
4036
4037
0
    off_t hdr_size = htell(fd->fp) - c_offset;
4038
4039
    /* And the compression header */
4040
0
    if (0 != cram_write_block(fd, c->comp_hdr_block))
4041
0
        return -1;
4042
4043
    /* Followed by the slice blocks */
4044
0
    off_t file_offset = htell(fd->fp);
4045
0
    for (i = 0; i < c->curr_slice; i++) {
4046
0
        cram_slice *s = c->slices[i];
4047
0
        off_t spos = file_offset - c_offset - hdr_size;
4048
4049
0
        if (0 != cram_write_block(fd, s->hdr_block))
4050
0
            return -1;
4051
4052
0
        for (j = 0; j < s->hdr->num_blocks; j++) {
4053
0
            if (0 != cram_write_block(fd, s->block[j]))
4054
0
                return -1;
4055
0
        }
4056
4057
0
        file_offset = htell(fd->fp);
4058
0
        off_t sz = file_offset - c_offset - hdr_size - spos;
4059
4060
0
        if (fd->idxfp) {
4061
0
            if (cram_index_slice(fd, c, s, fd->idxfp, c_offset, spos, sz) < 0)
4062
0
                return -1;
4063
0
        }
4064
0
    }
4065
4066
0
    return 0;
4067
0
}
4068
4069
/*
4070
 * Flushes a completely or partially full container to disk, writing
4071
 * container structure, header and blocks. This also calls the encoder
4072
 * functions.
4073
 *
4074
 * Returns 0 on success
4075
 *        -1 on failure
4076
 */
4077
0
int cram_flush_container(cram_fd *fd, cram_container *c) {
4078
    /* Encode the container blocks and generate compression header */
4079
0
    if (0 != cram_encode_container(fd, c))
4080
0
        return -1;
4081
4082
0
    return cram_flush_container2(fd, c);
4083
0
}
4084
4085
typedef struct {
4086
    cram_fd *fd;
4087
    cram_container *c;
4088
} cram_job;
4089
4090
0
void *cram_flush_thread(void *arg) {
4091
0
    cram_job *j = (cram_job *)arg;
4092
4093
    /* Encode the container blocks and generate compression header */
4094
0
    if (0 != cram_encode_container(j->fd, j->c)) {
4095
0
        hts_log_error("Call to cram_encode_container failed");
4096
0
        return NULL;
4097
0
    }
4098
4099
0
    return arg;
4100
0
}
4101
4102
0
static int cram_flush_result(cram_fd *fd) {
4103
0
    int i, ret = 0;
4104
0
    hts_tpool_result *r;
4105
0
    cram_container *lc = NULL;
4106
4107
    // NB: we can have one result per slice, not per container,
4108
    // so we need to free the container only after all slices
4109
    // within it have been freed.  (Automatic via reference counting.)
4110
0
    while ((r = hts_tpool_next_result(fd->rqueue))) {
4111
0
        cram_job *j = (cram_job *)hts_tpool_result_data(r);
4112
0
        cram_container *c;
4113
4114
0
        if (!j) {
4115
0
            hts_tpool_delete_result(r, 0);
4116
0
            return -1;
4117
0
        }
4118
4119
0
        fd = j->fd;
4120
0
        c = j->c;
4121
4122
0
        if (fd->mode == 'w')
4123
0
            if (0 != cram_flush_container2(fd, c))
4124
0
                return -1;
4125
4126
        // Free the slices; filled out by encoder only
4127
0
        if (c->slices) {
4128
0
            for (i = 0; i < c->max_slice; i++) {
4129
0
                if (c->slices[i])
4130
0
                    cram_free_slice(c->slices[i]);
4131
0
                if (c->slices[i] == c->slice)
4132
0
                    c->slice = NULL;
4133
0
                c->slices[i] = NULL;
4134
0
            }
4135
0
        }
4136
4137
        // Free the current slice; set by both encoder & decoder
4138
0
        if (c->slice) {
4139
0
            cram_free_slice(c->slice);
4140
0
            c->slice = NULL;
4141
0
        }
4142
0
        c->curr_slice = 0;
4143
4144
        // Our jobs will be in order, so we free the last
4145
        // container when our job has switched to a new one.
4146
0
        if (c != lc) {
4147
0
            if (lc) {
4148
0
                if (fd->ctr == lc)
4149
0
                    fd->ctr = NULL;
4150
0
                if (fd->ctr_mt == lc)
4151
0
                    fd->ctr_mt = NULL;
4152
0
                cram_free_container(lc);
4153
0
            }
4154
0
            lc = c;
4155
0
        }
4156
4157
0
        hts_tpool_delete_result(r, 1);
4158
0
    }
4159
0
    if (lc) {
4160
0
        if (fd->ctr == lc)
4161
0
            fd->ctr = NULL;
4162
0
        if (fd->ctr_mt == lc)
4163
0
            fd->ctr_mt = NULL;
4164
0
        cram_free_container(lc);
4165
0
    }
4166
4167
0
    return ret;
4168
0
}
4169
4170
// Note: called while metrics_lock is held.
4171
// Will be left in this state too, but may temporarily unlock.
4172
0
void reset_metrics(cram_fd *fd) {
4173
0
    int i;
4174
4175
0
    if (fd->pool) {
4176
        // If multi-threaded we have multiple blocks being
4177
        // compressed already and several on the to-do list
4178
        // (fd->rqueue->pending).  It's tricky to reset the
4179
        // metrics exactly the correct point, so instead we
4180
        // just flush the pool, reset, and then continue again.
4181
4182
        // Don't bother starting a new trial before then though.
4183
0
        for (i = 0; i < DS_END; i++) {
4184
0
            cram_metrics *m = fd->m[i];
4185
0
            if (!m)
4186
0
                continue;
4187
0
            m->next_trial = 999;
4188
0
        }
4189
4190
0
        pthread_mutex_unlock(&fd->metrics_lock);
4191
0
        hts_tpool_process_flush(fd->rqueue);
4192
0
        pthread_mutex_lock(&fd->metrics_lock);
4193
0
    }
4194
4195
0
    for (i = 0; i < DS_END; i++) {
4196
0
        cram_metrics *m = fd->m[i];
4197
0
        if (!m)
4198
0
            continue;
4199
4200
0
        m->trial = NTRIALS;
4201
0
        m->next_trial = TRIAL_SPAN;
4202
0
        m->revised_method = 0;
4203
0
        m->unpackable = 0;
4204
4205
0
        memset(m->sz, 0, sizeof(m->sz));
4206
0
    }
4207
0
}
4208
4209
0
int cram_flush_container_mt(cram_fd *fd, cram_container *c) {
4210
0
    cram_job *j;
4211
4212
    // At the junction of mapped to unmapped data the compression
4213
    // methods may need to change due to very different statistical
4214
    // properties; particularly BA if minhash sorted.
4215
    //
4216
    // However with threading we'll have several in-flight blocks
4217
    // arriving out of order.
4218
    //
4219
    // So we do one trial reset of NThreads to last for NThreads
4220
    // duration to get us over this transition period, followed
4221
    // by another retrial of the usual ntrials & trial span.
4222
0
    pthread_mutex_lock(&fd->metrics_lock);
4223
0
    if (c->n_mapped < 0.3*c->curr_rec &&
4224
0
        fd->last_mapped > 0.7*c->max_rec) {
4225
0
        reset_metrics(fd);
4226
0
    }
4227
0
    fd->last_mapped = c->n_mapped * (c->max_rec+1)/(c->curr_rec+1) ;
4228
0
    pthread_mutex_unlock(&fd->metrics_lock);
4229
4230
0
    if (!fd->pool)
4231
0
        return cram_flush_container(fd, c);
4232
4233
0
    if (!(j = malloc(sizeof(*j))))
4234
0
        return -1;
4235
0
    j->fd = fd;
4236
0
    j->c = c;
4237
4238
    // Flush the job.  Note our encoder queue may be full, so we
4239
    // either have to keep trying in non-blocking mode (what we do) or
4240
    // use a dedicated separate thread for draining the queue.
4241
0
    for (;;) {
4242
0
        errno = 0;
4243
0
        hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_flush_thread, j, 1);
4244
0
        int pending = (errno == EAGAIN);
4245
0
        if (cram_flush_result(fd) != 0)
4246
0
            return -1;
4247
0
        if (!pending)
4248
0
            break;
4249
4250
0
        usleep(1000);
4251
0
    }
4252
4253
0
    return 0;
4254
0
}
4255
4256
/* ----------------------------------------------------------------------
4257
 * Compression headers; the first part of the container
4258
 */
4259
4260
/*
4261
 * Creates a new blank container compression header
4262
 *
4263
 * Returns header ptr on success
4264
 *         NULL on failure
4265
 */
4266
0
cram_block_compression_hdr *cram_new_compression_header(void) {
4267
0
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
4268
0
    if (!hdr)
4269
0
        return NULL;
4270
4271
0
    if (!(hdr->TD_blk = cram_new_block(CORE, 0))) {
4272
0
        free(hdr);
4273
0
        return NULL;
4274
0
    }
4275
4276
0
    if (!(hdr->TD_hash = kh_init(m_s2i))) {
4277
0
        cram_free_block(hdr->TD_blk);
4278
0
        free(hdr);
4279
0
        return NULL;
4280
0
    }
4281
4282
0
    if (!(hdr->TD_keys = string_pool_create(8192))) {
4283
0
        kh_destroy(m_s2i, hdr->TD_hash);
4284
0
        cram_free_block(hdr->TD_blk);
4285
0
        free(hdr);
4286
0
        return NULL;
4287
0
    }
4288
4289
0
    return hdr;
4290
0
}
4291
4292
498
void cram_free_compression_header(cram_block_compression_hdr *hdr) {
4293
498
    int i;
4294
4295
498
    if (hdr->landmark)
4296
498
        free(hdr->landmark);
4297
4298
498
    if (hdr->preservation_map)
4299
498
        kh_destroy(map, hdr->preservation_map);
4300
4301
16.4k
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4302
15.9k
        cram_map *m, *m2;
4303
19.7k
        for (m = hdr->rec_encoding_map[i]; m; m = m2) {
4304
3.78k
            m2 = m->next;
4305
3.78k
            if (m->codec)
4306
0
                m->codec->free(m->codec);
4307
3.78k
            free(m);
4308
3.78k
        }
4309
15.9k
    }
4310
4311
16.4k
    for (i = 0; i < CRAM_MAP_HASH; i++) {
4312
15.9k
        cram_map *m, *m2;
4313
16.0k
        for (m = hdr->tag_encoding_map[i]; m; m = m2) {
4314
157
            m2 = m->next;
4315
157
            if (m->codec)
4316
157
                m->codec->free(m->codec);
4317
157
            free(m);
4318
157
        }
4319
15.9k
    }
4320
4321
23.9k
    for (i = 0; i < DS_END; i++) {
4322
23.4k
        if (hdr->codecs[i])
4323
99
            hdr->codecs[i]->free(hdr->codecs[i]);
4324
23.4k
    }
4325
4326
498
    if (hdr->TL)
4327
3
        free(hdr->TL);
4328
498
    if (hdr->TD_blk)
4329
3
        cram_free_block(hdr->TD_blk);
4330
498
    if (hdr->TD_hash)
4331
498
        kh_destroy(m_s2i, hdr->TD_hash);
4332
498
    if (hdr->TD_keys)
4333
0
        string_pool_destroy(hdr->TD_keys);
4334
4335
498
    free(hdr);
4336
498
}
4337
4338
4339
/* ----------------------------------------------------------------------
4340
 * Slices and slice headers
4341
 */
4342
4343
110
void cram_free_slice_header(cram_block_slice_hdr *hdr) {
4344
110
    if (!hdr)
4345
0
        return;
4346
4347
110
    if (hdr->block_content_ids)
4348
110
        free(hdr->block_content_ids);
4349
4350
110
    free(hdr);
4351
4352
110
    return;
4353
110
}
4354
4355
118
void cram_free_slice(cram_slice *s) {
4356
118
    if (!s)
4357
0
        return;
4358
4359
118
    if (s->hdr_block)
4360
105
        cram_free_block(s->hdr_block);
4361
4362
118
    if (s->block) {
4363
110
        int i;
4364
4365
110
        if (s->hdr) {
4366
798
            for (i = 0; i < s->hdr->num_blocks; i++) {
4367
688
                if (i > 0 && s->block[i] == s->block[0])
4368
246
                    continue;
4369
442
                cram_free_block(s->block[i]);
4370
442
            }
4371
110
        }
4372
110
        free(s->block);
4373
110
    }
4374
4375
118
    if (s->block_by_id)
4376
105
        free(s->block_by_id);
4377
4378
118
    if (s->hdr)
4379
110
        cram_free_slice_header(s->hdr);
4380
4381
118
    if (s->seqs_blk)
4382
105
        cram_free_block(s->seqs_blk);
4383
4384
118
    if (s->qual_blk)
4385
105
        cram_free_block(s->qual_blk);
4386
4387
118
    if (s->name_blk)
4388
105
        cram_free_block(s->name_blk);
4389
4390
118
    if (s->aux_blk)
4391
105
        cram_free_block(s->aux_blk);
4392
4393
118
    if (s->base_blk)
4394
105
        cram_free_block(s->base_blk);
4395
4396
118
    if (s->soft_blk)
4397
105
        cram_free_block(s->soft_blk);
4398
4399
118
    if (s->cigar)
4400
105
        free(s->cigar);
4401
4402
118
    if (s->crecs)
4403
104
        free(s->crecs);
4404
4405
118
    if (s->features)
4406
0
        free(s->features);
4407
4408
118
    if (s->TN)
4409
0
        free(s->TN);
4410
4411
118
    if (s->pair_keys)
4412
0
        string_pool_destroy(s->pair_keys);
4413
4414
118
    if (s->pair[0])
4415
118
        kh_destroy(m_s2i, s->pair[0]);
4416
118
    if (s->pair[1])
4417
118
        kh_destroy(m_s2i, s->pair[1]);
4418
4419
118
    if (s->aux_block)
4420
0
        free(s->aux_block);
4421
4422
118
    free(s);
4423
118
}
4424
4425
/*
4426
 * Creates a new empty slice in memory, for subsequent writing to
4427
 * disk.
4428
 *
4429
 * Returns cram_slice ptr on success
4430
 *         NULL on failure
4431
 */
4432
0
cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) {
4433
0
    cram_slice *s = calloc(1, sizeof(*s));
4434
0
    if (!s)
4435
0
        return NULL;
4436
4437
0
    if (!(s->hdr = (cram_block_slice_hdr *)calloc(1, sizeof(*s->hdr))))
4438
0
        goto err;
4439
0
    s->hdr->content_type = type;
4440
4441
0
    s->hdr_block = NULL;
4442
0
    s->block = NULL;
4443
0
    s->block_by_id = NULL;
4444
0
    s->last_apos = 0;
4445
0
    if (!(s->crecs = malloc(nrecs * sizeof(cram_record))))  goto err;
4446
0
    s->cigar_alloc = 1024;
4447
0
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4448
0
    s->ncigar = 0;
4449
4450
0
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))       goto err;
4451
0
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))   goto err;
4452
0
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))   goto err;
4453
0
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux)))  goto err;
4454
0
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))   goto err;
4455
0
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))   goto err;
4456
4457
0
    s->features = NULL;
4458
0
    s->nfeatures = s->afeatures = 0;
4459
4460
0
#ifndef TN_external
4461
0
    s->TN = NULL;
4462
0
    s->nTN = s->aTN = 0;
4463
0
#endif
4464
4465
    // Volatile keys as we do realloc in dstring
4466
0
    if (!(s->pair_keys = string_pool_create(8192))) goto err;
4467
0
    if (!(s->pair[0] = kh_init(m_s2i)))             goto err;
4468
0
    if (!(s->pair[1] = kh_init(m_s2i)))             goto err;
4469
4470
#ifdef BA_external
4471
    s->BA_len = 0;
4472
#endif
4473
4474
0
    return s;
4475
4476
0
 err:
4477
0
    if (s)
4478
0
        cram_free_slice(s);
4479
4480
0
    return NULL;
4481
0
}
4482
4483
/*
4484
 * Loads an entire slice.
4485
 * FIXME: In 1.0 the native unit of slices within CRAM is broken
4486
 * as slices contain references to objects in other slices.
4487
 * To work around this while keeping the slice oriented outer loop
4488
 * we read all slices and stitch them together into a fake large
4489
 * slice instead.
4490
 *
4491
 * Returns cram_slice ptr on success
4492
 *         NULL on failure
4493
 */
4494
118
cram_slice *cram_read_slice(cram_fd *fd) {
4495
118
    cram_block *b = cram_read_block(fd);
4496
118
    cram_slice *s = calloc(1, sizeof(*s));
4497
118
    int i, n, max_id, min_id;
4498
4499
118
    if (!b || !s)
4500
4
        goto err;
4501
4502
114
    s->hdr_block = b;
4503
114
    switch (b->content_type) {
4504
113
    case MAPPED_SLICE:
4505
113
    case UNMAPPED_SLICE:
4506
113
        if (!(s->hdr = cram_decode_slice_header(fd, b)))
4507
3
            goto err;
4508
110
        break;
4509
4510
110
    default:
4511
1
        hts_log_error("Unexpected block of type %s",
4512
1
                      cram_content_type2str(b->content_type));
4513
1
        goto err;
4514
114
    }
4515
4516
110
    if (s->hdr->num_blocks < 1) {
4517
0
        hts_log_error("Slice does not include any data blocks");
4518
0
        goto err;
4519
0
    }
4520
4521
110
    s->block = calloc(n = s->hdr->num_blocks, sizeof(*s->block));
4522
110
    if (!s->block)
4523
0
        goto err;
4524
4525
720
    for (max_id = i = 0, min_id = INT_MAX; i < n; i++) {
4526
615
        if (!(s->block[i] = cram_read_block(fd)))
4527
5
            goto err;
4528
4529
610
        if (s->block[i]->content_type == EXTERNAL) {
4530
241
            if (max_id < s->block[i]->content_id)
4531
5
                max_id = s->block[i]->content_id;
4532
241
            if (min_id > s->block[i]->content_id)
4533
86
                min_id = s->block[i]->content_id;
4534
241
        }
4535
610
    }
4536
4537
105
    if (!(s->block_by_id = calloc(512, sizeof(s->block[0]))))
4538
0
        goto err;
4539
4540
520
    for (i = 0; i < n; i++) {
4541
415
        if (s->block[i]->content_type != EXTERNAL)
4542
294
            continue;
4543
121
        uint32_t v = s->block[i]->content_id;
4544
121
        if (v >= 256)
4545
7
            v = 256 + v % 251;
4546
121
        s->block_by_id[v] = s->block[i];
4547
121
    }
4548
4549
    /* Initialise encoding/decoding tables */
4550
105
    s->cigar_alloc = 1024;
4551
105
    if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err;
4552
105
    s->ncigar = 0;
4553
4554
105
    if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0)))      goto err;
4555
105
    if (!(s->qual_blk = cram_new_block(EXTERNAL, DS_QS)))  goto err;
4556
105
    if (!(s->name_blk = cram_new_block(EXTERNAL, DS_RN)))  goto err;
4557
105
    if (!(s->aux_blk  = cram_new_block(EXTERNAL, DS_aux))) goto err;
4558
105
    if (!(s->base_blk = cram_new_block(EXTERNAL, DS_IN)))  goto err;
4559
105
    if (!(s->soft_blk = cram_new_block(EXTERNAL, DS_SC)))  goto err;
4560
4561
105
    s->crecs = NULL;
4562
4563
105
    s->last_apos = s->hdr->ref_seq_start;
4564
105
    s->decode_md = fd->decode_md;
4565
4566
105
    return s;
4567
4568
13
 err:
4569
13
    if (b)
4570
9
        cram_free_block(b);
4571
13
    if (s) {
4572
13
        s->hdr_block = NULL;
4573
13
        cram_free_slice(s);
4574
13
    }
4575
13
    return NULL;
4576
105
}
4577
4578
4579
/* ----------------------------------------------------------------------
4580
 * CRAM file definition (header)
4581
 */
4582
4583
/*
4584
 * Reads a CRAM file definition structure.
4585
 * Returns file_def ptr on success
4586
 *         NULL on failure
4587
 */
4588
275
cram_file_def *cram_read_file_def(cram_fd *fd) {
4589
275
    cram_file_def *def = malloc(sizeof(*def));
4590
275
    if (!def)
4591
0
        return NULL;
4592
4593
275
    if (26 != hread(fd->fp, &def->magic[0], 26)) {
4594
0
        free(def);
4595
0
        return NULL;
4596
0
    }
4597
4598
275
    if (memcmp(def->magic, "CRAM", 4) != 0) {
4599
0
        free(def);
4600
0
        return NULL;
4601
0
    }
4602
4603
275
    if (def->major_version > 4) {
4604
0
        hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d",
4605
0
                      def->major_version, def->minor_version);
4606
0
        free(def);
4607
0
        return NULL;
4608
0
    }
4609
4610
275
    fd->first_container += 26;
4611
275
    fd->curr_position = fd->first_container;
4612
275
    fd->last_slice = 0;
4613
4614
275
    return def;
4615
275
}
4616
4617
/*
4618
 * Writes a cram_file_def structure to cram_fd.
4619
 * Returns 0 on success
4620
 *        -1 on failure
4621
 */
4622
0
int cram_write_file_def(cram_fd *fd, cram_file_def *def) {
4623
0
    return (hwrite(fd->fp, &def->magic[0], 26) == 26) ? 0 : -1;
4624
0
}
4625
4626
275
void cram_free_file_def(cram_file_def *def) {
4627
275
    if (def) free(def);
4628
275
}
4629
4630
/* ----------------------------------------------------------------------
4631
 * SAM header I/O
4632
 */
4633
4634
4635
/*
4636
 * Reads the SAM header from the first CRAM data block.
4637
 * Also performs minimal parsing to extract read-group
4638
 * and sample information.
4639
4640
 * Returns SAM hdr ptr on success
4641
 *         NULL on failure
4642
 */
4643
275
sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) {
4644
275
    int32_t header_len;
4645
275
    char *header;
4646
275
    sam_hdr_t *hdr;
4647
4648
    /* 1.1 onwards stores the header in the first block of a container */
4649
275
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4650
        /* Length */
4651
241
        if (-1 == int32_decode(fd, &header_len))
4652
0
            return NULL;
4653
4654
        /* Alloc and read */
4655
241
        if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1)))
4656
1
            return NULL;
4657
4658
240
        if (header_len != hread(fd->fp, header, header_len)) {
4659
0
            free(header);
4660
0
            return NULL;
4661
0
        }
4662
240
        header[header_len] = '\0';
4663
4664
240
        fd->first_container += 4 + header_len;
4665
240
    } else {
4666
34
        cram_container *c = cram_read_container(fd);
4667
34
        cram_block *b;
4668
34
        int i;
4669
34
        int64_t len;
4670
4671
34
        if (!c)
4672
7
            return NULL;
4673
4674
27
        fd->first_container += c->length + c->offset;
4675
27
        fd->curr_position = fd->first_container;
4676
4677
27
        if (c->num_blocks < 1) {
4678
0
            cram_free_container(c);
4679
0
            return NULL;
4680
0
        }
4681
4682
27
        if (!(b = cram_read_block(fd))) {
4683
2
            cram_free_container(c);
4684
2
            return NULL;
4685
2
        }
4686
25
        if (cram_uncompress_block(b) != 0) {
4687
10
            cram_free_container(c);
4688
10
            cram_free_block(b);
4689
10
            return NULL;
4690
10
        }
4691
4692
15
        len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4693
15
            fd->vv.varint_size(b->content_id) +
4694
15
            fd->vv.varint_size(b->uncomp_size) +
4695
15
            fd->vv.varint_size(b->comp_size);
4696
4697
        /* Extract header from 1st block */
4698
15
        if (-1 == int32_get_blk(b, &header_len) ||
4699
15
            header_len < 0 || /* Spec. says signed...  why? */
4700
15
            b->uncomp_size - 4 < header_len) {
4701
2
            cram_free_container(c);
4702
2
            cram_free_block(b);
4703
2
            return NULL;
4704
2
        }
4705
13
        if (NULL == (header = malloc((size_t) header_len+1))) {
4706
0
            cram_free_container(c);
4707
0
            cram_free_block(b);
4708
0
            return NULL;
4709
0
        }
4710
13
        memcpy(header, BLOCK_END(b), header_len);
4711
13
        header[header_len] = '\0';
4712
13
        cram_free_block(b);
4713
4714
        /* Consume any remaining blocks */
4715
1.93k
        for (i = 1; i < c->num_blocks; i++) {
4716
1.92k
            if (!(b = cram_read_block(fd))) {
4717
6
                cram_free_container(c);
4718
6
                free(header);
4719
6
                return NULL;
4720
6
            }
4721
1.92k
            len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
4722
1.92k
                fd->vv.varint_size(b->content_id) +
4723
1.92k
                fd->vv.varint_size(b->uncomp_size) +
4724
1.92k
                fd->vv.varint_size(b->comp_size);
4725
1.92k
            cram_free_block(b);
4726
1.92k
        }
4727
4728
7
        if (c->length > 0 && len > 0 && c->length > len) {
4729
            // Consume padding
4730
1
            char *pads = malloc(c->length - len);
4731
1
            if (!pads) {
4732
0
                cram_free_container(c);
4733
0
                free(header);
4734
0
                return NULL;
4735
0
            }
4736
4737
1
            if (c->length - len != hread(fd->fp, pads, c->length - len)) {
4738
1
                cram_free_container(c);
4739
1
                free(header);
4740
1
                free(pads);
4741
1
                return NULL;
4742
1
            }
4743
0
            free(pads);
4744
0
        }
4745
4746
6
        cram_free_container(c);
4747
6
    }
4748
4749
    /* Parse */
4750
246
    hdr = sam_hdr_init();
4751
246
    if (!hdr) {
4752
0
        free(header);
4753
0
        return NULL;
4754
0
    }
4755
4756
246
    if (-1 == sam_hdr_add_lines(hdr, header, header_len)) {
4757
1
        free(header);
4758
1
        sam_hdr_destroy(hdr);
4759
1
        return NULL;
4760
1
    }
4761
4762
245
    hdr->l_text = header_len;
4763
245
    hdr->text = header;
4764
4765
245
    return hdr;
4766
4767
246
}
4768
4769
/*
4770
 * Converts 'in' to a full pathname to store in out.
4771
 * Out must be at least PATH_MAX bytes long.
4772
 */
4773
0
static void full_path(char *out, char *in) {
4774
0
    size_t in_l = strlen(in);
4775
0
    if (hisremote(in)) {
4776
0
        if (in_l > PATH_MAX) {
4777
0
            hts_log_error("Reference path is longer than %d", PATH_MAX);
4778
0
            return;
4779
0
        }
4780
0
        strncpy(out, in, PATH_MAX-1);
4781
0
        out[PATH_MAX-1] = 0;
4782
0
        return;
4783
0
    }
4784
0
    if (*in == '/' ||
4785
        // Windows paths
4786
0
        (in_l > 3 && toupper_c(*in) >= 'A'  && toupper_c(*in) <= 'Z' &&
4787
0
         in[1] == ':' && (in[2] == '/' || in[2] == '\\'))) {
4788
0
        strncpy(out, in, PATH_MAX-1);
4789
0
        out[PATH_MAX-1] = 0;
4790
0
    } else {
4791
0
        int len;
4792
4793
        // unable to get dir or out+in is too long
4794
0
        if (!getcwd(out, PATH_MAX) ||
4795
0
            (len = strlen(out))+1+strlen(in) >= PATH_MAX) {
4796
0
            strncpy(out, in, PATH_MAX-1);
4797
0
            out[PATH_MAX-1] = 0;
4798
0
            return;
4799
0
        }
4800
4801
0
        sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in);
4802
4803
        // FIXME: cope with `pwd`/../../../foo.fa ?
4804
0
    }
4805
0
}
4806
4807
/*
4808
 * Writes a CRAM SAM header.
4809
 * Returns 0 on success
4810
 *        -1 on failure
4811
 */
4812
0
int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) {
4813
0
    size_t header_len;
4814
0
    int blank_block = (CRAM_MAJOR_VERS(fd->version) >= 3);
4815
4816
    /* Write CRAM MAGIC if not yet written. */
4817
0
    if (fd->file_def->major_version == 0) {
4818
0
        fd->file_def->major_version = CRAM_MAJOR_VERS(fd->version);
4819
0
        fd->file_def->minor_version = CRAM_MINOR_VERS(fd->version);
4820
0
        if (0 != cram_write_file_def(fd, fd->file_def))
4821
0
            return -1;
4822
0
    }
4823
4824
    /* 1.0 requires an UNKNOWN read-group */
4825
0
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
4826
0
        if (!sam_hrecs_find_rg(hdr->hrecs, "UNKNOWN"))
4827
0
            if (sam_hdr_add_line(hdr, "RG",
4828
0
                            "ID", "UNKNOWN", "SM", "UNKNOWN", NULL))
4829
0
                return -1;
4830
0
    }
4831
4832
0
    if (-1 == refs_from_header(fd))
4833
0
        return -1;
4834
0
    if (-1 == refs2id(fd->refs, fd->header))
4835
0
        return -1;
4836
4837
    /* Fix M5 strings */
4838
0
    if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) {
4839
0
        int i;
4840
0
        for (i = 0; i < hdr->hrecs->nref; i++) {
4841
0
            sam_hrec_type_t *ty;
4842
0
            char *ref;
4843
4844
0
            if (!(ty = sam_hrecs_find_type_id(hdr->hrecs, "SQ", "SN", hdr->hrecs->ref[i].name)))
4845
0
                return -1;
4846
4847
0
            if (!sam_hrecs_find_key(ty, "M5", NULL)) {
4848
0
                char unsigned buf[16];
4849
0
                char buf2[33];
4850
0
                int rlen;
4851
0
                hts_md5_context *md5;
4852
4853
0
                if (!fd->refs ||
4854
0
                    !fd->refs->ref_id ||
4855
0
                    !fd->refs->ref_id[i]) {
4856
0
                    return -1;
4857
0
                }
4858
0
                rlen = fd->refs->ref_id[i]->length;
4859
0
                ref = cram_get_ref(fd, i, 1, rlen);
4860
0
                if (NULL == ref) {
4861
0
                    if (fd->embed_ref == -1) {
4862
                        // auto embed-ref
4863
0
                        hts_log_warning("No M5 tags present and could not "
4864
0
                                        "find reference");
4865
0
                        hts_log_warning("Enabling embed_ref=2 option");
4866
0
                        hts_log_warning("NOTE: the CRAM file will be bigger "
4867
0
                                        "than using an external reference");
4868
0
                        pthread_mutex_lock(&fd->ref_lock);
4869
0
                        fd->embed_ref = 2;
4870
0
                        pthread_mutex_unlock(&fd->ref_lock);
4871
0
                        break;
4872
0
                    }
4873
0
                    return -1;
4874
0
                }
4875
0
                <