Coverage Report

Created: 2025-12-08 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_decode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * - In-memory decoding of CRAM data structures.
33
 * - Iterator for reading CRAM record by record.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdio.h>
40
#include <errno.h>
41
#include <assert.h>
42
#include <stdlib.h>
43
#include <string.h>
44
#include <zlib.h>
45
#include <sys/types.h>
46
#include <sys/stat.h>
47
#include <math.h>
48
#include <stdint.h>
49
#include <inttypes.h>
50
51
#include "cram.h"
52
#include "os.h"
53
#include "../htslib/hts.h"
54
#include "../htslib/hfile.h"
55
56
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
57
//#define USE_X
58
59
/* ----------------------------------------------------------------------
60
 * CRAM compression headers
61
 */
62
63
/*
64
 * Decodes the Tag Dictionary record in the preservation map
65
 * Updates the cram compression header.
66
 *
67
 * Returns number of bytes decoded on success
68
 *        -1 on failure
69
 */
70
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
71
303
                   cram_block_compression_hdr *h) {
72
303
    char *op = cp;
73
303
    unsigned char *dat;
74
303
    cram_block *b;
75
303
    int32_t blk_size = 0;
76
303
    int nTL, i, sz, err = 0;
77
78
303
    if (!(b = cram_new_block(0, 0)))
79
0
        return -1;
80
81
303
    if (h->TD_blk || h->TL) {
82
174
        hts_log_warning("More than one TD block found in compression header");
83
174
        cram_free_block(h->TD_blk);
84
174
        free(h->TL);
85
174
        h->TD_blk = NULL;
86
174
        h->TL = NULL;
87
174
    }
88
89
    /* Decode */
90
303
    blk_size = fd->vv.varint_get32(&cp, endp, &err);
91
303
    if (!blk_size) {
92
69
        h->nTL = 0;
93
69
        cram_free_block(b);
94
69
        return cp - op;
95
69
    }
96
97
234
    if (err || blk_size < 0 || endp - cp < blk_size) {
98
6
        cram_free_block(b);
99
6
        return -1;
100
6
    }
101
102
228
    BLOCK_APPEND(b, cp, blk_size);
103
228
    cp += blk_size;
104
228
    sz = cp - op;
105
    // Force nul termination if missing
106
228
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
107
186
        BLOCK_APPEND_CHAR(b, '\0');
108
109
    /* Set up TL lookup table */
110
228
    dat = BLOCK_DATA(b);
111
112
    // Count
113
2.78k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
114
2.55k
        nTL++;
115
11.6k
        while (dat[i])
116
9.13k
            i++;
117
2.55k
    }
118
119
    // Copy
120
228
    if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
121
0
        cram_free_block(b);
122
0
        return -1;
123
0
    }
124
2.78k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
125
2.55k
        h->TL[nTL++] = &dat[i];
126
11.6k
        while (dat[i])
127
9.13k
            i++;
128
2.55k
    }
129
228
    h->TD_blk = b;
130
228
    h->nTL = nTL;
131
132
228
    return sz;
133
134
0
 block_err:
135
0
    cram_free_block(b);
136
0
    return -1;
137
228
}
138
139
/*
140
 * Decodes a CRAM block compression header.
141
 * Returns header ptr on success
142
 *         NULL on failure
143
 */
144
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
145
3.61k
                                                           cram_block *b) {
146
3.61k
    char *cp, *endp, *cp_copy;
147
3.61k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
148
3.61k
    int i, err = 0;
149
3.61k
    int32_t map_size = 0, map_count = 0;
150
151
3.61k
    if (!hdr)
152
0
        return NULL;
153
154
3.61k
    if (b->method != RAW) {
155
2.42k
        if (cram_uncompress_block(b)) {
156
2.04k
            free(hdr);
157
2.04k
            return NULL;
158
2.04k
        }
159
2.42k
    }
160
161
1.56k
    cp = (char *)b->data;
162
1.56k
    endp = cp + b->uncomp_size;
163
164
1.56k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
165
1.41k
        hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
166
1.41k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
167
0
            hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
168
0
            hdr->ref_seq_span  = fd->vv.varint_get64(&cp, endp, &err);
169
1.41k
        } else {
170
1.41k
            hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
171
1.41k
            hdr->ref_seq_span  = fd->vv.varint_get32(&cp, endp, &err);
172
1.41k
        }
173
1.41k
        hdr->num_records   = fd->vv.varint_get32(&cp, endp, &err);
174
1.41k
        hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
175
1.41k
        if (hdr->num_landmarks < 0 ||
176
1.40k
            hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
177
1.40k
            endp - cp < hdr->num_landmarks) {
178
42
            free(hdr);
179
42
            return NULL;
180
42
        }
181
1.37k
        if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
182
0
            free(hdr);
183
0
            return NULL;
184
0
        }
185
27.1k
        for (i = 0; i < hdr->num_landmarks; i++)
186
25.7k
            hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
187
1.37k
    }
188
189
1.52k
    hdr->preservation_map = kh_init(map);
190
191
1.52k
    memset(hdr->rec_encoding_map, 0,
192
1.52k
           CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
193
1.52k
    memset(hdr->tag_encoding_map, 0,
194
1.52k
           CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
195
196
1.52k
    if (!hdr->preservation_map) {
197
0
        cram_free_compression_header(hdr);
198
0
        return NULL;
199
0
    }
200
201
    /* Initialise defaults for preservation map */
202
1.52k
    hdr->read_names_included = 0;
203
1.52k
    hdr->AP_delta = 1;
204
1.52k
    hdr->qs_seq_orient = 1;
205
1.52k
    memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
206
207
    /* Preservation map */
208
1.52k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
209
1.52k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
210
36.2k
    for (i = 0; i < map_count; i++) {
211
34.8k
        pmap_t hd;
212
34.8k
        khint_t k;
213
34.8k
        int r;
214
215
34.8k
        if (endp - cp < 3) {
216
132
            cram_free_compression_header(hdr);
217
132
            return NULL;
218
132
        }
219
34.7k
        cp += 2;
220
34.7k
        switch(CRAM_KEY(cp[-2],cp[-1])) {
221
3
        case CRAM_KEY('M','I'): // was mapped QS included in V1.0
222
42
        case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
223
75
        case CRAM_KEY('P','I'): // was unmapped placed in V1.0
224
75
            hd.i = *cp++;
225
75
            break;
226
227
63
        case CRAM_KEY('R','N'):
228
63
            hd.i = *cp++;
229
63
            k = kh_put(map, hdr->preservation_map, "RN", &r);
230
63
            if (-1 == r) {
231
0
                cram_free_compression_header(hdr);
232
0
                return NULL;
233
0
            }
234
235
63
            kh_val(hdr->preservation_map, k) = hd;
236
63
            hdr->read_names_included = hd.i;
237
63
            break;
238
239
150
        case CRAM_KEY('A','P'):
240
150
            hd.i = *cp++;
241
150
            k = kh_put(map, hdr->preservation_map, "AP", &r);
242
150
            if (-1 == r) {
243
0
                cram_free_compression_header(hdr);
244
0
                return NULL;
245
0
            }
246
247
150
            kh_val(hdr->preservation_map, k) = hd;
248
150
            hdr->AP_delta = hd.i;
249
150
            break;
250
251
564
        case CRAM_KEY('R','R'):
252
564
            hd.i = *cp++;
253
564
            k = kh_put(map, hdr->preservation_map, "RR", &r);
254
564
            if (-1 == r) {
255
0
                cram_free_compression_header(hdr);
256
0
                return NULL;
257
0
            }
258
259
564
            kh_val(hdr->preservation_map, k) = hd;
260
564
            hdr->no_ref = !hd.i;
261
564
            break;
262
263
84
        case CRAM_KEY('Q','O'):
264
84
            hd.i = *cp++;
265
84
            k = kh_put(map, hdr->preservation_map, "QO", &r);
266
84
            if (-1 == r) {
267
0
                cram_free_compression_header(hdr);
268
0
                return NULL;
269
0
            }
270
271
84
            kh_val(hdr->preservation_map, k) = hd;
272
84
            hdr->qs_seq_orient = hd.i;
273
84
            break;
274
275
303
        case CRAM_KEY('S','M'):
276
303
            if (endp - cp < 5) {
277
3
                cram_free_compression_header(hdr);
278
3
                return NULL;
279
3
            }
280
300
            hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
281
300
            hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
282
300
            hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
283
300
            hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
284
285
300
            hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
286
300
            hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
287
300
            hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
288
300
            hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
289
290
300
            hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
291
300
            hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
292
300
            hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
293
300
            hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
294
295
300
            hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
296
300
            hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
297
300
            hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
298
300
            hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
299
300
300
            hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
301
300
            hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
302
300
            hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
303
300
            hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
304
305
300
            hd.p = cp;
306
300
            cp += 5;
307
308
300
            k = kh_put(map, hdr->preservation_map, "SM", &r);
309
300
            if (-1 == r) {
310
0
                cram_free_compression_header(hdr);
311
0
                return NULL;
312
0
            }
313
300
            kh_val(hdr->preservation_map, k) = hd;
314
300
            break;
315
316
303
        case CRAM_KEY('T','D'): {
317
303
            int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
318
303
            if (sz < 0) {
319
6
                cram_free_compression_header(hdr);
320
6
                return NULL;
321
6
            }
322
323
297
            hd.p = cp;
324
297
            cp += sz;
325
326
297
            k = kh_put(map, hdr->preservation_map, "TD", &r);
327
297
            if (-1 == r) {
328
0
                cram_free_compression_header(hdr);
329
0
                return NULL;
330
0
            }
331
297
            kh_val(hdr->preservation_map, k) = hd;
332
297
            break;
333
297
        }
334
335
33.2k
        default:
336
33.2k
            hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
337
            // guess byte;
338
33.2k
            cp++;
339
33.2k
            break;
340
34.7k
        }
341
34.7k
    }
342
1.38k
    if (cp - cp_copy != map_size) {
343
75
        cram_free_compression_header(hdr);
344
75
        return NULL;
345
75
    }
346
347
    /* Record encoding map */
348
1.31k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
349
1.31k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
350
1.31k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
351
33.1k
    for (i = 0; i < map_count; i++) {
352
32.5k
        char *key = cp;
353
32.5k
        int32_t encoding = E_NULL;
354
32.5k
        int32_t size = 0;
355
32.5k
        ptrdiff_t offset;
356
32.5k
        cram_map *m;
357
32.5k
        enum cram_DS_ID ds_id;
358
32.5k
        enum cram_external_type type;
359
360
32.5k
        if (endp - cp < 4) {
361
42
            cram_free_compression_header(hdr);
362
42
            return NULL;
363
42
        }
364
365
32.4k
        cp += 2;
366
32.4k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
367
32.4k
        size     = fd->vv.varint_get32(&cp, endp, &err);
368
369
32.4k
        offset = cp - (char *)b->data;
370
371
32.4k
        if (encoding == E_NULL)
372
9.26k
            continue;
373
374
23.2k
        if (size < 0 || endp - cp < size) {
375
226
            cram_free_compression_header(hdr);
376
226
            return NULL;
377
226
        }
378
379
        //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
380
381
        /*
382
         * For CRAM1.0 CF and BF are Byte and not Int.
383
         * Practically speaking it makes no difference unless we have a
384
         * 1.0 format file that stores these in EXTERNAL as only then
385
         * does Byte vs Int matter.
386
         *
387
         * Neither this C code nor Java reference implementations did this,
388
         * so we gloss over it and treat them as int.
389
         */
390
23.0k
        ds_id = DS_CORE;
391
23.0k
        if (key[0] == 'B' && key[1] == 'F') {
392
66
            ds_id = DS_BF; type = E_INT;
393
22.9k
        } else if (key[0] == 'C' && key[1] == 'F') {
394
153
            ds_id = DS_CF; type = E_INT;
395
22.7k
        } else if (key[0] == 'R' && key[1] == 'I') {
396
62
            ds_id = DS_RI; type = E_INT;
397
22.7k
        } else if (key[0] == 'R' && key[1] == 'L') {
398
27
            ds_id = DS_RL; type = E_INT;
399
22.6k
        } else if (key[0] == 'A' && key[1] == 'P') {
400
288
            ds_id = DS_AP;
401
288
            type = is_v4 ? E_SLONG : E_INT;
402
22.4k
        } else if (key[0] == 'R' && key[1] == 'G') {
403
36
            ds_id = DS_RG;
404
36
            type = E_INT;
405
22.3k
        } else if (key[0] == 'M' && key[1] == 'F') {
406
38
            ds_id = DS_MF; type = E_INT;
407
22.3k
        } else if (key[0] == 'N' && key[1] == 'S') {
408
105
            ds_id = DS_NS; type = E_INT;
409
22.2k
        } else if (key[0] == 'N' && key[1] == 'P') {
410
624
            ds_id = DS_NP;
411
624
            type = is_v4 ? E_LONG : E_INT;
412
21.6k
        } else if (key[0] == 'T' && key[1] == 'S') {
413
60
            ds_id = DS_TS;
414
60
            type = is_v4 ? E_SLONG : E_INT;
415
21.5k
        } else if (key[0] == 'N' && key[1] == 'F') {
416
114
            ds_id = DS_NF; type = E_INT;
417
21.4k
        } else if (key[0] == 'T' && key[1] == 'C') {
418
48
            ds_id = DS_TC; type = E_BYTE;
419
21.3k
        } else if (key[0] == 'T' && key[1] == 'N') {
420
3
            ds_id = DS_TN; type = E_INT;
421
21.3k
        } else if (key[0] == 'F' && key[1] == 'N') {
422
0
            ds_id = DS_FN; type = E_INT;
423
21.3k
        } else if (key[0] == 'F' && key[1] == 'C') {
424
243
            ds_id = DS_FC; type = E_BYTE;
425
21.1k
        } else if (key[0] == 'F' && key[1] == 'P') {
426
6
            ds_id = DS_FP; type = E_INT;
427
21.1k
        } else if (key[0] == 'B' && key[1] == 'S') {
428
21
            ds_id = DS_BS; type = E_BYTE;
429
21.1k
        } else if (key[0] == 'I' && key[1] == 'N') {
430
60
            ds_id = DS_IN; type = E_BYTE_ARRAY;
431
21.0k
        } else if (key[0] == 'S' && key[1] == 'C') {
432
3
            ds_id = DS_SC; type = E_BYTE_ARRAY;
433
21.0k
        } else if (key[0] == 'D' && key[1] == 'L') {
434
66
            ds_id = DS_DL; type = E_INT;
435
20.9k
        } else if (key[0] == 'B' && key[1] == 'A') {
436
192
            ds_id = DS_BA; type = E_BYTE;
437
20.7k
        } else if (key[0] == 'B' && key[1] == 'B') {
438
99
            ds_id = DS_BB; type = E_BYTE_ARRAY;
439
20.6k
        } else if (key[0] == 'R' && key[1] == 'S') {
440
88
            ds_id = DS_RS; type = E_INT;
441
20.6k
        } else if (key[0] == 'P' && key[1] == 'D') {
442
3
            ds_id = DS_PD; type = E_INT;
443
20.6k
        } else if (key[0] == 'H' && key[1] == 'C') {
444
6
            ds_id = DS_HC; type = E_INT;
445
20.5k
        } else if (key[0] == 'M' && key[1] == 'Q') {
446
1.52k
            ds_id = DS_MQ; type = E_INT;
447
19.0k
        } else if (key[0] == 'R' && key[1] == 'N') {
448
21
            ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
449
19.0k
        } else if (key[0] == 'Q' && key[1] == 'S') {
450
99
            ds_id = DS_QS; type = E_BYTE;
451
18.9k
        } else if (key[0] == 'Q' && key[1] == 'Q') {
452
3
            ds_id = DS_QQ; type = E_BYTE_ARRAY;
453
18.9k
        } else if (key[0] == 'T' && key[1] == 'L') {
454
3
            ds_id = DS_TL; type = E_INT;
455
18.9k
        } else if (key[0] == 'T' && key[1] == 'M') {
456
18.9k
        } else if (key[0] == 'T' && key[1] == 'V') {
457
18.9k
        } else {
458
18.9k
            hts_log_warning("Unrecognised key: %.2s", key);
459
18.9k
        }
460
461
23.0k
        if (ds_id != DS_CORE) {
462
4.06k
            if (hdr->codecs[ds_id] != NULL) {
463
3.22k
                hts_log_warning("Codec for key %.2s defined more than once",
464
3.22k
                                key);
465
3.22k
                hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
466
3.22k
            }
467
4.06k
            hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
468
4.06k
                                                   type, fd->version, &fd->vv);
469
4.06k
            if (!hdr->codecs[ds_id]) {
470
451
                cram_free_compression_header(hdr);
471
451
                return NULL;
472
451
            }
473
4.06k
        }
474
475
22.5k
        cp += size;
476
477
        // Fill out cram_map purely for cram_dump to dump out.
478
22.5k
        m = malloc(sizeof(*m));
479
22.5k
        if (!m) {
480
0
            cram_free_compression_header(hdr);
481
0
            return NULL;
482
0
        }
483
22.5k
        m->key = CRAM_KEY(key[0], key[1]);
484
22.5k
        m->encoding = encoding;
485
22.5k
        m->size     = size;
486
22.5k
        m->offset   = offset;
487
22.5k
        m->codec = NULL;
488
489
22.5k
        m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
490
22.5k
        hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
491
22.5k
    }
492
591
    if (cp - cp_copy != map_size) {
493
72
        cram_free_compression_header(hdr);
494
72
        return NULL;
495
72
    }
496
497
    /* Tag encoding map */
498
519
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
499
519
    map_count = fd->vv.varint_get32(&cp, endp, &err);
500
1.63k
    for (i = 0; i < map_count; i++) {
501
1.21k
        int32_t encoding = E_NULL;
502
1.21k
        int32_t size = 0;
503
1.21k
        cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
504
1.21k
        uint8_t key[3];
505
506
1.21k
        if (!m || endp - cp < 6) {
507
9
            free(m);
508
9
            cram_free_compression_header(hdr);
509
9
            return NULL;
510
9
        }
511
512
1.20k
        m->key = fd->vv.varint_get32(&cp, endp, &err);
513
1.20k
        key[0] = m->key>>16;
514
1.20k
        key[1] = m->key>>8;
515
1.20k
        key[2] = m->key;
516
1.20k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
517
1.20k
        size     = fd->vv.varint_get32(&cp, endp, &err);
518
519
1.20k
        m->encoding = encoding;
520
1.20k
        m->size     = size;
521
1.20k
        m->offset   = cp - (char *)b->data;
522
1.20k
        if (size < 0 || endp - cp < size ||
523
1.19k
            !(m->codec = cram_decoder_init(hdr, encoding, cp, size,
524
1.19k
                                           E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
525
93
            cram_free_compression_header(hdr);
526
93
            free(m);
527
93
            return NULL;
528
93
        }
529
530
1.11k
        cp += size;
531
532
1.11k
        m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
533
1.11k
        hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
534
1.11k
    }
535
417
    if (err || cp - cp_copy != map_size) {
536
73
        cram_free_compression_header(hdr);
537
73
        return NULL;
538
73
    }
539
540
344
    return hdr;
541
417
}
542
543
/*
544
 * Note we also need to scan through the record encoding map to
545
 * see which data series share the same block, either external or
546
 * CORE. For example if we need the BF data series but MQ and CF
547
 * are also encoded in the same block then we need to add those in
548
 * as a dependency in order to correctly decode BF.
549
 *
550
 * Returns 0 on success
551
 *        -1 on failure
552
 */
553
int cram_dependent_data_series(cram_fd *fd,
554
                               cram_block_compression_hdr *hdr,
555
477
                               cram_slice *s) {
556
477
    int *block_used;
557
477
    int core_used = 0;
558
477
    int i;
559
477
    static int i_to_id[] = {
560
477
        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
561
477
        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
562
477
        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
563
477
        DS_HC, DS_SC, DS_BB, DS_QQ,
564
477
    };
565
477
    uint32_t orig_ds;
566
567
    /*
568
     * Set the data_series bit field based on fd->required_fields
569
     * contents.
570
     */
571
477
    if (fd->required_fields && fd->required_fields != INT_MAX) {
572
0
        s->data_series = 0;
573
574
0
        if (fd->required_fields & SAM_QNAME)
575
0
            s->data_series |= CRAM_RN;
576
577
0
        if (fd->required_fields & SAM_FLAG)
578
0
            s->data_series |= CRAM_BF;
579
580
0
        if (fd->required_fields & SAM_RNAME)
581
0
            s->data_series |= CRAM_RI | CRAM_BF;
582
583
0
        if (fd->required_fields & SAM_POS)
584
0
            s->data_series |= CRAM_AP | CRAM_BF;
585
586
0
        if (fd->required_fields & SAM_MAPQ)
587
0
            s->data_series |= CRAM_MQ;
588
589
0
        if (fd->required_fields & SAM_CIGAR)
590
0
            s->data_series |= CRAM_CIGAR;
591
592
0
        if (fd->required_fields & SAM_RNEXT)
593
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
594
595
0
        if (fd->required_fields & SAM_PNEXT)
596
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
597
598
0
        if (fd->required_fields & SAM_TLEN)
599
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
600
0
                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
601
602
0
        if (fd->required_fields & SAM_SEQ)
603
0
            s->data_series |= CRAM_SEQ;
604
605
0
        if (!(fd->required_fields & SAM_AUX))
606
            // No easy way to get MD/NM without other tags at present
607
0
            s->decode_md = 0;
608
609
0
        if (fd->required_fields & SAM_QUAL)
610
0
            s->data_series |= CRAM_QUAL;
611
612
0
        if (fd->required_fields & SAM_AUX)
613
0
            s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
614
615
0
        if (fd->required_fields & SAM_RGAUX)
616
0
            s->data_series |= CRAM_RG | CRAM_BF;
617
618
        // Always uncompress CORE block
619
0
        if (cram_uncompress_block(s->block[0]))
620
0
            return -1;
621
477
    } else {
622
477
        s->data_series = CRAM_ALL;
623
624
2.31k
        for (i = 0; i < s->hdr->num_blocks; i++) {
625
1.84k
            if (cram_uncompress_block(s->block[i]))
626
3
                return -1;
627
1.84k
        }
628
629
474
        return 0;
630
477
    }
631
632
0
    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
633
0
    if (!block_used)
634
0
        return -1;
635
636
0
    do {
637
        /*
638
         * Also set data_series based on code prerequisites. Eg if we need
639
         * CRAM_QS then we also need to know CRAM_RL so we know how long it
640
         * is, or if we need FC/FP then we also need FN (number of features).
641
         *
642
         * It's not reciprocal though. We may be needing to decode FN
643
         * but have no need to decode FC, FP and cigar ops.
644
         */
645
0
        if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
646
0
        if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
647
0
        if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
648
0
        if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
649
0
        if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
650
0
        if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
651
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
652
0
        if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
653
0
        if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
654
0
        if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
655
0
        if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
656
657
        // cram_decode_seq() needs seq[] array
658
0
        if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
659
660
0
        if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
661
0
        if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
662
0
        if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
663
0
        if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
664
0
        if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
665
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
666
0
        if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
667
0
            s->data_series |= CRAM_CF;
668
0
        if (!hdr->read_names_included && s->data_series & CRAM_RN)
669
0
            s->data_series |= CRAM_CF | CRAM_NF;
670
0
        if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
671
0
            s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
672
0
        if (s->data_series & CRAM_FN) {
673
            // The CRAM_FN loop checks for reference length boundaries,
674
            // which needs a working seq_pos.  Some fields are fixed size
675
            // irrespective of if we decode (BS), but others need to know
676
            // the size of the string fetched back (SC, IN, BB).
677
0
            s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
678
0
        }
679
680
0
        orig_ds = s->data_series;
681
682
        // Find which blocks are in use.
683
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
684
0
            int bnum1, bnum2, j;
685
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
686
687
0
            if (!(s->data_series & (1<<i)))
688
0
                continue;
689
690
0
            if (!c)
691
0
                continue;
692
693
0
            bnum1 = cram_codec_to_id(c, &bnum2);
694
695
0
            for (;;) {
696
0
                switch (bnum1) {
697
0
                case -2:
698
0
                    break;
699
700
0
                case -1:
701
0
                    core_used = 1;
702
0
                    break;
703
704
0
                default:
705
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
706
0
                        if (s->block[j]->content_type == EXTERNAL &&
707
0
                            s->block[j]->content_id == bnum1) {
708
0
                            block_used[j] = 1;
709
0
                            if (cram_uncompress_block(s->block[j])) {
710
0
                                free(block_used);
711
0
                                return -1;
712
0
                            }
713
0
                        }
714
0
                    }
715
0
                    break;
716
0
                }
717
718
0
                if (bnum2 == -2 || bnum1 == bnum2)
719
0
                    break;
720
721
0
                bnum1 = bnum2; // 2nd pass
722
0
            }
723
0
        }
724
725
        // Tags too
726
0
        if ((fd->required_fields & SAM_AUX) ||
727
0
            (s->data_series & CRAM_aux)) {
728
0
            for (i = 0; i < CRAM_MAP_HASH; i++) {
729
0
                int bnum1, bnum2, j;
730
0
                cram_map *m = hdr->tag_encoding_map[i];
731
732
0
                while (m) {
733
0
                    cram_codec *c = m->codec;
734
0
                    if (!c) {
735
0
                        m = m->next;
736
0
                        continue;
737
0
                    }
738
739
0
                    bnum1 = cram_codec_to_id(c, &bnum2);
740
741
0
                    for (;;) {
742
0
                        switch (bnum1) {
743
0
                        case -2:
744
0
                            break;
745
746
0
                        case -1:
747
0
                            core_used = 1;
748
0
                            break;
749
750
0
                        default:
751
0
                            for (j = 0; j < s->hdr->num_blocks; j++) {
752
0
                                if (s->block[j]->content_type == EXTERNAL &&
753
0
                                    s->block[j]->content_id == bnum1) {
754
0
                                    block_used[j] = 1;
755
0
                                    if (cram_uncompress_block(s->block[j])) {
756
0
                                        free(block_used);
757
0
                                        return -1;
758
0
                                    }
759
0
                                }
760
0
                            }
761
0
                            break;
762
0
                        }
763
764
0
                        if (bnum2 == -2 || bnum1 == bnum2)
765
0
                            break;
766
767
0
                        bnum1 = bnum2; // 2nd pass
768
0
                    }
769
770
0
                    m = m->next;
771
0
                }
772
0
            }
773
0
        }
774
775
        // We now know which blocks are in used, so repeat and find
776
        // which other data series need to be added.
777
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
778
0
            int bnum1, bnum2, j;
779
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
780
781
0
            if (!c)
782
0
                continue;
783
784
0
            bnum1 = cram_codec_to_id(c, &bnum2);
785
786
0
            for (;;) {
787
0
                switch (bnum1) {
788
0
                case -2:
789
0
                    break;
790
791
0
                case -1:
792
0
                    if (core_used) {
793
                        //printf(" + data series %08x:\n", 1<<i);
794
0
                        s->data_series |= 1<<i;
795
0
                    }
796
0
                    break;
797
798
0
                default:
799
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
800
0
                        if (s->block[j]->content_type == EXTERNAL &&
801
0
                            s->block[j]->content_id == bnum1) {
802
0
                            if (block_used[j]) {
803
                                //printf(" + data series %08x:\n", 1<<i);
804
0
                                s->data_series |= 1<<i;
805
0
                            }
806
0
                        }
807
0
                    }
808
0
                    break;
809
0
                }
810
811
0
                if (bnum2 == -2 || bnum1 == bnum2)
812
0
                    break;
813
814
0
                bnum1 = bnum2; // 2nd pass
815
0
            }
816
0
        }
817
818
        // Tags too
819
0
        for (i = 0; i < CRAM_MAP_HASH; i++) {
820
0
            int bnum1, bnum2, j;
821
0
            cram_map *m = hdr->tag_encoding_map[i];
822
823
0
            while (m) {
824
0
                cram_codec *c = m->codec;
825
0
                if (!c) {
826
0
                    m = m->next;
827
0
                    continue;
828
0
                }
829
830
0
                bnum1 = cram_codec_to_id(c, &bnum2);
831
832
0
                for (;;) {
833
0
                    switch (bnum1) {
834
0
                    case -2:
835
0
                        break;
836
837
0
                    case -1:
838
                        //printf(" + data series %08x:\n", CRAM_aux);
839
0
                        s->data_series |= CRAM_aux;
840
0
                        break;
841
842
0
                    default:
843
0
                        for (j = 0; j < s->hdr->num_blocks; j++) {
844
0
                            if (s->block[j]->content_type == EXTERNAL &&
845
0
                                s->block[j]->content_id == bnum1) {
846
0
                                if (block_used[j]) {
847
                                    //printf(" + data series %08x:\n",
848
                                    //       CRAM_aux);
849
0
                                    s->data_series |= CRAM_aux;
850
0
                                }
851
0
                            }
852
0
                        }
853
0
                        break;
854
0
                    }
855
856
0
                    if (bnum2 == -2 || bnum1 == bnum2)
857
0
                        break;
858
859
0
                    bnum1 = bnum2; // 2nd pass
860
0
                }
861
862
0
                m = m->next;
863
0
            }
864
0
        }
865
0
    } while (orig_ds != s->data_series);
866
867
0
    free(block_used);
868
0
    return 0;
869
0
}
870
871
/*
872
 * Checks whether an external block is used solely by a single data series.
873
 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
874
 *         or 0 if not (E_NULL).
875
 */
876
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
877
0
                          int id) {
878
0
    int i, n_id = 0;
879
0
    enum cram_encoding e_type = 0;
880
881
0
    for (i = 0; i < DS_END; i++) {
882
0
        cram_codec *c;
883
0
        int bnum1, bnum2, old_n_id;
884
885
0
        if (!(c = hdr->codecs[i]))
886
0
            continue;
887
888
0
        bnum1 = cram_codec_to_id(c, &bnum2);
889
890
0
        old_n_id = n_id;
891
0
        if (bnum1 == id) {
892
0
            n_id++;
893
0
            e_type = c->codec;
894
0
        }
895
0
        if (bnum2 == id) {
896
0
            n_id++;
897
0
            e_type = c->codec;
898
0
        }
899
900
0
        if (n_id == old_n_id+2)
901
0
            n_id--; // len/val in same place counts once only.
902
0
    }
903
904
0
    return n_id == 1 ? e_type : 0;
905
0
}
906
907
/*
908
 * Attempts to estimate the size of some blocks so we can preallocate them
909
 * before decoding.  Although decoding will automatically grow the blocks,
910
 * it is typically more efficient to preallocate.
911
 */
912
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
913
                                int *qual_size, int *name_size,
914
0
                                int *q_id) {
915
0
    int bnum1, bnum2;
916
0
    cram_codec *cd;
917
918
0
    *qual_size = 0;
919
0
    *name_size = 0;
920
921
    /* Qual */
922
0
    cd = hdr->codecs[DS_QS];
923
0
    if (cd == NULL) return;
924
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
925
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
926
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
927
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
928
0
        if (b) *qual_size = b->uncomp_size;
929
0
        if (q_id && cd->codec == E_EXTERNAL)
930
0
            *q_id = bnum1;
931
0
    }
932
933
    /* Name */
934
0
    cd = hdr->codecs[DS_RN];
935
0
    if (cd == NULL) return;
936
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
937
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
938
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
939
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
940
0
        if (b) *name_size = b->uncomp_size;
941
0
    }
942
0
}
943
944
945
/* ----------------------------------------------------------------------
946
 * CRAM slices
947
 */
948
949
/*
950
 * Decodes a CRAM (un)mapped slice header block.
951
 * Returns slice header ptr on success
952
 *         NULL on failure
953
 */
954
558
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
955
558
    cram_block_slice_hdr *hdr;
956
558
    unsigned char *cp;
957
558
    unsigned char *cp_end;
958
558
    int i, err = 0;
959
960
558
    if (b->method != RAW) {
961
        /* Spec. says slice header should be RAW, but we can future-proof
962
           by trying to decode it if it isn't. */
963
6
        if (cram_uncompress_block(b) < 0)
964
3
            return NULL;
965
6
    }
966
555
    cp =  (unsigned char *)BLOCK_DATA(b);
967
555
    cp_end = cp + b->uncomp_size;
968
969
555
    if (b->content_type != MAPPED_SLICE &&
970
3
        b->content_type != UNMAPPED_SLICE)
971
0
        return NULL;
972
973
555
    if (!(hdr  = calloc(1, sizeof(*hdr))))
974
0
        return NULL;
975
976
555
    hdr->content_type = b->content_type;
977
978
555
    if (b->content_type == MAPPED_SLICE) {
979
552
        hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
980
552
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
981
0
            hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
982
0
            hdr->ref_seq_span  = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
983
552
        } else {
984
552
            hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
985
552
            hdr->ref_seq_span  = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
986
552
        }
987
552
        if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
988
18
            free(hdr);
989
18
            hts_log_error("Negative values not permitted for header "
990
18
                          "sequence start or span fields");
991
18
            return NULL;
992
18
        }
993
552
    }
994
537
    hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
995
537
    hdr->record_counter = 0;
996
537
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
997
0
        hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
998
537
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
999
0
        hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
1000
0
    }
1001
537
    hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1002
537
    hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1003
537
    if (hdr->num_content_ids < 1 ||
1004
531
        hdr->num_content_ids >= 10000) {
1005
        // Slice must have at least one data block, and there is no need
1006
        // for more than 2 per possible aux-tag plus ancillary.
1007
12
        free(hdr);
1008
12
        return NULL;
1009
12
    }
1010
525
    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1011
525
    if (!hdr->block_content_ids) {
1012
0
        free(hdr);
1013
0
        return NULL;
1014
0
    }
1015
1016
37.8k
    for (i = 0; i < hdr->num_content_ids; i++)
1017
37.3k
        hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
1018
37.3k
                                                         (char *)cp_end,
1019
37.3k
                                                         &err);
1020
525
    if (err) {
1021
9
        free(hdr->block_content_ids);
1022
9
        free(hdr);
1023
9
        return NULL;
1024
9
    }
1025
1026
516
    if (b->content_type == MAPPED_SLICE)
1027
513
        hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
1028
1029
516
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
1030
0
        if (cp_end - cp < 16) {
1031
0
            free(hdr->block_content_ids);
1032
0
            free(hdr);
1033
0
            return NULL;
1034
0
        }
1035
0
        memcpy(hdr->md5, cp, 16);
1036
516
    } else {
1037
516
        memset(hdr->md5, 0, 16);
1038
516
    }
1039
1040
516
    if (!err)
1041
510
        return hdr;
1042
1043
6
    free(hdr->block_content_ids);
1044
6
    free(hdr);
1045
6
    return NULL;
1046
516
}
1047
1048
1049
#if 0
1050
/* Returns the number of bits set in val; it the highest bit used */
1051
static int nbits(int v) {
1052
    static const int MultiplyDeBruijnBitPosition[32] = {
1053
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1054
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1055
    };
1056
1057
    v |= v >> 1; // first up to set all bits 1 after the first 1 */
1058
    v |= v >> 2;
1059
    v |= v >> 4;
1060
    v |= v >> 8;
1061
    v |= v >> 16;
1062
1063
    // DeBruijn magic to find top bit
1064
    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1065
}
1066
#endif
1067
1068
#if 0
1069
static int sort_freqs(const void *vp1, const void *vp2) {
1070
    const int i1 = *(const int *)vp1;
1071
    const int i2 = *(const int *)vp2;
1072
    return i1-i2;
1073
}
1074
#endif
1075
1076
/* ----------------------------------------------------------------------
1077
 * Primary CRAM sequence decoder
1078
 */
1079
1080
0
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1081
0
    if (decode_md) {
1082
0
        BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1083
0
        BLOCK_APPEND_CHAR(s->aux_blk, c);
1084
0
        *md_dist = 0;
1085
0
    }
1086
0
    return 0;
1087
1088
0
 block_err:
1089
0
    return -1;
1090
0
}
1091
1092
/*
1093
 * Internal part of cram_decode_slice().
1094
 * Generates the sequence, quality and cigar components.
1095
 */
1096
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1097
                           cram_block *blk, cram_record *cr, sam_hdr_t *sh,
1098
                           int cf, char *seq, char *qual,
1099
0
                           int has_MD, int has_NM) {
1100
0
    int prev_pos = 0, f, r = 0, out_sz = 1;
1101
0
    int seq_pos = 1;
1102
0
    int cig_len = 0;
1103
0
    int64_t ref_pos = cr->apos;
1104
0
    int32_t fn, i32;
1105
0
    enum cigar_op cig_op = BAM_CMATCH;
1106
0
    uint32_t *cigar = s->cigar;
1107
0
    uint32_t ncigar = s->ncigar;
1108
0
    uint32_t cigar_alloc = s->cigar_alloc;
1109
0
    uint32_t nm = 0;
1110
0
    int32_t md_dist = 0;
1111
0
    int orig_aux = 0;
1112
    // CRAM <  4.0 decode_md is off/on
1113
    // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
1114
0
    int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
1115
0
        ? (s->decode_md > 0)
1116
0
        : (s->decode_md != 0);
1117
0
    int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
1118
0
    int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
1119
0
    uint32_t ds = s->data_series;
1120
0
    sam_hrecs_t *bfd = sh->hrecs;
1121
1122
0
    cram_codec **codecs = c->comp_hdr->codecs;
1123
1124
0
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1125
0
        memset(qual, 255, cr->len);
1126
0
    }
1127
1128
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1129
0
        decode_md = decode_nm = 0;
1130
1131
0
    if (decode_md) {
1132
0
        orig_aux = BLOCK_SIZE(s->aux_blk);
1133
0
        if (has_MD == 0)
1134
0
            BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1135
0
    }
1136
1137
0
    if (ds & CRAM_FN) {
1138
0
        if (!codecs[DS_FN]) return -1;
1139
0
        r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
1140
0
                                   blk, (char *)&fn, &out_sz);
1141
0
        if (r) return r;
1142
0
    } else {
1143
0
        fn = 0;
1144
0
    }
1145
1146
0
    ref_pos--; // count from 0
1147
0
    cr->cigar = ncigar;
1148
1149
0
    if (!(ds & (CRAM_FC | CRAM_FP)))
1150
0
        goto skip_cigar;
1151
1152
0
    if (fn) {
1153
0
        if ((ds & CRAM_FC) && !codecs[DS_FC])
1154
0
            return -1;
1155
0
        if ((ds & CRAM_FP) && !codecs[DS_FP])
1156
0
            return -1;
1157
0
    }
1158
1159
0
    for (f = 0; f < fn; f++) {
1160
0
        int32_t pos = 0;
1161
0
        char op;
1162
1163
0
        if (ncigar+2 >= cigar_alloc) {
1164
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1165
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1166
0
                return -1;
1167
0
            s->cigar = cigar;
1168
0
        }
1169
1170
0
        if (ds & CRAM_FC) {
1171
0
            r |= codecs[DS_FC]->decode(s,
1172
0
                                       codecs[DS_FC],
1173
0
                                       blk,
1174
0
                                       &op,  &out_sz);
1175
0
            if (r) return r;
1176
0
        }
1177
1178
0
        if (!(ds & CRAM_FP))
1179
0
            continue;
1180
1181
0
        r |= codecs[DS_FP]->decode(s,
1182
0
                                   codecs[DS_FP],
1183
0
                                   blk,
1184
0
                                   (char *)&pos, &out_sz);
1185
0
        if (r) return r;
1186
0
        pos += prev_pos;
1187
1188
0
        if (pos <= 0) {
1189
0
            hts_log_error("Feature position %d before start of read", pos);
1190
0
            return -1;
1191
0
        }
1192
1193
0
        if (pos > seq_pos) {
1194
0
            if (pos > cr->len+1)
1195
0
                return -1;
1196
1197
0
            if (s->ref && cr->ref_id >= 0) {
1198
0
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1199
0
                    static int whinged = 0;
1200
0
                    int rlen;
1201
0
                    if (!whinged)
1202
0
                        hts_log_warning("Ref pos outside of ref sequence boundary");
1203
0
                    whinged = 1;
1204
0
                    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1205
                    // May miss MD/NM cases where both seq/ref are N, but this is a
1206
                    // malformed cram file anyway.
1207
0
                    if (rlen > 0) {
1208
0
                        if (ref_pos + rlen > s->ref_end)
1209
0
                            goto beyond_slice;
1210
1211
0
                        memcpy(&seq[seq_pos-1],
1212
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1213
0
                        if ((pos - seq_pos) - rlen > 0)
1214
0
                            memset(&seq[seq_pos-1+rlen], 'N',
1215
0
                                   (pos - seq_pos) - rlen);
1216
0
                    } else {
1217
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1218
0
                    }
1219
0
                    if (md_dist >= 0)
1220
0
                        md_dist += pos - seq_pos;
1221
0
                } else {
1222
                    // 'N' in both ref and seq is also mismatch for NM/MD
1223
0
                    if (ref_pos + pos-seq_pos > s->ref_end)
1224
0
                        goto beyond_slice;
1225
1226
0
                    const char *refp = s->ref + ref_pos - s->ref_start + 1;
1227
0
                    const int frag_len = pos - seq_pos;
1228
0
                    int do_cpy = 1;
1229
0
                    if (decode_md || decode_nm) {
1230
0
                        char *N = memchr(refp, 'N', frag_len);
1231
0
                        if (N) {
1232
0
                            int i;
1233
0
                            for (i = 0; i < frag_len; i++) {
1234
0
                                char base = refp[i];
1235
0
                                if (base == 'N') {
1236
0
                                    if (add_md_char(s, decode_md,
1237
0
                                                    'N', &md_dist) < 0)
1238
0
                                        return -1;
1239
0
                                    nm++;
1240
0
                                } else {
1241
0
                                    md_dist++;
1242
0
                                }
1243
0
                                seq[seq_pos-1+i] = base;
1244
0
                            }
1245
0
                            do_cpy = 0;
1246
0
                        } else {
1247
0
                            md_dist += frag_len;
1248
0
                        }
1249
0
                    }
1250
0
                    if (do_cpy)
1251
0
                        memcpy(&seq[seq_pos-1], refp, frag_len);
1252
0
                }
1253
0
            }
1254
#ifdef USE_X
1255
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
1256
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1257
                cig_len = 0;
1258
            }
1259
            cig_op = BAM_CBASE_MATCH;
1260
#else
1261
0
            if (cig_len && cig_op != BAM_CMATCH) {
1262
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1263
0
                cig_len = 0;
1264
0
            }
1265
0
            cig_op = BAM_CMATCH;
1266
0
#endif
1267
0
            cig_len += pos - seq_pos;
1268
0
            ref_pos += pos - seq_pos;
1269
0
            seq_pos = pos;
1270
0
        }
1271
1272
0
        prev_pos = pos;
1273
1274
0
        if (!(ds & CRAM_FC))
1275
0
            goto skip_cigar;
1276
1277
0
        switch(op) {
1278
0
        case 'S': { // soft clip: IN
1279
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1280
0
            int have_sc = 0;
1281
1282
0
            if (cig_len) {
1283
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1284
0
                cig_len = 0;
1285
0
            }
1286
0
            switch (CRAM_MAJOR_VERS(fd->version)) {
1287
0
            case 1:
1288
0
                if (ds & CRAM_IN) {
1289
0
                    r |= codecs[DS_IN]
1290
0
                        ? codecs[DS_IN]->decode(s, codecs[DS_IN],
1291
0
                                                blk,
1292
0
                                                cr->len ? &seq[pos-1] : NULL,
1293
0
                                                &out_sz2)
1294
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1295
0
                    have_sc = 1;
1296
0
                }
1297
0
                break;
1298
0
            case 2:
1299
0
            default:
1300
0
                if (ds & CRAM_SC) {
1301
0
                    r |= codecs[DS_SC]
1302
0
                        ? codecs[DS_SC]->decode(s, codecs[DS_SC],
1303
0
                                                blk,
1304
0
                                                cr->len ? &seq[pos-1] : NULL,
1305
0
                                                &out_sz2)
1306
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1307
0
                    have_sc = 1;
1308
0
                }
1309
0
                break;
1310
1311
                //default:
1312
                //    r |= codecs[DS_BB]
1313
                //        ? codecs[DS_BB]->decode(s, codecs[DS_BB],
1314
                //                                blk, &seq[pos-1], &out_sz2)
1315
                //        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1316
0
            }
1317
0
            if (have_sc) {
1318
0
                if (r) return r;
1319
0
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1320
0
                cig_op = BAM_CSOFT_CLIP;
1321
0
                seq_pos += out_sz2;
1322
0
            }
1323
0
            break;
1324
0
        }
1325
1326
0
        case 'X': { // Substitution; BS
1327
0
            unsigned char base;
1328
#ifdef USE_X
1329
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1330
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1331
                cig_len = 0;
1332
            }
1333
            if (ds & CRAM_BS) {
1334
                if (!codecs[DS_BS]) return -1;
1335
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1336
                                           (char *)&base, &out_sz);
1337
                if (pos-1 < cr->len)
1338
                    seq[pos-1] = 'N'; // FIXME look up BS=base value
1339
            }
1340
            cig_op = BAM_CBASE_MISMATCH;
1341
#else
1342
0
            int ref_base;
1343
0
            if (cig_len && cig_op != BAM_CMATCH) {
1344
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1345
0
                cig_len = 0;
1346
0
            }
1347
0
            if (ds & CRAM_BS) {
1348
0
                if (!codecs[DS_BS]) return -1;
1349
0
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1350
0
                                           (char *)&base, &out_sz);
1351
0
                if (r) return -1;
1352
0
                if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1353
0
                    if (pos-1 < cr->len)
1354
0
                        seq[pos-1] = c->comp_hdr->
1355
0
                            substitution_matrix[fd->L1['N']][base];
1356
0
                    if (decode_md || decode_nm) {
1357
0
                        if (md_dist >= 0 && decode_md)
1358
0
                            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1359
0
                        md_dist = -1;
1360
0
                        nm--;
1361
0
                    }
1362
0
                } else {
1363
0
                    unsigned char ref_call = ref_pos < s->ref_end
1364
0
                        ? (uc)s->ref[ref_pos - s->ref_start +1]
1365
0
                        : 'N';
1366
0
                    ref_base = fd->L1[ref_call];
1367
0
                    if (pos-1 < cr->len)
1368
0
                        seq[pos-1] = c->comp_hdr->
1369
0
                            substitution_matrix[ref_base][base];
1370
0
                    if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
1371
0
                        return -1;
1372
0
                }
1373
0
            }
1374
0
            cig_op = BAM_CMATCH;
1375
0
#endif
1376
0
            nm++;
1377
0
            cig_len++;
1378
0
            seq_pos++;
1379
0
            ref_pos++;
1380
0
            break;
1381
0
        }
1382
1383
0
        case 'D': { // Deletion; DL
1384
0
            if (cig_len && cig_op != BAM_CDEL) {
1385
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1386
0
                cig_len = 0;
1387
0
            }
1388
0
            if (ds & CRAM_DL) {
1389
0
                if (!codecs[DS_DL]) return -1;
1390
0
                r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
1391
0
                                           (char *)&i32, &out_sz);
1392
0
                if (r) return r;
1393
0
                if (decode_md || decode_nm) {
1394
0
                    if (ref_pos + i32 > s->ref_end)
1395
0
                        goto beyond_slice;
1396
0
                    if (md_dist >= 0 && decode_md)
1397
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1398
0
                    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1399
0
                        if (decode_md) {
1400
0
                            BLOCK_APPEND_CHAR(s->aux_blk, '^');
1401
0
                            BLOCK_APPEND(s->aux_blk,
1402
0
                                         &s->ref[ref_pos - s->ref_start +1],
1403
0
                                         i32);
1404
0
                            md_dist = 0;
1405
0
                        }
1406
0
                        nm += i32;
1407
0
                    } else {
1408
0
                        uint32_t dlen;
1409
0
                        if (bfd->ref[cr->ref_id].len >= ref_pos) {
1410
0
                            if (decode_md) {
1411
0
                                BLOCK_APPEND_CHAR(s->aux_blk, '^');
1412
0
                                BLOCK_APPEND(s->aux_blk,
1413
0
                                             &s->ref[ref_pos - s->ref_start+1],
1414
0
                                             bfd->ref[cr->ref_id].len-ref_pos);
1415
0
                                BLOCK_APPEND_UINT(s->aux_blk, 0);
1416
0
                            }
1417
0
                            dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1418
0
                            nm += i32 - dlen;
1419
0
                        } else {
1420
0
                            dlen = i32;
1421
0
                        }
1422
1423
0
                        md_dist = -1;
1424
0
                    }
1425
0
                }
1426
0
                cig_op = BAM_CDEL;
1427
0
                cig_len += i32;
1428
0
                ref_pos += i32;
1429
                //printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1430
0
            }
1431
0
            break;
1432
0
        }
1433
1434
0
        case 'I': { // Insertion (several bases); IN
1435
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1436
1437
0
            if (cig_len && cig_op != BAM_CINS) {
1438
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1439
0
                cig_len = 0;
1440
0
            }
1441
1442
0
            if (ds & CRAM_IN) {
1443
0
                if (!codecs[DS_IN]) return -1;
1444
0
                r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
1445
0
                                           cr->len ? &seq[pos-1] : NULL,
1446
0
                                           &out_sz2);
1447
0
                if (r) return r;
1448
0
                cig_op = BAM_CINS;
1449
0
                cig_len += out_sz2;
1450
0
                seq_pos += out_sz2;
1451
0
                nm      += out_sz2;
1452
                //printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1453
0
            }
1454
0
            break;
1455
0
        }
1456
1457
0
        case 'i': { // Insertion (single base); BA
1458
0
            if (cig_len && cig_op != BAM_CINS) {
1459
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1460
0
                cig_len = 0;
1461
0
            }
1462
0
            if (ds & CRAM_BA) {
1463
0
                if (!codecs[DS_BA]) return -1;
1464
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1465
0
                                           cr->len ? &seq[pos-1] : NULL,
1466
0
                                           &out_sz);
1467
0
                if (r) return r;
1468
0
            }
1469
0
            cig_op = BAM_CINS;
1470
0
            cig_len++;
1471
0
            seq_pos++;
1472
0
            nm++;
1473
0
            break;
1474
0
        }
1475
1476
0
        case 'b': { // Several bases
1477
0
            int32_t len = cr->len ? cr->len-(pos-1) : 1;
1478
1479
0
            if (cig_len && cig_op != BAM_CMATCH) {
1480
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1481
0
                cig_len = 0;
1482
0
            }
1483
1484
0
            if (ds & CRAM_BB) {
1485
0
                if (!codecs[DS_BB]) return -1;
1486
0
                r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
1487
0
                                           cr->len ? &seq[pos-1] : NULL,
1488
0
                                           &len);
1489
0
                if (r) return r;
1490
1491
0
                if (decode_md || decode_nm) {
1492
0
                    int x;
1493
0
                    if (md_dist >= 0 && decode_md)
1494
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1495
1496
0
                    for (x = 0; x < len; x++) {
1497
0
                        if (x && decode_md)
1498
0
                            BLOCK_APPEND_UINT(s->aux_blk, 0);
1499
0
                        if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1500
0
                            md_dist = -1;
1501
0
                            break;
1502
0
                        } else {
1503
0
                            if (decode_md) {
1504
0
                                if (ref_pos + x > s->ref_end)
1505
0
                                    goto beyond_slice;
1506
0
                                char r = s->ref[ref_pos+x-s->ref_start +1];
1507
0
                                BLOCK_APPEND_CHAR(s->aux_blk, r);
1508
0
                            }
1509
0
                        }
1510
0
                    }
1511
1512
0
                    nm += x;
1513
0
                    md_dist = 0;
1514
0
                }
1515
0
            }
1516
1517
0
            cig_op = BAM_CMATCH;
1518
1519
0
            cig_len+=len;
1520
0
            seq_pos+=len;
1521
0
            ref_pos+=len;
1522
            //prev_pos+=len;
1523
0
            break;
1524
0
        }
1525
1526
0
        case 'q': { // Several quality values
1527
0
            int32_t len = cr->len ? cr->len - (pos-1) : 1;
1528
1529
0
            if (cig_len && cig_op != BAM_CMATCH) {
1530
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1531
0
                cig_len = 0;
1532
0
            }
1533
1534
0
            if (ds & CRAM_QQ) {
1535
0
                if (!codecs[DS_QQ]) return -1;
1536
0
                if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1537
0
                    && (unsigned char)*qual == 255)
1538
0
                    memset(qual, 30, cr->len); // ?
1539
0
                r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
1540
0
                                           (char *)&qual[pos-1], &len);
1541
0
                if (r) return r;
1542
0
            }
1543
1544
0
            cig_op = BAM_CMATCH;
1545
1546
            //prev_pos+=len;
1547
0
            break;
1548
0
        }
1549
1550
0
        case 'B': { // Read base; BA, QS
1551
#ifdef USE_X
1552
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1553
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1554
                cig_len = 0;
1555
            }
1556
#else
1557
0
            if (cig_len && cig_op != BAM_CMATCH) {
1558
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1559
0
                cig_len = 0;
1560
0
            }
1561
0
#endif
1562
0
            if (ds & CRAM_BA) {
1563
0
                if (!codecs[DS_BA]) return -1;
1564
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1565
0
                                           cr->len ? &seq[pos-1] : NULL,
1566
0
                                           &out_sz);
1567
1568
0
                if (decode_md || decode_nm) {
1569
0
                    if (md_dist >= 0 && decode_md)
1570
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1571
0
                    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1572
0
                        md_dist = -1;
1573
0
                    } else {
1574
0
                        if (decode_md) {
1575
0
                            if (ref_pos > s->ref_end)
1576
0
                                goto beyond_slice;
1577
0
                            BLOCK_APPEND_CHAR(s->aux_blk,
1578
0
                                              s->ref[ref_pos-s->ref_start +1]);
1579
0
                        }
1580
0
                        nm++;
1581
0
                        md_dist = 0;
1582
0
                    }
1583
0
                }
1584
0
            }
1585
0
            if (ds & CRAM_QS) {
1586
0
                if (!codecs[DS_QS]) return -1;
1587
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1588
0
                    && (unsigned char)*qual == 255)
1589
0
                    memset(qual, 30, cr->len); // ASCII ?.  Same as htsjdk
1590
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1591
0
                                           (char *)&qual[pos-1], &out_sz);
1592
0
            }
1593
#ifdef USE_X
1594
            cig_op = BAM_CBASE_MISMATCH;
1595
#else
1596
0
            cig_op = BAM_CMATCH;
1597
0
#endif
1598
0
            cig_len++;
1599
0
            seq_pos++;
1600
0
            ref_pos++;
1601
            //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1602
0
            break;
1603
0
        }
1604
1605
0
        case 'Q': { // Quality score; QS
1606
0
            if (ds & CRAM_QS) {
1607
0
                if (!codecs[DS_QS]) return -1;
1608
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
1609
0
                    (unsigned char)*qual == 255)
1610
0
                    memset(qual, 30, cr->len); // ?
1611
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1612
0
                                           (char *)&qual[pos-1], &out_sz);
1613
                //printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1614
0
            }
1615
0
            break;
1616
0
        }
1617
1618
0
        case 'H': { // hard clip; HC
1619
0
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
1620
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1621
0
                cig_len = 0;
1622
0
            }
1623
0
            if (ds & CRAM_HC) {
1624
0
                if (!codecs[DS_HC]) return -1;
1625
0
                r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
1626
0
                                           (char *)&i32, &out_sz);
1627
0
                if (r) return r;
1628
0
                cig_op = BAM_CHARD_CLIP;
1629
0
                cig_len += i32;
1630
0
            }
1631
0
            break;
1632
0
        }
1633
1634
0
        case 'P': { // padding; PD
1635
0
            if (cig_len && cig_op != BAM_CPAD) {
1636
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1637
0
                cig_len = 0;
1638
0
            }
1639
0
            if (ds & CRAM_PD) {
1640
0
                if (!codecs[DS_PD]) return -1;
1641
0
                r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
1642
0
                                           (char *)&i32, &out_sz);
1643
0
                if (r) return r;
1644
0
                cig_op = BAM_CPAD;
1645
0
                cig_len += i32;
1646
0
            }
1647
0
            break;
1648
0
        }
1649
1650
0
        case 'N': { // Ref skip; RS
1651
0
            if (cig_len && cig_op != BAM_CREF_SKIP) {
1652
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1653
0
                cig_len = 0;
1654
0
            }
1655
0
            if (ds & CRAM_RS) {
1656
0
                if (!codecs[DS_RS]) return -1;
1657
0
                r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
1658
0
                                           (char *)&i32, &out_sz);
1659
0
                if (r) return r;
1660
0
                cig_op = BAM_CREF_SKIP;
1661
0
                cig_len += i32;
1662
0
                ref_pos += i32;
1663
0
            }
1664
0
            break;
1665
0
        }
1666
1667
0
        default:
1668
0
            hts_log_error("Unknown feature code '%c'", op);
1669
0
            return -1;
1670
0
        }
1671
0
    }
1672
1673
0
    if (!(ds & CRAM_FC))
1674
0
        goto skip_cigar;
1675
1676
    /* An implicit match op for any unaccounted for bases */
1677
0
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1678
0
        if (s->ref && cr->ref_id >= 0) {
1679
0
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1680
0
                static int whinged = 0;
1681
0
                int rlen;
1682
0
                if (!whinged)
1683
0
                    hts_log_warning("Ref pos outside of ref sequence boundary");
1684
0
                whinged = 1;
1685
0
                rlen = bfd->ref[cr->ref_id].len - ref_pos;
1686
                // May miss MD/NM cases where both seq/ref are N, but this is a
1687
                // malformed cram file anyway.
1688
0
                if (rlen > 0) {
1689
0
                    if (seq_pos-1 + rlen < cr->len)
1690
0
                        memcpy(&seq[seq_pos-1],
1691
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1692
0
                    if ((cr->len - seq_pos + 1) - rlen > 0)
1693
0
                        memset(&seq[seq_pos-1+rlen], 'N',
1694
0
                               (cr->len - seq_pos + 1) - rlen);
1695
0
                } else {
1696
0
                    if (cr->len - seq_pos + 1 > 0)
1697
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1698
0
                }
1699
0
                if (md_dist >= 0)
1700
0
                    md_dist += cr->len - seq_pos + 1;
1701
0
            } else {
1702
0
                if (cr->len - seq_pos + 1 > 0) {
1703
0
                    if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
1704
0
                        goto beyond_slice;
1705
0
                    int remainder = cr->len - (seq_pos-1);
1706
0
                    int j = ref_pos - s->ref_start + 1;
1707
0
                    if (decode_md || decode_nm) {
1708
0
                        int i;
1709
0
                        char *N = memchr(&s->ref[j], 'N', remainder);
1710
0
                        if (!N) {
1711
                            // short cut the common case
1712
0
                            md_dist += cr->len - (seq_pos-1);
1713
0
                        } else {
1714
0
                            char *refp = &s->ref[j-(seq_pos-1)];
1715
0
                            md_dist += N-&s->ref[j];
1716
0
                            int i_start = seq_pos-1 + (N - &s->ref[j]);
1717
0
                            for (i = i_start; i < cr->len; i++) {
1718
0
                                char base = refp[i];
1719
0
                                if (base == 'N') {
1720
0
                                    if (add_md_char(s, decode_md, 'N',
1721
0
                                                    &md_dist) < 0)
1722
0
                                        return -1;
1723
0
                                    nm++;
1724
0
                                } else {
1725
0
                                    md_dist++;
1726
0
                                }
1727
0
                            }
1728
0
                        }
1729
0
                    }
1730
0
                    memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
1731
0
                }
1732
0
                ref_pos += cr->len - seq_pos + 1;
1733
0
            }
1734
0
        } else if (cr->ref_id >= 0) {
1735
            // So alignment end can be computed even when not decoding sequence
1736
0
            ref_pos += cr->len - seq_pos + 1;
1737
0
        }
1738
1739
0
        if (ncigar+1 >= cigar_alloc) {
1740
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1741
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1742
0
                return -1;
1743
0
            s->cigar = cigar;
1744
0
        }
1745
#ifdef USE_X
1746
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
1747
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1748
            cig_len = 0;
1749
        }
1750
        cig_op = BAM_CBASE_MATCH;
1751
#else
1752
0
        if (cig_len && cig_op != BAM_CMATCH) {
1753
0
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1754
0
            cig_len = 0;
1755
0
        }
1756
0
        cig_op = BAM_CMATCH;
1757
0
#endif
1758
0
        cig_len += cr->len - seq_pos+1;
1759
0
    }
1760
1761
0
 skip_cigar:
1762
1763
0
    if ((ds & CRAM_FN) && decode_md) {
1764
0
        if (md_dist >= 0)
1765
0
            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1766
0
    }
1767
1768
0
    if (cig_len) {
1769
0
        if (ncigar >= cigar_alloc) {
1770
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1771
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1772
0
                return -1;
1773
0
            s->cigar = cigar;
1774
0
        }
1775
1776
0
        cigar[ncigar++] = (cig_len<<4) + cig_op;
1777
0
    }
1778
1779
0
    cr->ncigar = ncigar - cr->cigar;
1780
0
    cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
1781
1782
    //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1783
1784
0
    if (ds & CRAM_MQ) {
1785
0
        if (!codecs[DS_MQ]) return -1;
1786
0
        r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
1787
0
                                   (char *)&cr->mqual, &out_sz);
1788
0
    } else {
1789
0
        cr->mqual = 40;
1790
0
    }
1791
1792
0
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1793
0
        int32_t out_sz2 = cr->len;
1794
1795
0
        if (!codecs[DS_QS]) return -1;
1796
0
        r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1797
0
                                   qual, &out_sz2);
1798
0
    }
1799
1800
0
    s->cigar = cigar;
1801
0
    s->cigar_alloc = cigar_alloc;
1802
0
    s->ncigar = ncigar;
1803
1804
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1805
0
        cr->len = 0;
1806
1807
0
    if (decode_md) {
1808
0
        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1809
0
        size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
1810
0
        if (has_MD < 0) {
1811
            // has_MD < 0; already have MDZ allocated in aux at -has_MD,
1812
            // but wrote MD to end of aux (at orig_aux).
1813
            // We need some memmoves to shuffle it around.
1814
0
            char tmp_MD_[1024], *tmp_MD = tmp_MD_;
1815
0
            unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
1816
0
            if (sz > 1024) {
1817
0
                tmp_MD = malloc(sz);
1818
0
                if (!tmp_MD)
1819
0
                    return -1;
1820
0
            }
1821
0
            memcpy(tmp_MD, orig_aux_p, sz);
1822
0
            memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
1823
0
                    &BLOCK_DATA(s->aux_blk)[-has_MD],
1824
0
                    orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
1825
0
            memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
1826
0
            if (tmp_MD != tmp_MD_)
1827
0
                free(tmp_MD);
1828
1829
0
            if (-has_NM > -has_MD)
1830
                // we inserted before NM, so move it up a bit
1831
0
                has_NM -= sz;
1832
0
        }
1833
        // else has_MD == 0 and we've already appended MD to the end.
1834
1835
0
        cr->aux_size += sz;
1836
0
    }
1837
1838
0
    if (decode_nm) {
1839
0
        if (has_NM == 0) {
1840
0
            char buf[7];
1841
0
            size_t buf_size;
1842
0
            buf[0] = 'N'; buf[1] = 'M';
1843
0
            if (nm <= UINT8_MAX) {
1844
0
                buf_size = 4;
1845
0
                buf[2] = 'C';
1846
0
                buf[3] = (nm>> 0) & 0xff;
1847
0
            } else if (nm <= UINT16_MAX) {
1848
0
                buf_size = 5;
1849
0
                buf[2] = 'S';
1850
0
                buf[3] = (nm>> 0) & 0xff;
1851
0
                buf[4] = (nm>> 8) & 0xff;
1852
0
            } else {
1853
0
                buf_size = 7;
1854
0
                buf[2] = 'I';
1855
0
                buf[3] = (nm>> 0) & 0xff;
1856
0
                buf[4] = (nm>> 8) & 0xff;
1857
0
                buf[5] = (nm>>16) & 0xff;
1858
0
                buf[6] = (nm>>24) & 0xff;
1859
0
            }
1860
0
            BLOCK_APPEND(s->aux_blk, buf, buf_size);
1861
0
            cr->aux_size += buf_size;
1862
0
        } else {
1863
            // Preallocated space for NM at -has_NM into aux block
1864
0
            unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
1865
0
            buf[0] = (nm>> 0) & 0xff;
1866
0
            buf[1] = (nm>> 8) & 0xff;
1867
0
            buf[2] = (nm>>16) & 0xff;
1868
0
            buf[3] = (nm>>24) & 0xff;
1869
0
        }
1870
0
    }
1871
1872
0
    return r;
1873
1874
0
 beyond_slice:
1875
    // Cramtools can create CRAMs that have sequence features outside the
1876
    // stated range of the container & slice reference extents (start + span).
1877
    // We have to check for these in many places, but for brevity have the
1878
    // error reporting in only one.
1879
0
    hts_log_error("CRAM CIGAR extends beyond slice reference extents");
1880
0
    return -1;
1881
1882
0
 block_err:
1883
0
    return -1;
1884
0
}
1885
1886
/*
1887
 * Quick and simple hash lookup for cram_map arrays
1888
 */
1889
0
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1890
0
    cram_map *m;
1891
1892
0
    m = map[CRAM_MAP(key[0],key[1])];
1893
0
    while (m && m->key != id)
1894
0
        m= m->next;
1895
1896
0
    return m;
1897
0
}
1898
1899
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1900
1901
1902
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1903
0
                               cram_block *blk, cram_record *cr) {
1904
0
    int i, r = 0, out_sz = 1;
1905
0
    unsigned char ntags;
1906
1907
0
    if (!c->comp_hdr->codecs[DS_TC]) return -1;
1908
0
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1909
0
                                            (char *)&ntags, &out_sz);
1910
0
    cr->ntags = ntags;
1911
1912
    //printf("TC=%d\n", cr->ntags);
1913
0
    cr->aux_size = 0;
1914
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1915
1916
0
    for (i = 0; i < cr->ntags; i++) {
1917
0
        int32_t id, out_sz = 1;
1918
0
        unsigned char tag_data[3];
1919
0
        cram_map *m;
1920
1921
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1922
0
        if (!c->comp_hdr->codecs[DS_TN]) return -1;
1923
0
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1924
0
                                                blk, (char *)&id, &out_sz);
1925
0
        if (out_sz == 3) {
1926
            // Tag name stored as 3 chars instead of an int?
1927
0
            memcpy(tag_data, &id, 3);
1928
0
        } else {
1929
0
            tag_data[0] = (id>>16) & 0xff;
1930
0
            tag_data[1] = (id>>8)  & 0xff;
1931
0
            tag_data[2] = id       & 0xff;
1932
0
        }
1933
1934
0
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1935
0
        if (!m)
1936
0
            return -1;
1937
0
        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1938
1939
0
        if (!m->codec) return -1;
1940
0
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1941
1942
0
        cr->aux_size += out_sz + 3;
1943
0
    }
1944
1945
0
    return r;
1946
1947
0
 block_err:
1948
0
    return -1;
1949
0
}
1950
1951
// has_MD and has_NM are filled out with 0 for none present,
1952
// 1 for present and verbatim, and -pos for present as placeholder
1953
// (MD*, NM*) to be generated and filled out at offset +pos.
1954
static int cram_decode_aux(cram_fd *fd,
1955
                           cram_container *c, cram_slice *s,
1956
                           cram_block *blk, cram_record *cr,
1957
0
                           int *has_MD, int *has_NM) {
1958
0
    int i, r = 0, out_sz = 1;
1959
0
    int32_t TL = 0;
1960
0
    unsigned char *TN;
1961
0
    uint32_t ds = s->data_series;
1962
1963
0
    if (!(ds & (CRAM_TL|CRAM_aux))) {
1964
0
        cr->aux = 0;
1965
0
        cr->aux_size = 0;
1966
0
        return 0;
1967
0
    }
1968
1969
0
    if (!c->comp_hdr->codecs[DS_TL]) return -1;
1970
0
    r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1971
0
                                            (char *)&TL, &out_sz);
1972
0
    if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1973
0
        return -1;
1974
1975
0
    TN = c->comp_hdr->TL[TL];
1976
0
    cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1977
1978
    //printf("TC=%d\n", cr->ntags);
1979
0
    cr->aux_size = 0;
1980
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1981
1982
0
    if (!(ds & CRAM_aux))
1983
0
        return 0;
1984
1985
0
    for (i = 0; i < cr->ntags; i++) {
1986
0
        int32_t id, out_sz = 1;
1987
0
        unsigned char tag_data[7];
1988
0
        cram_map *m;
1989
1990
0
        if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1991
0
            *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
1992
0
        if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1993
0
            *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
1994
1995
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1996
0
        tag_data[0] = TN[0];
1997
0
        tag_data[1] = TN[1];
1998
0
        tag_data[2] = TN[2];
1999
0
        id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2000
2001
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
2002
            // Place holder, fill out contents later.
2003
0
            int tag_data_size;
2004
0
            if (TN[0] == 'N' && TN[1] == 'M') {
2005
                // Use a fixed size, so we can allocate room for it now.
2006
0
                memcpy(&tag_data[2], "I\0\0\0\0", 5);
2007
0
                tag_data_size = 7;
2008
0
            } else if (TN[0] == 'R' && TN[1] == 'G') {
2009
                // RG is variable size, but known already.  Insert now
2010
0
                TN += 3;
2011
                // Equiv to fd->header->hrecs->rg[cr->rg], but this is the
2012
                // new header API equivalent.
2013
0
                const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
2014
0
                if (!rg)
2015
0
                    continue;
2016
2017
0
                size_t rg_len = strlen(rg);
2018
0
                tag_data[2] = 'Z';
2019
0
                BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2020
0
                BLOCK_APPEND(s->aux_blk, rg, rg_len);
2021
0
                BLOCK_APPEND_CHAR(s->aux_blk, '\0');
2022
0
                cr->aux_size += 3 + rg_len + 1;
2023
0
                cr->rg = -1; // prevents auto-add later
2024
0
                continue;
2025
0
            } else {
2026
                // Unknown size.  We'll insert MD into stream later.
2027
0
                tag_data[2] = 'Z';
2028
0
                tag_data_size = 3;
2029
0
            }
2030
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
2031
0
            cr->aux_size += tag_data_size;
2032
0
            TN += 3;
2033
0
        } else {
2034
0
            TN += 3;
2035
0
            m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2036
0
            if (!m)
2037
0
                return -1;
2038
2039
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2040
2041
0
            if (!m->codec) return -1;
2042
0
            r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2043
0
            if (r) break;
2044
0
            cr->aux_size += out_sz + 3;
2045
2046
            // cF CRAM flags.
2047
0
            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
2048
                // Remove cF tag
2049
0
                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
2050
0
                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
2051
0
                cr->aux_size -= out_sz+3;
2052
2053
                // bit 1 => don't auto-decode MD.
2054
                // Pretend MD is present verbatim, so we don't auto-generate
2055
0
                if ((cF & 1) && has_MD && *has_MD == 0)
2056
0
                    *has_MD = 1;
2057
2058
                // bit 1 => don't auto-decode NM
2059
0
                if ((cF & 2) && has_NM && *has_NM == 0)
2060
0
                    *has_NM = 1;
2061
0
            }
2062
0
        }
2063
2064
        // We could go to 2^32 fine, but we shouldn't be hitting this anyway,
2065
        // and it's protecting against memory hogs too.
2066
0
        if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
2067
0
            hts_log_error("CRAM->BAM aux block size overflow");
2068
0
            goto block_err;
2069
0
        }
2070
0
    }
2071
2072
0
    return r;
2073
2074
0
 block_err:
2075
0
    return -1;
2076
0
}
2077
2078
/* Resolve mate pair cross-references between recs within this slice */
2079
417
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2080
417
    int rec;
2081
2082
417
    if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2083
0
        for (rec = 0; rec < s->hdr->num_records; rec++) {
2084
0
            cram_record *cr = &s->crecs[rec];
2085
2086
0
            cr->tlen = 0;
2087
0
            cr->mate_pos = 0;
2088
0
            cr->mate_ref_id = -1;
2089
0
        }
2090
2091
0
        return 0;
2092
0
    }
2093
2094
417
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2095
0
        cram_record *cr = &s->crecs[rec];
2096
2097
0
        if (cr->mate_line >= 0) {
2098
0
            if (cr->mate_line < s->hdr->num_records) {
2099
                /*
2100
                 * On the first read, loop through computing lengths.
2101
                 * It's not perfect as we have one slice per reference so we
2102
                 * cannot detect when TLEN should be zero due to seqs that
2103
                 * map to multiple references.
2104
                 *
2105
                 * We also cannot set tlen correct when it spans a slice for
2106
                 * other reasons. This may make tlen too small. Should we
2107
                 * fix this by forcing TLEN to be stored verbatim in such cases?
2108
                 *
2109
                 * Or do we just admit defeat and output 0 for tlen? It's the
2110
                 * safe option...
2111
                 */
2112
0
                if (cr->tlen == INT64_MIN) {
2113
0
                    int id1 = rec, id2 = rec;
2114
0
                    int64_t aleft = cr->apos, aright = cr->aend;
2115
0
                    int64_t tlen;
2116
0
                    int ref = cr->ref_id;
2117
2118
                    // number of segments starting at the same point.
2119
0
                    int left_cnt = 0;
2120
0
                    int right_cnt = 0;
2121
2122
0
                    do {
2123
0
                        if (aleft > s->crecs[id2].apos)
2124
0
                            aleft = s->crecs[id2].apos, left_cnt = 1;
2125
0
                        else if (aleft == s->crecs[id2].apos)
2126
0
                            left_cnt++;
2127
0
                        if (aright < s->crecs[id2].aend) {
2128
0
                            aright = s->crecs[id2].aend;
2129
0
                            right_cnt = 1;
2130
0
                        } else if (aright == s->crecs[id2].aend) {
2131
0
                            right_cnt++;
2132
0
                        }
2133
0
                        if (s->crecs[id2].mate_line == -1) {
2134
0
                            s->crecs[id2].mate_line = rec;
2135
0
                            break;
2136
0
                        }
2137
0
                        if (s->crecs[id2].mate_line <= id2 ||
2138
0
                            s->crecs[id2].mate_line >= s->hdr->num_records)
2139
0
                            return -1;
2140
0
                        id2 = s->crecs[id2].mate_line;
2141
2142
0
                        if (s->crecs[id2].ref_id != ref)
2143
0
                            ref = -1;
2144
0
                    } while (id2 != id1);
2145
2146
0
                    if (ref != -1) {
2147
0
                        tlen = aright - aleft + 1;
2148
0
                        id1 = id2 = rec;
2149
2150
                        /*
2151
                         * When we have two seqs with identical start and
2152
                         * end coordinates, set +/- tlen based on 1st/last
2153
                         * bit flags instead, as a tie breaker.
2154
                         */
2155
0
                        if (s->crecs[id2].apos == aleft &&
2156
0
                            (s->crecs[id2].aend < aright ||
2157
0
                             left_cnt <= 1)) {
2158
                            // Leftmost, and not the rightmost
2159
0
                            s->crecs[id2].tlen = tlen;
2160
0
                            tlen = -tlen;
2161
0
                        } else if (s->crecs[id2].apos == aleft &&
2162
0
                                   s->crecs[id2].aend == aright &&
2163
0
                                   left_cnt > 1 && right_cnt > 1) {
2164
                            // Both leftmost and rightmost, resolve tie via
2165
                            // the BAM flags so changing order doesn't change
2166
                            // TLEN signs.
2167
0
                            if (s->crecs[id2].flags & BAM_FREAD1) {
2168
0
                                s->crecs[id2].tlen = tlen;
2169
0
                                tlen = -tlen;
2170
0
                            } else {
2171
0
                                s->crecs[id2].tlen = -tlen;
2172
0
                            }
2173
0
                        } else {
2174
                            // Rightmost or an internal
2175
0
                            s->crecs[id2].tlen = -tlen;
2176
0
                        }
2177
2178
0
                        id2 = s->crecs[id2].mate_line;
2179
0
                        while (id2 != id1) {
2180
0
                            s->crecs[id2].tlen = tlen;
2181
0
                            id2 = s->crecs[id2].mate_line;
2182
0
                        }
2183
0
                    } else {
2184
0
                        id1 = id2 = rec;
2185
2186
0
                        s->crecs[id2].tlen = 0;
2187
0
                        id2 = s->crecs[id2].mate_line;
2188
0
                        while (id2 != id1) {
2189
0
                            s->crecs[id2].tlen = 0;
2190
0
                            id2 = s->crecs[id2].mate_line;
2191
0
                        }
2192
0
                    }
2193
0
                }
2194
2195
0
                cr->mate_pos = s->crecs[cr->mate_line].apos;
2196
0
                cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2197
2198
                // paired
2199
0
                cr->flags |= BAM_FPAIRED;
2200
2201
                // set mate unmapped if needed
2202
0
                if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2203
0
                    cr->flags |= BAM_FMUNMAP;
2204
0
                    cr->tlen = 0;
2205
0
                }
2206
0
                if (cr->flags & BAM_FUNMAP) {
2207
0
                    cr->tlen = 0;
2208
0
                }
2209
2210
                // set mate reversed if needed
2211
0
                if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2212
0
                    cr->flags |= BAM_FMREVERSE;
2213
0
            } else {
2214
0
                hts_log_error("Mate line out of bounds: %d vs [0, %d]",
2215
0
                              cr->mate_line, s->hdr->num_records-1);
2216
0
            }
2217
2218
            /* FIXME: construct read names here too if needed */
2219
0
        } else {
2220
0
            if (cr->mate_flags & CRAM_M_REVERSE) {
2221
0
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2222
0
            }
2223
0
            if (cr->mate_flags & CRAM_M_UNMAP) {
2224
0
                cr->flags |= BAM_FMUNMAP;
2225
                //cr->mate_ref_id = -1;
2226
0
            }
2227
0
            if (!(cr->flags & BAM_FPAIRED))
2228
0
                cr->mate_ref_id = -1;
2229
0
        }
2230
2231
0
        if (cr->tlen == INT64_MIN)
2232
0
            cr->tlen = 0; // Just incase
2233
0
    }
2234
2235
417
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2236
0
        cram_record *cr = &s->crecs[rec];
2237
0
        if (cr->explicit_tlen != INT64_MIN)
2238
0
            cr->tlen = cr->explicit_tlen;
2239
0
    }
2240
2241
417
    return 0;
2242
417
}
2243
2244
0
static char *md5_print(unsigned char *md5, char *out) {
2245
0
    int i;
2246
0
    for (i = 0; i < 16; i++) {
2247
0
        out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2248
0
        out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2249
0
    }
2250
0
    out[32] = 0;
2251
2252
0
    return out;
2253
0
}
2254
2255
/*
2256
 * Utility function to decode tlen (ISIZE), as it's called
2257
 * in multiple places.
2258
 *
2259
 * Returns codec return value (0 on success).
2260
 */
2261
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
2262
0
                            cram_block *blk, int64_t *tlen) {
2263
0
    int out_sz = 1, r = 0;
2264
2265
0
    if (!c->comp_hdr->codecs[DS_TS]) return -1;
2266
0
    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2267
0
        int32_t i32;
2268
0
        r |= c->comp_hdr->codecs[DS_TS]
2269
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2270
0
                     (char *)&i32, &out_sz);
2271
0
        *tlen = i32;
2272
0
    } else {
2273
0
        r |= c->comp_hdr->codecs[DS_TS]
2274
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2275
0
                     (char *)tlen, &out_sz);
2276
0
    }
2277
0
    return r;
2278
0
}
2279
2280
/*
2281
 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2282
 * Returns 0 on success
2283
 *        -1 on failure
2284
 */
2285
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2286
477
                      sam_hdr_t *sh) {
2287
477
    cram_block *blk = s->block[0];
2288
477
    int32_t bf, ref_id;
2289
477
    unsigned char cf;
2290
477
    int out_sz, r = 0;
2291
477
    int rec;
2292
477
    char *seq = NULL, *qual = NULL;
2293
477
    int unknown_rg = -1;
2294
477
    int embed_ref;
2295
477
    char **refs = NULL;
2296
477
    uint32_t ds;
2297
477
    sam_hrecs_t *bfd = sh->hrecs;
2298
2299
477
    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2300
3
        return -1;
2301
2302
474
    ds = s->data_series;
2303
2304
474
    blk->bit = 7; // MSB first
2305
2306
    // Study the blocks and estimate approx sizes to preallocate.
2307
    // This looks to speed up decoding by around 8-9%.
2308
    // We can always shrink back down at the end if we overestimated.
2309
    // However it's likely that this also saves memory as own growth
2310
    // factor (*=1.5) is never applied.
2311
474
    {
2312
474
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2313
474
        int qsize=0, nsize=0, q_id=0;
2314
#else
2315
        int qsize, nsize, q_id;
2316
        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2317
        //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2318
#endif
2319
2320
474
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2321
474
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2322
474
        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2323
2324
        // To do - consider using q_id here to usurp the quality block and
2325
        // avoid a memcpy during decode.
2326
        // Specifically when quality is an external block uniquely used by
2327
        // DS_QS only, then we can set s->qual_blk directly to this
2328
        // block and save the codec->decode() calls. (Approx 3% cpu saving)
2329
474
    }
2330
2331
    /* Look for unknown RG, added as last by Java CRAM? */
2332
474
    if (bfd->nrg > 0 &&
2333
330
        bfd->rg[bfd->nrg-1].name != NULL &&
2334
330
        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2335
0
        unknown_rg = bfd->nrg-1;
2336
2337
474
    if (blk->content_type != CORE)
2338
3
        return -1;
2339
2340
471
    if (s->crecs)
2341
0
        free(s->crecs);
2342
471
    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2343
3
        return -1;
2344
2345
468
    ref_id = s->hdr->ref_seq_id;
2346
468
    if (CRAM_MAJOR_VERS(fd->version) < 4)
2347
468
       embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2348
0
    else
2349
0
       embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
2350
2351
468
    if (ref_id >= 0) {
2352
432
        if (embed_ref) {
2353
411
            cram_block *b;
2354
411
            if (s->hdr->ref_base_id < 0) {
2355
0
                hts_log_error("No reference specified and no embedded reference is available"
2356
0
                              " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2357
0
                              s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2358
0
                return -1;
2359
0
            }
2360
411
            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2361
411
            if (!b)
2362
12
                return -1;
2363
399
            if (cram_uncompress_block(b) != 0)
2364
0
                return -1;
2365
399
            s->ref = (char *)BLOCK_DATA(b);
2366
399
            s->ref_start = s->hdr->ref_seq_start;
2367
399
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2368
399
            if (s->hdr->ref_seq_span > b->uncomp_size) {
2369
3
                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2370
3
                              ref_id, s->ref_start, s->ref_end);
2371
3
                return -1;
2372
3
            }
2373
399
        } else if (!c->comp_hdr->no_ref) {
2374
            //// Avoid Java cramtools bug by loading entire reference seq
2375
            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2376
            //s->ref_start = 1;
2377
2378
21
            if (fd->required_fields & SAM_SEQ) {
2379
21
                s->ref =
2380
21
                cram_get_ref(fd, s->hdr->ref_seq_id,
2381
21
                             s->hdr->ref_seq_start,
2382
21
                             s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2383
21
            }
2384
21
            s->ref_start = s->hdr->ref_seq_start;
2385
21
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2386
2387
            /* Sanity check */
2388
21
            if (s->ref_start < 0) {
2389
0
                hts_log_warning("Slice starts before base 1"
2390
0
                                " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2391
0
                                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2392
0
                s->ref_start = 0;
2393
0
            }
2394
21
            pthread_mutex_lock(&fd->ref_lock);
2395
21
            pthread_mutex_lock(&fd->refs->lock);
2396
21
            if ((fd->required_fields & SAM_SEQ) &&
2397
21
                ref_id < fd->refs->nref && fd->refs->ref_id &&
2398
0
                s->ref_end > fd->refs->ref_id[ref_id]->length) {
2399
0
                s->ref_end = fd->refs->ref_id[ref_id]->length;
2400
0
            }
2401
21
            pthread_mutex_unlock(&fd->refs->lock);
2402
21
            pthread_mutex_unlock(&fd->ref_lock);
2403
21
        }
2404
432
    }
2405
2406
453
    if ((fd->required_fields & SAM_SEQ) &&
2407
453
        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2408
21
        hts_log_error("Unable to fetch reference %s:%"PRId64"-%"PRId64,
2409
21
                      fd->refs->ref_id && ref_id >= 0 && ref_id < fd->refs->nref
2410
21
                      ? fd->refs->ref_id[ref_id]->name
2411
21
                      : "unknown",
2412
21
                      s->hdr->ref_seq_start,
2413
21
                      s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2414
21
        return -1;
2415
21
    }
2416
2417
432
    if (CRAM_MAJOR_VERS(fd->version) != 1
2418
0
        && (fd->required_fields & SAM_SEQ)
2419
0
        && s->hdr->ref_seq_id >= 0
2420
0
        && !fd->ignore_md5
2421
0
        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2422
0
        hts_md5_context *md5;
2423
0
        unsigned char digest[16];
2424
2425
0
        if (s->ref && s->hdr->ref_seq_id >= 0) {
2426
0
            int start, len;
2427
2428
0
            if (s->hdr->ref_seq_start >= s->ref_start) {
2429
0
                start = s->hdr->ref_seq_start - s->ref_start;
2430
0
            } else {
2431
0
                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2432
0
                                ref_id, s->ref_start, s->ref_end);
2433
0
                start = 0;
2434
0
            }
2435
2436
0
            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2437
0
                len = s->hdr->ref_seq_span;
2438
0
            } else {
2439
0
                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2440
0
                                ref_id, s->ref_start, s->ref_end);
2441
0
                len = s->ref_end - s->ref_start + 1;
2442
0
            }
2443
2444
0
            if (!(md5 = hts_md5_init()))
2445
0
                return -1;
2446
0
            if (start + len > s->ref_end - s->ref_start + 1)
2447
0
                len = s->ref_end - s->ref_start + 1 - start;
2448
0
            if (len >= 0)
2449
0
                hts_md5_update(md5, s->ref + start, len);
2450
0
            hts_md5_final(digest, md5);
2451
0
            hts_md5_destroy(md5);
2452
0
        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2453
0
            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2454
0
            if (b) {
2455
0
                if (!(md5 = hts_md5_init()))
2456
0
                    return -1;
2457
0
                hts_md5_update(md5, b->data, b->uncomp_size);
2458
0
                hts_md5_final(digest, md5);
2459
0
                hts_md5_destroy(md5);
2460
0
            }
2461
0
        }
2462
2463
0
        if (!c->comp_hdr->no_ref &&
2464
0
            ((!s->ref && s->hdr->ref_base_id < 0)
2465
0
             || memcmp(digest, s->hdr->md5, 16) != 0)) {
2466
0
            char M[33];
2467
0
            const char *rname = sam_hdr_tid2name(sh, ref_id);
2468
0
            if (!rname) rname="?"; // cannot happen normally
2469
0
            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
2470
0
                          rname, s->ref_start, s->ref_end);
2471
0
            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
2472
0
            hts_log_error("Ref   : %s", md5_print(digest, M));
2473
0
            kstring_t ks = KS_INITIALIZE;
2474
0
            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
2475
0
                hts_log_error("@SQ M5: %s", ks.s);
2476
0
            hts_log_error("Please check the reference given is correct");
2477
0
            ks_free(&ks);
2478
0
            return -1;
2479
0
        }
2480
0
    }
2481
2482
432
    if (ref_id == -2) {
2483
27
        pthread_mutex_lock(&fd->ref_lock);
2484
27
        pthread_mutex_lock(&fd->refs->lock);
2485
27
        refs = calloc(fd->refs->nref, sizeof(char *));
2486
27
        pthread_mutex_unlock(&fd->refs->lock);
2487
27
        pthread_mutex_unlock(&fd->ref_lock);
2488
27
        if (!refs)
2489
0
            return -1;
2490
27
    }
2491
2492
432
    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2493
432
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2494
15
        cram_record *cr = &s->crecs[rec];
2495
15
        int has_MD, has_NM;
2496
2497
        //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2498
2499
15
        cr->s = s;
2500
2501
15
        out_sz = 1; /* decode 1 item */
2502
15
        if (ds & CRAM_BF) {
2503
15
            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
2504
0
            r |= c->comp_hdr->codecs[DS_BF]
2505
0
                            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2506
0
                                     (char *)&bf, &out_sz);
2507
0
            if (r || bf < 0 ||
2508
0
                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2509
0
                goto block_err;
2510
0
            bf = fd->bam_flag_swap[bf];
2511
0
            cr->flags = bf;
2512
0
        } else {
2513
0
            cr->flags = bf = 0x4; // unmapped
2514
0
        }
2515
2516
0
        if (ds & CRAM_CF) {
2517
0
            if (CRAM_MAJOR_VERS(fd->version) == 1) {
2518
                /* CF is byte in 1.0, int32 in 2.0 */
2519
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2520
0
                r |= c->comp_hdr->codecs[DS_CF]
2521
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2522
0
                                         (char *)&cf, &out_sz);
2523
0
                if (r) goto block_err;
2524
0
                cr->cram_flags = cf;
2525
0
            } else {
2526
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2527
0
                r |= c->comp_hdr->codecs[DS_CF]
2528
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2529
0
                                         (char *)&cr->cram_flags, &out_sz);
2530
0
                if (r) goto block_err;
2531
0
                cf = cr->cram_flags;
2532
0
            }
2533
0
        } else {
2534
0
            cf = cr->cram_flags = 0;
2535
0
        }
2536
2537
0
        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2538
0
            if (ds & CRAM_RI) {
2539
0
                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
2540
0
                r |= c->comp_hdr->codecs[DS_RI]
2541
0
                                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2542
0
                                         (char *)&cr->ref_id, &out_sz);
2543
0
                if (r) goto block_err;
2544
0
                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2545
0
                    && cr->ref_id >= 0
2546
0
                    && cr->ref_id != last_ref_id) {
2547
0
                    if (!c->comp_hdr->no_ref) {
2548
                        // Range(fd):  seq >= 0, unmapped -1, unspecified   -2
2549
                        // Slice(s):   seq >= 0, unmapped -1, multiple refs -2
2550
                        // Record(cr): seq >= 0, unmapped -1
2551
0
                        pthread_mutex_lock(&fd->range_lock);
2552
0
                        int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
2553
0
                        pthread_mutex_unlock(&fd->range_lock);
2554
0
                        if  (need_ref) {
2555
0
                            if (!refs[cr->ref_id])
2556
0
                                refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
2557
0
                            if (!(s->ref = refs[cr->ref_id]))
2558
0
                                goto block_err;
2559
0
                        } else {
2560
                            // For multi-ref containers, we don't need to fetch all
2561
                            // refs if we're only querying one.
2562
0
                            s->ref = NULL;
2563
0
                        }
2564
2565
0
                        pthread_mutex_lock(&fd->range_lock);
2566
0
                        int discard_last_ref = (last_ref_id >= 0 &&
2567
0
                                                refs[last_ref_id] &&
2568
0
                                                (fd->range.refid == -2 ||
2569
0
                                                 last_ref_id == fd->range.refid));
2570
0
                        pthread_mutex_unlock(&fd->range_lock);
2571
0
                        if (discard_last_ref) {
2572
0
                            pthread_mutex_lock(&fd->ref_lock);
2573
0
                            discard_last_ref = !fd->unsorted;
2574
0
                            pthread_mutex_unlock(&fd->ref_lock);
2575
0
                        }
2576
0
                        if (discard_last_ref) {
2577
0
                            cram_ref_decr(fd->refs, last_ref_id);
2578
0
                            refs[last_ref_id] = NULL;
2579
0
                        }
2580
0
                    }
2581
0
                    s->ref_start = 1;
2582
0
                    pthread_mutex_lock(&fd->ref_lock);
2583
0
                    pthread_mutex_lock(&fd->refs->lock);
2584
0
                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2585
0
                    pthread_mutex_unlock(&fd->refs->lock);
2586
0
                    pthread_mutex_unlock(&fd->ref_lock);
2587
2588
0
                    last_ref_id = cr->ref_id;
2589
0
                }
2590
0
            } else {
2591
0
                cr->ref_id = -1;
2592
0
            }
2593
0
        } else {
2594
0
            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2595
0
        }
2596
0
        if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
2597
0
            hts_log_error("Requested unknown reference ID %d", cr->ref_id);
2598
0
            goto block_err;
2599
0
        }
2600
2601
0
        if (ds & CRAM_RL) {
2602
0
            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
2603
0
            r |= c->comp_hdr->codecs[DS_RL]
2604
0
                            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2605
0
                                     (char *)&cr->len, &out_sz);
2606
0
            if (r) goto block_err;
2607
0
            if (cr->len < 0) {
2608
0
                hts_log_error("Read has negative length");
2609
0
                goto block_err;
2610
0
            }
2611
0
        }
2612
2613
0
        if (ds & CRAM_AP) {
2614
0
            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
2615
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
2616
0
                r |= c->comp_hdr->codecs[DS_AP]
2617
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2618
0
                                         (char *)&cr->apos, &out_sz);
2619
0
            } else  {
2620
0
                int32_t i32;
2621
0
                r |= c->comp_hdr->codecs[DS_AP]
2622
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2623
0
                                         (char *)&i32, &out_sz);
2624
0
                cr->apos = i32;
2625
0
            }
2626
0
            if (r) goto block_err;;
2627
0
            if (c->comp_hdr->AP_delta) {
2628
0
                if (cr->apos < 0 && c->unsorted == 0) {
2629
                    // cache locally in c->unsorted so we don't have an
2630
                    // excessive number of locks
2631
0
                    pthread_mutex_lock(&fd->ref_lock);
2632
0
                    c->unsorted = fd->unsorted = 1;
2633
0
                    pthread_mutex_unlock(&fd->ref_lock);
2634
0
                }
2635
0
                cr->apos += s->last_apos;
2636
0
            }
2637
0
            s->last_apos=  cr->apos;
2638
0
        } else {
2639
0
            cr->apos = c->ref_seq_start;
2640
0
        }
2641
2642
0
        if (ds & CRAM_RG) {
2643
0
            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
2644
0
            r |= c->comp_hdr->codecs[DS_RG]
2645
0
                           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2646
0
                                    (char *)&cr->rg, &out_sz);
2647
0
            if (r) goto block_err;
2648
0
            if (cr->rg == unknown_rg)
2649
0
                cr->rg = -1;
2650
0
        } else {
2651
0
            cr->rg = -1;
2652
0
        }
2653
2654
0
        cr->name_len = 0;
2655
2656
0
        if (c->comp_hdr->read_names_included) {
2657
0
            int32_t out_sz2 = 1; // block auto grows in decode()
2658
2659
            // Read directly into name cram_block
2660
0
            cr->name = BLOCK_SIZE(s->name_blk);
2661
0
            if (ds & CRAM_RN) {
2662
0
                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2663
0
                r |= c->comp_hdr->codecs[DS_RN]
2664
0
                                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2665
0
                                         (char *)s->name_blk, &out_sz2);
2666
0
                if (r) goto block_err;
2667
0
                cr->name_len = out_sz2;
2668
0
            }
2669
0
        }
2670
2671
0
        cr->mate_pos = 0;
2672
0
        cr->mate_line = -1;
2673
0
        cr->mate_ref_id = -1;
2674
0
        cr->explicit_tlen = INT64_MIN;
2675
0
        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2676
0
            if (ds & CRAM_MF) {
2677
0
                if (CRAM_MAJOR_VERS(fd->version) == 1) {
2678
                    /* MF is byte in 1.0, int32 in 2.0 */
2679
0
                    unsigned char mf;
2680
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2681
0
                    r |= c->comp_hdr->codecs[DS_MF]
2682
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2683
0
                                             blk, (char *)&mf, &out_sz);
2684
0
                    if (r) goto block_err;
2685
0
                    cr->mate_flags = mf;
2686
0
                } else {
2687
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2688
0
                    r |= c->comp_hdr->codecs[DS_MF]
2689
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2690
0
                                             blk,
2691
0
                                             (char *)&cr->mate_flags,
2692
0
                                             &out_sz);
2693
0
                    if (r) goto block_err;
2694
0
                }
2695
0
            } else {
2696
0
                cr->mate_flags = 0;
2697
0
            }
2698
2699
0
            if (!c->comp_hdr->read_names_included) {
2700
0
                int32_t out_sz2 = 1;
2701
2702
                // Read directly into name cram_block
2703
0
                cr->name = BLOCK_SIZE(s->name_blk);
2704
0
                if (ds & CRAM_RN) {
2705
0
                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2706
0
                    r |= c->comp_hdr->codecs[DS_RN]
2707
0
                                    ->decode(s, c->comp_hdr->codecs[DS_RN],
2708
0
                                             blk, (char *)s->name_blk,
2709
0
                                             &out_sz2);
2710
0
                    if (r) goto block_err;
2711
0
                    cr->name_len = out_sz2;
2712
0
                }
2713
0
            }
2714
2715
0
            if (ds & CRAM_NS) {
2716
0
                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
2717
0
                r |= c->comp_hdr->codecs[DS_NS]
2718
0
                                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2719
0
                                         (char *)&cr->mate_ref_id, &out_sz);
2720
0
                if (r) goto block_err;
2721
0
            }
2722
2723
            // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2724
            // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2725
            //     /* Paired, but unmapped */
2726
            //     cr->flags |= BAM_FMUNMAP;
2727
            // }
2728
2729
0
            if (ds & CRAM_NP) {
2730
0
                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
2731
0
                if (CRAM_MAJOR_VERS(fd->version) < 4) {
2732
0
                    int32_t i32;
2733
0
                    r |= c->comp_hdr->codecs[DS_NP]
2734
0
                                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2735
0
                                             (char *)&i32, &out_sz);
2736
0
                    cr->mate_pos = i32;
2737
0
                } else {
2738
0
                    r |= c->comp_hdr->codecs[DS_NP]
2739
0
                        ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2740
0
                                 (char *)&cr->mate_pos, &out_sz);
2741
0
                }
2742
0
                if (r) goto block_err;
2743
0
            }
2744
2745
0
            if (ds & CRAM_TS) {
2746
0
                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
2747
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
2748
0
                if (r) goto block_err;
2749
0
            } else {
2750
0
                cr->tlen = INT64_MIN;
2751
0
            }
2752
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2753
            // else not detached
2754
0
            if (ds & CRAM_NF) {
2755
0
                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
2756
0
                r |= c->comp_hdr->codecs[DS_NF]
2757
0
                                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2758
0
                                         (char *)&cr->mate_line, &out_sz);
2759
0
                if (r) goto block_err;
2760
0
                cr->mate_line += rec + 1;
2761
2762
                //cr->name_len = sprintf(name, "%d", name_id++);
2763
                //cr->name = DSTRING_LEN(name_ds);
2764
                //dstring_nappend(name_ds, name, cr->name_len);
2765
2766
0
                cr->mate_ref_id = -1;
2767
0
                cr->tlen = INT64_MIN;
2768
0
                cr->mate_pos = 0;
2769
0
            } else  {
2770
0
                cr->mate_flags = 0;
2771
0
                cr->tlen = INT64_MIN;
2772
0
            }
2773
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2774
0
                if (ds & CRAM_TS) {
2775
0
                    r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2776
0
                    if (r) return r;
2777
0
                } else {
2778
0
                    cr->mate_flags = 0;
2779
0
                    cr->tlen = INT64_MIN;
2780
0
                }
2781
0
            }
2782
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2783
0
            if (ds & CRAM_TS) {
2784
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2785
0
                if (r) return r;
2786
0
            } else {
2787
0
                cr->mate_flags = 0;
2788
0
                cr->tlen = INT64_MIN;
2789
0
            }
2790
0
        } else {
2791
0
            cr->mate_flags = 0;
2792
0
            cr->tlen = INT64_MIN;
2793
0
        }
2794
        /*
2795
        else if (!name[0]) {
2796
            //name[0] = '?'; name[1] = 0;
2797
            //cr->name_len = 1;
2798
            //cr->name=  DSTRING_LEN(s->name_ds);
2799
            //dstring_nappend(s->name_ds, "?", 1);
2800
2801
            cr->mate_ref_id = -1;
2802
            cr->tlen = 0;
2803
            cr->mate_pos = 0;
2804
        }
2805
        */
2806
2807
        /* Auxiliary tags */
2808
0
        has_MD = has_NM = 0;
2809
0
        if (CRAM_MAJOR_VERS(fd->version) == 1)
2810
0
            r |= cram_decode_aux_1_0(c, s, blk, cr);
2811
0
        else
2812
0
            r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
2813
0
        if (r) goto block_err;
2814
2815
        /* Fake up dynamic string growth and appending */
2816
0
        if (ds & CRAM_RL) {
2817
0
            cr->seq = BLOCK_SIZE(s->seqs_blk);
2818
0
            BLOCK_RESIZE(s->seqs_blk, cr->seq + cr->len);
2819
0
            seq = (char *)BLOCK_END(s->seqs_blk);
2820
0
            BLOCK_SIZE(s->seqs_blk) += cr->len;
2821
2822
0
            if (!seq)
2823
0
                goto block_err;
2824
2825
0
            cr->qual = BLOCK_SIZE(s->qual_blk);
2826
0
            BLOCK_RESIZE(s->qual_blk, cr->qual + cr->len);
2827
0
            qual = (char *)BLOCK_END(s->qual_blk);
2828
0
            BLOCK_SIZE(s->qual_blk) += cr->len;
2829
2830
0
            if (!s->ref)
2831
0
                memset(seq, '=', cr->len);
2832
0
        }
2833
2834
0
        if (!(bf & BAM_FUNMAP)) {
2835
0
            if ((ds & CRAM_AP) && cr->apos <= 0) {
2836
0
                hts_log_error("Read has alignment position %"PRId64
2837
0
                              " but no unmapped flag",
2838
0
                              cr->apos);
2839
0
                goto block_err;
2840
0
            }
2841
            /* Decode sequence and generate CIGAR */
2842
0
            if (ds & (CRAM_SEQ | CRAM_MQ)) {
2843
0
                r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
2844
0
                                     has_MD, has_NM);
2845
0
                if (r) goto block_err;
2846
0
            } else {
2847
0
                cr->cigar = 0;
2848
0
                cr->ncigar = 0;
2849
0
                cr->aend = cr->apos;
2850
0
                cr->mqual = 0;
2851
0
            }
2852
0
        } else {
2853
0
            int out_sz2 = cr->len;
2854
2855
            //puts("Unmapped");
2856
0
            cr->cigar = 0;
2857
0
            cr->ncigar = 0;
2858
0
            cr->aend = cr->apos;
2859
0
            cr->mqual = 0;
2860
2861
0
            if (ds & CRAM_BA && cr->len) {
2862
0
                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
2863
0
                r |= c->comp_hdr->codecs[DS_BA]
2864
0
                                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2865
0
                                         (char *)seq, &out_sz2);
2866
0
                if (r) goto block_err;
2867
0
            }
2868
2869
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2870
0
                out_sz2 = cr->len;
2871
0
                if (ds & CRAM_QS && cr->len >= 0) {
2872
0
                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
2873
0
                    r |= c->comp_hdr->codecs[DS_QS]
2874
0
                                    ->decode(s, c->comp_hdr->codecs[DS_QS],
2875
0
                                             blk, qual, &out_sz2);
2876
0
                    if (r) goto block_err;
2877
0
                }
2878
0
            } else {
2879
0
                if (ds & CRAM_RL)
2880
0
                    memset(qual, 255, cr->len);
2881
0
            }
2882
0
        }
2883
2884
0
        if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
2885
0
            int i, j;
2886
0
            for (i = 0, j = cr->len-1; i < j; i++, j--) {
2887
0
                unsigned char c;
2888
0
                c = qual[i];
2889
0
                qual[i] = qual[j];
2890
0
                qual[j] = c;
2891
0
            }
2892
0
        }
2893
0
    }
2894
2895
417
    pthread_mutex_lock(&fd->ref_lock);
2896
417
    if (refs) {
2897
15
        int i;
2898
24
        for (i = 0; i < fd->refs->nref; i++) {
2899
9
            if (refs[i])
2900
0
                cram_ref_decr(fd->refs, i);
2901
9
        }
2902
15
        free(refs);
2903
15
        refs = NULL;
2904
402
    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2905
0
        cram_ref_decr(fd->refs, ref_id);
2906
0
    }
2907
417
    pthread_mutex_unlock(&fd->ref_lock);
2908
2909
    /* Resolve mate pair cross-references between recs within this slice */
2910
417
    r |= cram_decode_slice_xref(s, fd->required_fields);
2911
2912
    // Free the original blocks as we no longer need these.
2913
417
    {
2914
417
        int i;
2915
2.12k
        for (i = 0; i < s->hdr->num_blocks; i++) {
2916
1.70k
            cram_block *b = s->block[i];
2917
1.70k
            cram_free_block(b);
2918
1.70k
            s->block[i] = NULL;
2919
1.70k
        }
2920
417
    }
2921
2922
    // Also see initial BLOCK_RESIZE_EXACT at top of function.
2923
    // As we grow blocks we overallocate by up to 50%. So shrink
2924
    // back to their final sizes here.
2925
    //
2926
    //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2927
    //      (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2928
    //      (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2929
    //      (int)s->name_blk->byte, (int)s->name_blk->alloc,
2930
    //      (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2931
417
    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2932
417
    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2933
417
    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2934
417
    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2935
2936
417
    return r;
2937
2938
15
 block_err:
2939
15
    if (refs) {
2940
12
        int i;
2941
12
        pthread_mutex_lock(&fd->ref_lock);
2942
30
        for (i = 0; i < fd->refs->nref; i++) {
2943
18
            if (refs[i])
2944
0
                cram_ref_decr(fd->refs, i);
2945
18
        }
2946
12
        free(refs);
2947
12
        pthread_mutex_unlock(&fd->ref_lock);
2948
12
    }
2949
2950
15
    return -1;
2951
417
}
2952
2953
typedef struct {
2954
    cram_fd *fd;
2955
    cram_container *c;
2956
    cram_slice *s;
2957
    sam_hdr_t *h;
2958
    int exit_code;
2959
} cram_decode_job;
2960
2961
0
void *cram_decode_slice_thread(void *arg) {
2962
0
    cram_decode_job *j = (cram_decode_job *)arg;
2963
2964
0
    j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2965
2966
0
    return j;
2967
0
}
2968
2969
/*
2970
 * Spawn a multi-threaded version of cram_decode_slice().
2971
 */
2972
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2973
477
                         sam_hdr_t *bfd) {
2974
477
    cram_decode_job *j;
2975
477
    int nonblock;
2976
2977
477
    if (!fd->pool)
2978
477
        return cram_decode_slice(fd, c, s, bfd);
2979
2980
0
    if (!(j = malloc(sizeof(*j))))
2981
0
        return -1;
2982
2983
0
    j->fd = fd;
2984
0
    j->c  = c;
2985
0
    j->s  = s;
2986
0
    j->h  = bfd;
2987
2988
0
    nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2989
2990
0
    int saved_errno = errno;
2991
0
    errno = 0;
2992
0
    if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2993
0
                                  j, nonblock)) {
2994
        /* Would block */
2995
0
        if (errno != EAGAIN)
2996
0
            return -1;
2997
0
        fd->job_pending = j;
2998
0
    } else {
2999
0
        fd->job_pending = NULL;
3000
0
    }
3001
0
    errno = saved_errno;
3002
3003
    // flush too
3004
0
    return 0;
3005
0
}
3006
3007
3008
/* ----------------------------------------------------------------------
3009
 * CRAM sequence iterators.
3010
 */
3011
3012
/*
3013
 * Converts a cram in-memory record into a bam in-memory record. We
3014
 * pass a pointer to a bam_seq_t pointer along with the a pointer to
3015
 * the allocated size. These can initially be pointers to NULL and zero.
3016
 *
3017
 * This function will reallocate the bam buffer as required and update
3018
 * (*bam)->alloc accordingly, allowing it to be used within a loop
3019
 * efficiently without needing to allocate new bam objects over and
3020
 * over again.
3021
 *
3022
 * Returns the used size of the bam record on success
3023
 *         -1 on failure.
3024
 */
3025
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
3026
0
                cram_record *cr, int rec, bam_seq_t **bam) {
3027
0
    int ret, rg_len;
3028
0
    char name_a[1024], *name;
3029
0
    int name_len;
3030
0
    char *aux;
3031
0
    char *seq, *qual;
3032
0
    sam_hrecs_t *bfd = sh->hrecs;
3033
3034
    /* Assign names if not explicitly set */
3035
0
    if (fd->required_fields & SAM_QNAME) {
3036
0
        if (cr->name_len) {
3037
0
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3038
0
            name_len = cr->name_len;
3039
0
        } else {
3040
0
            name = name_a;
3041
0
            if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
3042
0
                s->crecs[cr->mate_line].name_len > 0) {
3043
                // Copy our mate if non-zero.
3044
0
                memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
3045
0
                       s->crecs[cr->mate_line].name_len);
3046
0
                name = name_a + s->crecs[cr->mate_line].name_len;
3047
0
            } else {
3048
                // Otherwise generate a name based on prefix
3049
0
                name_len = strlen(fd->prefix);
3050
0
                memcpy(name, fd->prefix, name_len);
3051
0
                name += name_len;
3052
0
                *name++ = ':';
3053
0
                if (cr->mate_line >= 0 && cr->mate_line < rec) {
3054
0
                    name = (char *)append_uint64((unsigned char *)name,
3055
0
                                                 s->hdr->record_counter +
3056
0
                                                 cr->mate_line + 1);
3057
0
                } else {
3058
0
                    name = (char *)append_uint64((unsigned char *)name,
3059
0
                                                 s->hdr->record_counter +
3060
0
                                                 rec + 1);
3061
0
                }
3062
0
            }
3063
0
            name_len = name - name_a;
3064
0
            name = name_a;
3065
0
        }
3066
0
    } else {
3067
0
        name = "?";
3068
0
        name_len = 1;
3069
0
    }
3070
3071
    /* Generate BAM record */
3072
0
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
3073
0
        return -1;
3074
0
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3075
3076
0
    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3077
0
        if (!BLOCK_DATA(s->seqs_blk))
3078
0
            return -1;
3079
0
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3080
0
    } else {
3081
0
        seq = "*";
3082
0
        cr->len = 0;
3083
0
    }
3084
3085
0
    if (fd->required_fields & SAM_QUAL) {
3086
0
        if (!BLOCK_DATA(s->qual_blk))
3087
0
            return -1;
3088
0
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3089
0
    } else {
3090
0
        qual = NULL;
3091
0
    }
3092
3093
0
    ret = bam_set1(*bam,
3094
0
                   name_len, name,
3095
0
                   cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
3096
0
                   cr->ncigar, &s->cigar[cr->cigar],
3097
0
                   cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
3098
0
                   cr->len, seq, qual,
3099
0
                   cr->aux_size + rg_len);
3100
0
    if (ret < 0) {
3101
0
        return ret;
3102
0
    }
3103
3104
0
    aux = (char *)bam_aux(*bam);
3105
3106
    /* Auxiliary strings */
3107
0
    if (cr->aux_size != 0) {
3108
0
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3109
0
        aux += cr->aux_size;
3110
0
        (*bam)->l_data += cr->aux_size;
3111
0
    }
3112
3113
    /* RG:Z: */
3114
0
    if (rg_len > 0) {
3115
0
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3116
0
        int len = bfd->rg[cr->rg].name_len;
3117
0
        memcpy(aux, bfd->rg[cr->rg].name, len);
3118
0
        aux += len;
3119
0
        *aux++ = 0;
3120
0
        (*bam)->l_data += rg_len;
3121
0
    }
3122
3123
0
    return (*bam)->l_data;
3124
0
}
3125
3126
/*
3127
 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3128
 */
3129
3130
/*
3131
 * Load first container.
3132
 * Called when fd->ctr is NULL>
3133
 *
3134
 * Returns container on success
3135
 *        NULL on failure.
3136
 */
3137
3.88k
static cram_container *cram_first_slice(cram_fd *fd) {
3138
3.88k
    cram_container *c;
3139
3140
8.98k
    do {
3141
8.98k
        if (fd->ctr)
3142
5.09k
            cram_free_container(fd->ctr);
3143
3144
8.98k
        if (!(c = fd->ctr = cram_read_container(fd)))
3145
277
            return NULL;
3146
8.70k
        c->curr_slice_mt = c->curr_slice;
3147
8.70k
    } while (c->length == 0);
3148
3149
    /*
3150
     * The first container may be a result of a sub-range query.
3151
     * In which case it may still not be the optimal starting point
3152
     * due to skipped containers/slices in the index.
3153
     */
3154
    // No need for locks here as we're in the main thread.
3155
3.60k
    if (fd->range.refid != -2) {
3156
0
        while (c->ref_seq_id != -2 &&
3157
0
               (c->ref_seq_id < fd->range.refid ||
3158
0
                (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3159
0
                 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3160
0
            if (0 != cram_seek(fd, c->length, SEEK_CUR))
3161
0
                return NULL;
3162
0
            cram_free_container(fd->ctr);
3163
0
            do {
3164
0
                if (!(c = fd->ctr = cram_read_container(fd)))
3165
0
                    return NULL;
3166
0
            } while (c->length == 0);
3167
0
        }
3168
3169
0
        if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3170
0
            fd->eof = 1;
3171
0
            return NULL;
3172
0
        }
3173
0
    }
3174
3175
3.60k
    if (!(c->comp_hdr_block = cram_read_block(fd)))
3176
88
        return NULL;
3177
3.51k
    if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3178
23
        return NULL;
3179
3180
3.49k
    c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3181
3.49k
    if (!c->comp_hdr)
3182
3.25k
        return NULL;
3183
236
    if (!c->comp_hdr->AP_delta &&
3184
0
        sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3185
0
        pthread_mutex_lock(&fd->ref_lock);
3186
0
        fd->unsorted = 1;
3187
0
        pthread_mutex_unlock(&fd->ref_lock);
3188
0
    }
3189
3190
236
    return c;
3191
3.49k
}
3192
3193
4.29k
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3194
4.29k
    cram_container *c_curr;  // container being consumed via cram_get_seq()
3195
4.29k
    cram_slice *s_curr = NULL;
3196
3197
    // Populate the first container if unknown.
3198
4.29k
    if (!(c_curr = fd->ctr)) {
3199
3.88k
        if (!(c_curr = cram_first_slice(fd)))
3200
3.64k
            return NULL;
3201
3.88k
    }
3202
3203
    // Discard previous slice
3204
653
    if ((s_curr = c_curr->slice)) {
3205
417
        c_curr->slice = NULL;
3206
417
        cram_free_slice(s_curr);
3207
417
        s_curr = NULL;
3208
417
    }
3209
3210
    // If we've consumed all slices in this container, also discard
3211
    // the container too.
3212
653
    if (c_curr->curr_slice == c_curr->max_slice) {
3213
60
        if (fd->ctr == c_curr)
3214
60
            fd->ctr = NULL;
3215
60
        if (fd->ctr_mt == c_curr)
3216
30
            fd->ctr_mt = NULL;
3217
60
        cram_free_container(c_curr);
3218
60
        c_curr = NULL;
3219
60
    }
3220
3221
653
    if (!fd->ctr_mt)
3222
266
        fd->ctr_mt = c_curr;
3223
3224
    // Fetch the next slice (and the container if necessary).
3225
    //
3226
    // If single threaded this loop bails out as soon as it finds
3227
    // a slice in range.  In this case c_next and c_curr end up being
3228
    // the same thing.
3229
    //
3230
    // If multi-threaded, we loop until we have filled out
3231
    // thread pool input queue.  Here c_next and c_curr *may* differ, as
3232
    // can fd->ctr and fd->ctr_mt.
3233
653
    for (;;) {
3234
653
        cram_container *c_next = fd->ctr_mt;
3235
653
        cram_slice *s_next = NULL;
3236
3237
        // Next slice; either from the last job we failed to push
3238
        // to the input queue or via more I/O.
3239
653
        if (fd->job_pending) {
3240
0
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3241
0
            c_next = j->c;
3242
0
            s_next = j->s;
3243
0
            free(fd->job_pending);
3244
0
            fd->job_pending = NULL;
3245
653
        } else if (!fd->ooc) {
3246
764
        empty_container:
3247
764
            if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3248
                // new container
3249
4.52k
                for(;;) {
3250
4.52k
                    if (!(c_next = cram_read_container(fd))) {
3251
24
                        if (fd->pool) {
3252
0
                            fd->ooc = 1;
3253
0
                            break;
3254
0
                        }
3255
3256
24
                        return NULL;
3257
24
                    }
3258
4.50k
                    c_next->curr_slice_mt = c_next->curr_slice;
3259
3260
4.50k
                    if (c_next->length != 0)
3261
147
                        break;
3262
3263
4.35k
                    cram_free_container(c_next);
3264
4.35k
                }
3265
147
                if (fd->ooc)
3266
0
                    break;
3267
3268
//                printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd,
3269
//                       c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1,
3270
//                       fd->range.refid, fd->range.start, fd->range.end);
3271
3272
                /* Skip containers not yet spanning our range */
3273
147
                if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3274
                    // ref_id beyond end of range; bail out
3275
0
                    if (c_next->ref_seq_id != fd->range.refid) {
3276
0
                        cram_free_container(c_next);
3277
0
                        fd->ctr_mt = NULL;
3278
0
                        fd->ooc = 1;
3279
0
                        break;
3280
0
                    }
3281
3282
                    // position beyond end of range; bail out
3283
0
                    if (fd->range.refid != -1 &&
3284
0
                        c_next->ref_seq_start > fd->range.end) {
3285
0
                        cram_free_container(c_next);
3286
0
                        fd->ctr_mt = NULL;
3287
0
                        fd->ooc = 1;
3288
0
                        break;
3289
0
                    }
3290
3291
                    // Before start of range; skip to next container.
3292
                    // Can't use cram_seek() here as it drops in-progress
3293
                    // multi-threaded decode jobs, so call hseek() directly.
3294
0
                    if (fd->range.refid != -1 &&
3295
0
                        c_next->ref_seq_start + c_next->ref_seq_span-1 <
3296
0
                        fd->range.start) {
3297
0
                        off_t skip_length = c_next->length;
3298
0
                        cram_free_container(c_next);
3299
0
                        c_next = NULL;
3300
0
                        fd->ooc = 0;
3301
0
                        if (hseek(fd->fp, skip_length, SEEK_CUR) < 0)
3302
0
                            return NULL;
3303
0
                        continue;
3304
0
                    }
3305
0
                }
3306
3307
                // Container is valid range, so remember it for restarting
3308
                // this function.
3309
147
                fd->ctr_mt = c_next;
3310
3311
147
                if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3312
9
                    return NULL;
3313
138
                if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3314
15
                    return NULL;
3315
3316
123
                c_next->comp_hdr =
3317
123
                    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3318
123
                if (!c_next->comp_hdr)
3319
15
                    return NULL;
3320
3321
108
                if (!c_next->comp_hdr->AP_delta &&
3322
0
                    sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3323
0
                    pthread_mutex_lock(&fd->ref_lock);
3324
0
                    fd->unsorted = 1;
3325
0
                    pthread_mutex_unlock(&fd->ref_lock);
3326
0
                }
3327
108
            }
3328
3329
701
            if (c_next->num_records == 0) {
3330
111
                if (fd->ctr == c_next)
3331
24
                    fd->ctr = NULL;
3332
111
                if (c_curr == c_next)
3333
24
                    c_curr = NULL;
3334
111
                if (fd->ctr_mt == c_next)
3335
111
                    fd->ctr_mt = NULL;
3336
111
                cram_free_container(c_next);
3337
111
                c_next = NULL;
3338
111
                goto empty_container;
3339
111
            }
3340
3341
590
            if (!(s_next = c_next->slice = cram_read_slice(fd)))
3342
113
                return NULL;
3343
3344
477
            s_next->slice_num = ++c_next->curr_slice_mt;
3345
477
            s_next->curr_rec = 0;
3346
477
            s_next->max_rec = s_next->hdr->num_records;
3347
3348
477
            s_next->last_apos = s_next->hdr->ref_seq_start;
3349
3350
            // We know the container overlaps our range, but with multi-slice
3351
            // containers we may have slices that do not.  Skip these also.
3352
477
            if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3353
                // ref_id beyond end of range; bail out
3354
0
                if (s_next->hdr->ref_seq_id != fd->range.refid) {
3355
0
                    fd->ooc = 1;
3356
0
                    cram_free_slice(s_next);
3357
0
                    c_next->slice = s_next = NULL;
3358
0
                    break;
3359
0
                }
3360
3361
                // position beyond end of range; bail out
3362
0
                if (fd->range.refid != -1 &&
3363
0
                    s_next->hdr->ref_seq_start > fd->range.end) {
3364
0
                    fd->ooc = 1;
3365
0
                    cram_free_slice(s_next);
3366
0
                    c_next->slice = s_next = NULL;
3367
0
                    break;
3368
0
                }
3369
3370
                // before start of range; skip to next slice
3371
0
                if (fd->range.refid != -1 &&
3372
0
                    s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3373
0
                    fd->range.start) {
3374
0
                    cram_free_slice(s_next);
3375
0
                    c_next->slice = s_next = NULL;
3376
0
                    continue;
3377
0
                }
3378
0
            }
3379
477
        } // end: if (!fd->ooc)
3380
3381
477
        if (!c_next || !s_next)
3382
0
            break;
3383
3384
        // Decode the slice, either right now (non-threaded) or by pushing
3385
        // it to the a decode queue (threaded).
3386
477
        if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3387
60
            hts_log_error("Failure to decode slice");
3388
60
            cram_free_slice(s_next);
3389
60
            c_next->slice = NULL;
3390
60
            return NULL;
3391
60
        }
3392
3393
        // No thread pool, so don't loop again
3394
417
        if (!fd->pool) {
3395
417
            c_curr = c_next;
3396
417
            s_curr = s_next;
3397
417
            break;
3398
417
        }
3399
3400
        // With thread pool, but we have a job pending so our decode queue
3401
        // is full.
3402
0
        if (fd->job_pending)
3403
0
            break;
3404
3405
        // Otherwise we're threaded with room in the decode input queue, so
3406
        // keep reading slices for decode.
3407
        // Push it a bit far, to qsize in queue rather than pending arrival,
3408
        // as cram tends to be a bit bursty in decode timings.
3409
0
        if (hts_tpool_process_len(fd->rqueue) >
3410
0
            hts_tpool_process_qsize(fd->rqueue))
3411
0
            break;
3412
0
    } // end of for(;;)
3413
3414
3415
    // When not threaded we've already have c_curr and s_curr.
3416
    // Otherwise we need get them by pulling off the decode output queue.
3417
417
    if (fd->pool) {
3418
0
        hts_tpool_result *res;
3419
0
        cram_decode_job *j;
3420
3421
0
        if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
3422
0
            fd->eof = 1;
3423
0
            return NULL;
3424
0
        }
3425
3426
0
        res = hts_tpool_next_result_wait(fd->rqueue);
3427
3428
0
        if (!res || !hts_tpool_result_data(res)) {
3429
0
            hts_log_error("Call to hts_tpool_next_result failed");
3430
0
            return NULL;
3431
0
        }
3432
3433
0
        j = (cram_decode_job *)hts_tpool_result_data(res);
3434
0
        c_curr = j->c;
3435
0
        s_curr = j->s;
3436
3437
0
        if (j->exit_code != 0) {
3438
0
            hts_log_error("Slice decode failure");
3439
0
            fd->eof = 0;
3440
0
            hts_tpool_delete_result(res, 1);
3441
0
            return NULL;
3442
0
        }
3443
3444
0
        hts_tpool_delete_result(res, 1);
3445
0
    }
3446
3447
417
    *cp = c_curr;
3448
3449
    // Update current slice being processed (as opposed to current
3450
    // slice in the multi-threaded reahead.
3451
417
    fd->ctr = c_curr;
3452
417
    if (c_curr) {
3453
417
        c_curr->slice = s_curr;
3454
417
        if (s_curr)
3455
417
            c_curr->curr_slice = s_curr->slice_num;
3456
417
    }
3457
417
    if (s_curr)
3458
417
        s_curr->curr_rec = 0;
3459
0
    else
3460
0
        fd->eof = 1;
3461
3462
417
    return s_curr;
3463
417
}
3464
3465
/*
3466
 * Read the next cram record and return it.
3467
 * Note that to decode cram_record the caller will need to look up some data
3468
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3469
 * the next call to cram_get_seq (which may invalidate it).
3470
 *
3471
 * Returns record pointer on success (do not free)
3472
 *        NULL on failure
3473
 */
3474
3.88k
cram_record *cram_get_seq(cram_fd *fd) {
3475
3.88k
    cram_container *c;
3476
3.88k
    cram_slice *s;
3477
3478
4.29k
    for (;;) {
3479
4.29k
        c = fd->ctr;
3480
4.29k
        if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3481
0
            s = c->slice;
3482
4.29k
        } else {
3483
4.29k
            if (!(s = cram_next_slice(fd, &c)))
3484
3.88k
                return NULL;
3485
417
            continue; /* In case slice contains no records */
3486
4.29k
        }
3487
3488
        // No need to lock here as get_seq is running in the main thread,
3489
        // which is also the same one that does the range modifications.
3490
0
        if (fd->range.refid != -2) {
3491
0
            if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3492
                // Special case when looking for unmapped blocks at end.
3493
                // If these are mixed in with mapped data (c->ref_id == -2)
3494
                // then we need skip until we find the unmapped data, if at all
3495
0
                s->curr_rec++;
3496
0
                continue;
3497
0
            }
3498
0
            if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3499
0
                s->crecs[s->curr_rec].ref_id != -1) {
3500
                // Looking for a mapped read, but not there yet.  Special case
3501
                // as -1 (unmapped) shouldn't be considered < refid.
3502
0
                s->curr_rec++;
3503
0
                continue;
3504
0
            }
3505
3506
0
            if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3507
0
                fd->eof = 1;
3508
0
                cram_free_slice(s);
3509
0
                c->slice = NULL;
3510
0
                return NULL;
3511
0
            }
3512
3513
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3514
0
                fd->eof = 1;
3515
0
                cram_free_slice(s);
3516
0
                c->slice = NULL;
3517
0
                return NULL;
3518
0
            }
3519
3520
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3521
0
                s->curr_rec++;
3522
0
                continue;
3523
0
            }
3524
0
        }
3525
3526
0
        break;
3527
0
    }
3528
3529
0
    fd->ctr = c;
3530
0
    c->slice = s;
3531
0
    return &s->crecs[s->curr_rec++];
3532
3.88k
}
3533
3534
/*
3535
 * Read the next cram record and convert it to a bam_seq_t struct.
3536
 *
3537
 * Returns >= 0 success (number of bytes written to *bam)
3538
 *        -1 on EOF or failure (check fd->err)
3539
 */
3540
3.88k
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3541
3.88k
    cram_record *cr;
3542
3.88k
    cram_container *c;
3543
3.88k
    cram_slice *s;
3544
3545
3.88k
    if (!(cr = cram_get_seq(fd)))
3546
3.88k
        return -1;
3547
3548
0
    c = fd->ctr;
3549
0
    s = c->slice;
3550
3551
0
    return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
3552
3.88k
}
3553
3554
/*
3555
 * Drains and frees the decode read-queue for a multi-threaded reader.
3556
 */
3557
5.22k
void cram_drain_rqueue(cram_fd *fd) {
3558
5.22k
    cram_container *lc = NULL;
3559
3560
5.22k
    if (!fd->pool || !fd->rqueue)
3561
5.22k
        return;
3562
3563
    // drain queue of any in-flight decode jobs
3564
0
    while (!hts_tpool_process_empty(fd->rqueue)) {
3565
0
        hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
3566
0
        if (!r)
3567
0
            break;
3568
0
        cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
3569
0
        if (j->c->slice == j->s)
3570
0
            j->c->slice = NULL;
3571
0
        if (j->c != lc) {
3572
0
            if (lc) {
3573
0
                if (fd->ctr == lc)
3574
0
                    fd->ctr = NULL;
3575
0
                if (fd->ctr_mt == lc)
3576
0
                    fd->ctr_mt = NULL;
3577
0
                cram_free_container(lc);
3578
0
            }
3579
0
            lc = j->c;
3580
0
        }
3581
0
        cram_free_slice(j->s);
3582
0
        hts_tpool_delete_result(r, 1);
3583
0
    }
3584
3585
    // Also tidy up any pending decode job that we didn't submit to the workers
3586
    // due to the input queue being full.
3587
0
    if (fd->job_pending) {
3588
0
        cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3589
0
        if (j->c->slice == j->s)
3590
0
            j->c->slice = NULL;
3591
0
        if (j->c != lc) {
3592
0
            if (lc) {
3593
0
                if (fd->ctr == lc)
3594
0
                    fd->ctr = NULL;
3595
0
                if (fd->ctr_mt == lc)
3596
0
                    fd->ctr_mt = NULL;
3597
0
                cram_free_container(lc);
3598
0
            }
3599
0
            lc = j->c;
3600
0
        }
3601
0
        cram_free_slice(j->s);
3602
0
        free(j);
3603
0
        fd->job_pending = NULL;
3604
0
    }
3605
3606
0
    if (lc) {
3607
0
        if (fd->ctr == lc)
3608
0
            fd->ctr = NULL;
3609
0
        if (fd->ctr_mt == lc)
3610
0
            fd->ctr_mt = NULL;
3611
0
        cram_free_container(lc);
3612
0
    }
3613
0
}