Coverage Report

Created: 2025-08-10 06:30

/src/htslib/cram/cram_decode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * - In-memory decoding of CRAM data structures.
33
 * - Iterator for reading CRAM record by record.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdio.h>
40
#include <errno.h>
41
#include <assert.h>
42
#include <stdlib.h>
43
#include <string.h>
44
#include <zlib.h>
45
#include <sys/types.h>
46
#include <sys/stat.h>
47
#include <math.h>
48
#include <stdint.h>
49
#include <inttypes.h>
50
51
#include "cram.h"
52
#include "os.h"
53
#include "../htslib/hts.h"
54
55
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
56
//#define USE_X
57
58
/* ----------------------------------------------------------------------
59
 * CRAM compression headers
60
 */
61
62
/*
63
 * Decodes the Tag Dictionary record in the preservation map
64
 * Updates the cram compression header.
65
 *
66
 * Returns number of bytes decoded on success
67
 *        -1 on failure
68
 */
69
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
70
666
                   cram_block_compression_hdr *h) {
71
666
    char *op = cp;
72
666
    unsigned char *dat;
73
666
    cram_block *b;
74
666
    int32_t blk_size = 0;
75
666
    int nTL, i, sz, err = 0;
76
77
666
    if (!(b = cram_new_block(0, 0)))
78
0
        return -1;
79
80
666
    if (h->TD_blk || h->TL) {
81
345
        hts_log_warning("More than one TD block found in compression header");
82
345
        cram_free_block(h->TD_blk);
83
345
        free(h->TL);
84
345
        h->TD_blk = NULL;
85
345
        h->TL = NULL;
86
345
    }
87
88
    /* Decode */
89
666
    blk_size = fd->vv.varint_get32(&cp, endp, &err);
90
666
    if (!blk_size) {
91
204
        h->nTL = 0;
92
204
        cram_free_block(b);
93
204
        return cp - op;
94
204
    }
95
96
462
    if (err || blk_size < 0 || endp - cp < blk_size) {
97
27
        cram_free_block(b);
98
27
        return -1;
99
27
    }
100
101
435
    BLOCK_APPEND(b, cp, blk_size);
102
435
    cp += blk_size;
103
435
    sz = cp - op;
104
    // Force nul termination if missing
105
435
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
106
282
        BLOCK_APPEND_CHAR(b, '\0');
107
108
    /* Set up TL lookup table */
109
435
    dat = BLOCK_DATA(b);
110
111
    // Count
112
6.49k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
113
6.06k
        nTL++;
114
34.6k
        while (dat[i])
115
28.6k
            i++;
116
6.06k
    }
117
118
    // Copy
119
435
    if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
120
0
        cram_free_block(b);
121
0
        return -1;
122
0
    }
123
6.49k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
124
6.06k
        h->TL[nTL++] = &dat[i];
125
34.6k
        while (dat[i])
126
28.6k
            i++;
127
6.06k
    }
128
435
    h->TD_blk = b;
129
435
    h->nTL = nTL;
130
131
435
    return sz;
132
133
0
 block_err:
134
0
    cram_free_block(b);
135
0
    return -1;
136
435
}
137
138
/*
139
 * Decodes a CRAM block compression header.
140
 * Returns header ptr on success
141
 *         NULL on failure
142
 */
143
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
144
6.79k
                                                           cram_block *b) {
145
6.79k
    char *cp, *endp, *cp_copy;
146
6.79k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
147
6.79k
    int i, err = 0;
148
6.79k
    int32_t map_size = 0, map_count = 0;
149
150
6.79k
    if (!hdr)
151
0
        return NULL;
152
153
6.79k
    if (b->method != RAW) {
154
4.65k
        if (cram_uncompress_block(b)) {
155
4.04k
            free(hdr);
156
4.04k
            return NULL;
157
4.04k
        }
158
4.65k
    }
159
160
2.74k
    cp = (char *)b->data;
161
2.74k
    endp = cp + b->uncomp_size;
162
163
2.74k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
164
2.52k
        hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
165
2.52k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
166
0
            hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
167
0
            hdr->ref_seq_span  = fd->vv.varint_get64(&cp, endp, &err);
168
2.52k
        } else {
169
2.52k
            hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
170
2.52k
            hdr->ref_seq_span  = fd->vv.varint_get32(&cp, endp, &err);
171
2.52k
        }
172
2.52k
        hdr->num_records   = fd->vv.varint_get32(&cp, endp, &err);
173
2.52k
        hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
174
2.52k
        if (hdr->num_landmarks < 0 ||
175
2.52k
            hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
176
2.52k
            endp - cp < hdr->num_landmarks) {
177
115
            free(hdr);
178
115
            return NULL;
179
115
        }
180
2.41k
        if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
181
0
            free(hdr);
182
0
            return NULL;
183
0
        }
184
67.7k
        for (i = 0; i < hdr->num_landmarks; i++)
185
65.2k
            hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
186
2.41k
    }
187
188
2.62k
    hdr->preservation_map = kh_init(map);
189
190
2.62k
    memset(hdr->rec_encoding_map, 0,
191
2.62k
           CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
192
2.62k
    memset(hdr->tag_encoding_map, 0,
193
2.62k
           CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
194
195
2.62k
    if (!hdr->preservation_map) {
196
0
        cram_free_compression_header(hdr);
197
0
        return NULL;
198
0
    }
199
200
    /* Initialise defaults for preservation map */
201
2.62k
    hdr->read_names_included = 0;
202
2.62k
    hdr->AP_delta = 1;
203
2.62k
    hdr->qs_seq_orient = 1;
204
2.62k
    memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
205
206
    /* Preservation map */
207
2.62k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
208
2.62k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
209
80.4k
    for (i = 0; i < map_count; i++) {
210
78.1k
        pmap_t hd;
211
78.1k
        khint_t k;
212
78.1k
        int r;
213
214
78.1k
        if (endp - cp < 3) {
215
278
            cram_free_compression_header(hdr);
216
278
            return NULL;
217
278
        }
218
77.8k
        cp += 2;
219
77.8k
        switch(CRAM_KEY(cp[-2],cp[-1])) {
220
21
        case CRAM_KEY('M','I'): // was mapped QS included in V1.0
221
126
        case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
222
171
        case CRAM_KEY('P','I'): // was unmapped placed in V1.0
223
171
            hd.i = *cp++;
224
171
            break;
225
226
84
        case CRAM_KEY('R','N'):
227
84
            hd.i = *cp++;
228
84
            k = kh_put(map, hdr->preservation_map, "RN", &r);
229
84
            if (-1 == r) {
230
0
                cram_free_compression_header(hdr);
231
0
                return NULL;
232
0
            }
233
234
84
            kh_val(hdr->preservation_map, k) = hd;
235
84
            hdr->read_names_included = hd.i;
236
84
            break;
237
238
180
        case CRAM_KEY('A','P'):
239
180
            hd.i = *cp++;
240
180
            k = kh_put(map, hdr->preservation_map, "AP", &r);
241
180
            if (-1 == r) {
242
0
                cram_free_compression_header(hdr);
243
0
                return NULL;
244
0
            }
245
246
180
            kh_val(hdr->preservation_map, k) = hd;
247
180
            hdr->AP_delta = hd.i;
248
180
            break;
249
250
3.87k
        case CRAM_KEY('R','R'):
251
3.87k
            hd.i = *cp++;
252
3.87k
            k = kh_put(map, hdr->preservation_map, "RR", &r);
253
3.87k
            if (-1 == r) {
254
0
                cram_free_compression_header(hdr);
255
0
                return NULL;
256
0
            }
257
258
3.87k
            kh_val(hdr->preservation_map, k) = hd;
259
3.87k
            hdr->no_ref = !hd.i;
260
3.87k
            break;
261
262
162
        case CRAM_KEY('Q','O'):
263
162
            hd.i = *cp++;
264
162
            k = kh_put(map, hdr->preservation_map, "QO", &r);
265
162
            if (-1 == r) {
266
0
                cram_free_compression_header(hdr);
267
0
                return NULL;
268
0
            }
269
270
162
            kh_val(hdr->preservation_map, k) = hd;
271
162
            hdr->qs_seq_orient = hd.i;
272
162
            break;
273
274
837
        case CRAM_KEY('S','M'):
275
837
            if (endp - cp < 5) {
276
3
                cram_free_compression_header(hdr);
277
3
                return NULL;
278
3
            }
279
834
            hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
280
834
            hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
281
834
            hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
282
834
            hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
283
284
834
            hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
285
834
            hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
286
834
            hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
287
834
            hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
288
289
834
            hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
290
834
            hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
291
834
            hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
292
834
            hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
293
294
834
            hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
295
834
            hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
296
834
            hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
297
834
            hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
298
299
834
            hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
300
834
            hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
301
834
            hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
302
834
            hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
303
304
834
            hd.p = cp;
305
834
            cp += 5;
306
307
834
            k = kh_put(map, hdr->preservation_map, "SM", &r);
308
834
            if (-1 == r) {
309
0
                cram_free_compression_header(hdr);
310
0
                return NULL;
311
0
            }
312
834
            kh_val(hdr->preservation_map, k) = hd;
313
834
            break;
314
315
666
        case CRAM_KEY('T','D'): {
316
666
            int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
317
666
            if (sz < 0) {
318
27
                cram_free_compression_header(hdr);
319
27
                return NULL;
320
27
            }
321
322
639
            hd.p = cp;
323
639
            cp += sz;
324
325
639
            k = kh_put(map, hdr->preservation_map, "TD", &r);
326
639
            if (-1 == r) {
327
0
                cram_free_compression_header(hdr);
328
0
                return NULL;
329
0
            }
330
639
            kh_val(hdr->preservation_map, k) = hd;
331
639
            break;
332
639
        }
333
334
71.8k
        default:
335
71.8k
            hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
336
            // guess byte;
337
71.8k
            cp++;
338
71.8k
            break;
339
77.8k
        }
340
77.8k
    }
341
2.31k
    if (cp - cp_copy != map_size) {
342
120
        cram_free_compression_header(hdr);
343
120
        return NULL;
344
120
    }
345
346
    /* Record encoding map */
347
2.19k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
348
2.19k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
349
2.19k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
350
60.2k
    for (i = 0; i < map_count; i++) {
351
59.3k
        char *key = cp;
352
59.3k
        int32_t encoding = E_NULL;
353
59.3k
        int32_t size = 0;
354
59.3k
        ptrdiff_t offset;
355
59.3k
        cram_map *m;
356
59.3k
        enum cram_DS_ID ds_id;
357
59.3k
        enum cram_external_type type;
358
359
59.3k
        if (endp - cp < 4) {
360
122
            cram_free_compression_header(hdr);
361
122
            return NULL;
362
122
        }
363
364
59.1k
        cp += 2;
365
59.1k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
366
59.1k
        size     = fd->vv.varint_get32(&cp, endp, &err);
367
368
59.1k
        offset = cp - (char *)b->data;
369
370
59.1k
        if (encoding == E_NULL)
371
16.5k
            continue;
372
373
42.6k
        if (size < 0 || endp - cp < size) {
374
458
            cram_free_compression_header(hdr);
375
458
            return NULL;
376
458
        }
377
378
        //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
379
380
        /*
381
         * For CRAM1.0 CF and BF are Byte and not Int.
382
         * Practically speaking it makes no difference unless we have a
383
         * 1.0 format file that stores these in EXTERNAL as only then
384
         * does Byte vs Int matter.
385
         *
386
         * Neither this C code nor Java reference implementations did this,
387
         * so we gloss over it and treat them as int.
388
         */
389
42.2k
        ds_id = DS_CORE;
390
42.2k
        if (key[0] == 'B' && key[1] == 'F') {
391
156
            ds_id = DS_BF; type = E_INT;
392
42.0k
        } else if (key[0] == 'C' && key[1] == 'F') {
393
240
            ds_id = DS_CF; type = E_INT;
394
41.8k
        } else if (key[0] == 'R' && key[1] == 'I') {
395
150
            ds_id = DS_RI; type = E_INT;
396
41.6k
        } else if (key[0] == 'R' && key[1] == 'L') {
397
18
            ds_id = DS_RL; type = E_INT;
398
41.6k
        } else if (key[0] == 'A' && key[1] == 'P') {
399
723
            ds_id = DS_AP;
400
723
            type = is_v4 ? E_SLONG : E_INT;
401
40.9k
        } else if (key[0] == 'R' && key[1] == 'G') {
402
90
            ds_id = DS_RG;
403
90
            type = E_INT;
404
40.8k
        } else if (key[0] == 'M' && key[1] == 'F') {
405
45
            ds_id = DS_MF; type = E_INT;
406
40.8k
        } else if (key[0] == 'N' && key[1] == 'S') {
407
306
            ds_id = DS_NS; type = E_INT;
408
40.4k
        } else if (key[0] == 'N' && key[1] == 'P') {
409
1.46k
            ds_id = DS_NP;
410
1.46k
            type = is_v4 ? E_LONG : E_INT;
411
39.0k
        } else if (key[0] == 'T' && key[1] == 'S') {
412
78
            ds_id = DS_TS;
413
78
            type = is_v4 ? E_SLONG : E_INT;
414
38.9k
        } else if (key[0] == 'N' && key[1] == 'F') {
415
141
            ds_id = DS_NF; type = E_INT;
416
38.8k
        } else if (key[0] == 'T' && key[1] == 'C') {
417
138
            ds_id = DS_TC; type = E_BYTE;
418
38.6k
        } else if (key[0] == 'T' && key[1] == 'N') {
419
3
            ds_id = DS_TN; type = E_INT;
420
38.6k
        } else if (key[0] == 'F' && key[1] == 'N') {
421
3
            ds_id = DS_FN; type = E_INT;
422
38.6k
        } else if (key[0] == 'F' && key[1] == 'C') {
423
321
            ds_id = DS_FC; type = E_BYTE;
424
38.3k
        } else if (key[0] == 'F' && key[1] == 'P') {
425
39
            ds_id = DS_FP; type = E_INT;
426
38.3k
        } else if (key[0] == 'B' && key[1] == 'S') {
427
39
            ds_id = DS_BS; type = E_BYTE;
428
38.2k
        } else if (key[0] == 'I' && key[1] == 'N') {
429
57
            ds_id = DS_IN; type = E_BYTE_ARRAY;
430
38.2k
        } else if (key[0] == 'S' && key[1] == 'C') {
431
3
            ds_id = DS_SC; type = E_BYTE_ARRAY;
432
38.2k
        } else if (key[0] == 'D' && key[1] == 'L') {
433
102
            ds_id = DS_DL; type = E_INT;
434
38.1k
        } else if (key[0] == 'B' && key[1] == 'A') {
435
243
            ds_id = DS_BA; type = E_BYTE;
436
37.8k
        } else if (key[0] == 'B' && key[1] == 'B') {
437
120
            ds_id = DS_BB; type = E_BYTE_ARRAY;
438
37.7k
        } else if (key[0] == 'R' && key[1] == 'S') {
439
320
            ds_id = DS_RS; type = E_INT;
440
37.4k
        } else if (key[0] == 'P' && key[1] == 'D') {
441
3
            ds_id = DS_PD; type = E_INT;
442
37.4k
        } else if (key[0] == 'H' && key[1] == 'C') {
443
9
            ds_id = DS_HC; type = E_INT;
444
37.4k
        } else if (key[0] == 'M' && key[1] == 'Q') {
445
2.79k
            ds_id = DS_MQ; type = E_INT;
446
34.6k
        } else if (key[0] == 'R' && key[1] == 'N') {
447
18
            ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
448
34.6k
        } else if (key[0] == 'Q' && key[1] == 'S') {
449
195
            ds_id = DS_QS; type = E_BYTE;
450
34.4k
        } else if (key[0] == 'Q' && key[1] == 'Q') {
451
6
            ds_id = DS_QQ; type = E_BYTE_ARRAY;
452
34.4k
        } else if (key[0] == 'T' && key[1] == 'L') {
453
36
            ds_id = DS_TL; type = E_INT;
454
34.3k
        } else if (key[0] == 'T' && key[1] == 'M') {
455
34.3k
        } else if (key[0] == 'T' && key[1] == 'V') {
456
34.3k
        } else {
457
34.3k
            hts_log_warning("Unrecognised key: %.2s", key);
458
34.3k
        }
459
460
42.2k
        if (ds_id != DS_CORE) {
461
7.85k
            if (hdr->codecs[ds_id] != NULL) {
462
6.45k
                hts_log_warning("Codec for key %.2s defined more than once",
463
6.45k
                                key);
464
6.45k
                hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
465
6.45k
            }
466
7.85k
            hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
467
7.85k
                                                   type, fd->version, &fd->vv);
468
7.85k
            if (!hdr->codecs[ds_id]) {
469
696
                cram_free_compression_header(hdr);
470
696
                return NULL;
471
696
            }
472
7.85k
        }
473
474
41.5k
        cp += size;
475
476
        // Fill out cram_map purely for cram_dump to dump out.
477
41.5k
        m = malloc(sizeof(*m));
478
41.5k
        if (!m) {
479
0
            cram_free_compression_header(hdr);
480
0
            return NULL;
481
0
        }
482
41.5k
        m->key = CRAM_KEY(key[0], key[1]);
483
41.5k
        m->encoding = encoding;
484
41.5k
        m->size     = size;
485
41.5k
        m->offset   = offset;
486
41.5k
        m->codec = NULL;
487
488
41.5k
        m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
489
41.5k
        hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
490
41.5k
    }
491
923
    if (cp - cp_copy != map_size) {
492
103
        cram_free_compression_header(hdr);
493
103
        return NULL;
494
103
    }
495
496
    /* Tag encoding map */
497
820
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
498
820
    map_count = fd->vv.varint_get32(&cp, endp, &err);
499
2.15k
    for (i = 0; i < map_count; i++) {
500
1.49k
        int32_t encoding = E_NULL;
501
1.49k
        int32_t size = 0;
502
1.49k
        cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
503
1.49k
        uint8_t key[3];
504
505
1.49k
        if (!m || endp - cp < 6) {
506
9
            free(m);
507
9
            cram_free_compression_header(hdr);
508
9
            return NULL;
509
9
        }
510
511
1.48k
        m->key = fd->vv.varint_get32(&cp, endp, &err);
512
1.48k
        key[0] = m->key>>16;
513
1.48k
        key[1] = m->key>>8;
514
1.48k
        key[2] = m->key;
515
1.48k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
516
1.48k
        size     = fd->vv.varint_get32(&cp, endp, &err);
517
518
1.48k
        m->encoding = encoding;
519
1.48k
        m->size     = size;
520
1.48k
        m->offset   = cp - (char *)b->data;
521
1.48k
        if (size < 0 || endp - cp < size ||
522
1.48k
            !(m->codec = cram_decoder_init(hdr, encoding, cp, size,
523
1.46k
                                           E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
524
146
            cram_free_compression_header(hdr);
525
146
            free(m);
526
146
            return NULL;
527
146
        }
528
529
1.33k
        cp += size;
530
531
1.33k
        m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
532
1.33k
        hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
533
1.33k
    }
534
665
    if (err || cp - cp_copy != map_size) {
535
168
        cram_free_compression_header(hdr);
536
168
        return NULL;
537
168
    }
538
539
497
    return hdr;
540
665
}
541
542
/*
543
 * Note we also need to scan through the record encoding map to
544
 * see which data series share the same block, either external or
545
 * CORE. For example if we need the BF data series but MQ and CF
546
 * are also encoded in the same block then we need to add those in
547
 * as a dependency in order to correctly decode BF.
548
 *
549
 * Returns 0 on success
550
 *        -1 on failure
551
 */
552
int cram_dependent_data_series(cram_fd *fd,
553
                               cram_block_compression_hdr *hdr,
554
660
                               cram_slice *s) {
555
660
    int *block_used;
556
660
    int core_used = 0;
557
660
    int i;
558
660
    static int i_to_id[] = {
559
660
        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
560
660
        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
561
660
        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
562
660
        DS_HC, DS_SC, DS_BB, DS_QQ,
563
660
    };
564
660
    uint32_t orig_ds;
565
566
    /*
567
     * Set the data_series bit field based on fd->required_fields
568
     * contents.
569
     */
570
660
    if (fd->required_fields && fd->required_fields != INT_MAX) {
571
0
        s->data_series = 0;
572
573
0
        if (fd->required_fields & SAM_QNAME)
574
0
            s->data_series |= CRAM_RN;
575
576
0
        if (fd->required_fields & SAM_FLAG)
577
0
            s->data_series |= CRAM_BF;
578
579
0
        if (fd->required_fields & SAM_RNAME)
580
0
            s->data_series |= CRAM_RI | CRAM_BF;
581
582
0
        if (fd->required_fields & SAM_POS)
583
0
            s->data_series |= CRAM_AP | CRAM_BF;
584
585
0
        if (fd->required_fields & SAM_MAPQ)
586
0
            s->data_series |= CRAM_MQ;
587
588
0
        if (fd->required_fields & SAM_CIGAR)
589
0
            s->data_series |= CRAM_CIGAR;
590
591
0
        if (fd->required_fields & SAM_RNEXT)
592
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
593
594
0
        if (fd->required_fields & SAM_PNEXT)
595
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
596
597
0
        if (fd->required_fields & SAM_TLEN)
598
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
599
0
                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
600
601
0
        if (fd->required_fields & SAM_SEQ)
602
0
            s->data_series |= CRAM_SEQ;
603
604
0
        if (!(fd->required_fields & SAM_AUX))
605
            // No easy way to get MD/NM without other tags at present
606
0
            s->decode_md = 0;
607
608
0
        if (fd->required_fields & SAM_QUAL)
609
0
            s->data_series |= CRAM_QUAL;
610
611
0
        if (fd->required_fields & SAM_AUX)
612
0
            s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
613
614
0
        if (fd->required_fields & SAM_RGAUX)
615
0
            s->data_series |= CRAM_RG | CRAM_BF;
616
617
        // Always uncompress CORE block
618
0
        if (cram_uncompress_block(s->block[0]))
619
0
            return -1;
620
660
    } else {
621
660
        s->data_series = CRAM_ALL;
622
623
3.38k
        for (i = 0; i < s->hdr->num_blocks; i++) {
624
2.73k
            if (cram_uncompress_block(s->block[i]))
625
7
                return -1;
626
2.73k
        }
627
628
653
        return 0;
629
660
    }
630
631
0
    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
632
0
    if (!block_used)
633
0
        return -1;
634
635
0
    do {
636
        /*
637
         * Also set data_series based on code prerequisites. Eg if we need
638
         * CRAM_QS then we also need to know CRAM_RL so we know how long it
639
         * is, or if we need FC/FP then we also need FN (number of features).
640
         *
641
         * It's not reciprocal though. We may be needing to decode FN
642
         * but have no need to decode FC, FP and cigar ops.
643
         */
644
0
        if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
645
0
        if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
646
0
        if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
647
0
        if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
648
0
        if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
649
0
        if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
650
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
651
0
        if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
652
0
        if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
653
0
        if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
654
0
        if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
655
656
        // cram_decode_seq() needs seq[] array
657
0
        if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
658
659
0
        if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
660
0
        if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
661
0
        if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
662
0
        if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
663
0
        if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
664
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
665
0
        if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
666
0
            s->data_series |= CRAM_CF;
667
0
        if (!hdr->read_names_included && s->data_series & CRAM_RN)
668
0
            s->data_series |= CRAM_CF | CRAM_NF;
669
0
        if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
670
0
            s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
671
0
        if (s->data_series & CRAM_FN) {
672
            // The CRAM_FN loop checks for reference length boundaries,
673
            // which needs a working seq_pos.  Some fields are fixed size
674
            // irrespective of if we decode (BS), but others need to know
675
            // the size of the string fetched back (SC, IN, BB).
676
0
            s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
677
0
        }
678
679
0
        orig_ds = s->data_series;
680
681
        // Find which blocks are in use.
682
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
683
0
            int bnum1, bnum2, j;
684
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
685
686
0
            if (!(s->data_series & (1<<i)))
687
0
                continue;
688
689
0
            if (!c)
690
0
                continue;
691
692
0
            bnum1 = cram_codec_to_id(c, &bnum2);
693
694
0
            for (;;) {
695
0
                switch (bnum1) {
696
0
                case -2:
697
0
                    break;
698
699
0
                case -1:
700
0
                    core_used = 1;
701
0
                    break;
702
703
0
                default:
704
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
705
0
                        if (s->block[j]->content_type == EXTERNAL &&
706
0
                            s->block[j]->content_id == bnum1) {
707
0
                            block_used[j] = 1;
708
0
                            if (cram_uncompress_block(s->block[j])) {
709
0
                                free(block_used);
710
0
                                return -1;
711
0
                            }
712
0
                        }
713
0
                    }
714
0
                    break;
715
0
                }
716
717
0
                if (bnum2 == -2 || bnum1 == bnum2)
718
0
                    break;
719
720
0
                bnum1 = bnum2; // 2nd pass
721
0
            }
722
0
        }
723
724
        // Tags too
725
0
        if ((fd->required_fields & SAM_AUX) ||
726
0
            (s->data_series & CRAM_aux)) {
727
0
            for (i = 0; i < CRAM_MAP_HASH; i++) {
728
0
                int bnum1, bnum2, j;
729
0
                cram_map *m = hdr->tag_encoding_map[i];
730
731
0
                while (m) {
732
0
                    cram_codec *c = m->codec;
733
0
                    if (!c) {
734
0
                        m = m->next;
735
0
                        continue;
736
0
                    }
737
738
0
                    bnum1 = cram_codec_to_id(c, &bnum2);
739
740
0
                    for (;;) {
741
0
                        switch (bnum1) {
742
0
                        case -2:
743
0
                            break;
744
745
0
                        case -1:
746
0
                            core_used = 1;
747
0
                            break;
748
749
0
                        default:
750
0
                            for (j = 0; j < s->hdr->num_blocks; j++) {
751
0
                                if (s->block[j]->content_type == EXTERNAL &&
752
0
                                    s->block[j]->content_id == bnum1) {
753
0
                                    block_used[j] = 1;
754
0
                                    if (cram_uncompress_block(s->block[j])) {
755
0
                                        free(block_used);
756
0
                                        return -1;
757
0
                                    }
758
0
                                }
759
0
                            }
760
0
                            break;
761
0
                        }
762
763
0
                        if (bnum2 == -2 || bnum1 == bnum2)
764
0
                            break;
765
766
0
                        bnum1 = bnum2; // 2nd pass
767
0
                    }
768
769
0
                    m = m->next;
770
0
                }
771
0
            }
772
0
        }
773
774
        // We now know which blocks are in used, so repeat and find
775
        // which other data series need to be added.
776
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
777
0
            int bnum1, bnum2, j;
778
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
779
780
0
            if (!c)
781
0
                continue;
782
783
0
            bnum1 = cram_codec_to_id(c, &bnum2);
784
785
0
            for (;;) {
786
0
                switch (bnum1) {
787
0
                case -2:
788
0
                    break;
789
790
0
                case -1:
791
0
                    if (core_used) {
792
                        //printf(" + data series %08x:\n", 1<<i);
793
0
                        s->data_series |= 1<<i;
794
0
                    }
795
0
                    break;
796
797
0
                default:
798
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
799
0
                        if (s->block[j]->content_type == EXTERNAL &&
800
0
                            s->block[j]->content_id == bnum1) {
801
0
                            if (block_used[j]) {
802
                                //printf(" + data series %08x:\n", 1<<i);
803
0
                                s->data_series |= 1<<i;
804
0
                            }
805
0
                        }
806
0
                    }
807
0
                    break;
808
0
                }
809
810
0
                if (bnum2 == -2 || bnum1 == bnum2)
811
0
                    break;
812
813
0
                bnum1 = bnum2; // 2nd pass
814
0
            }
815
0
        }
816
817
        // Tags too
818
0
        for (i = 0; i < CRAM_MAP_HASH; i++) {
819
0
            int bnum1, bnum2, j;
820
0
            cram_map *m = hdr->tag_encoding_map[i];
821
822
0
            while (m) {
823
0
                cram_codec *c = m->codec;
824
0
                if (!c) {
825
0
                    m = m->next;
826
0
                    continue;
827
0
                }
828
829
0
                bnum1 = cram_codec_to_id(c, &bnum2);
830
831
0
                for (;;) {
832
0
                    switch (bnum1) {
833
0
                    case -2:
834
0
                        break;
835
836
0
                    case -1:
837
                        //printf(" + data series %08x:\n", CRAM_aux);
838
0
                        s->data_series |= CRAM_aux;
839
0
                        break;
840
841
0
                    default:
842
0
                        for (j = 0; j < s->hdr->num_blocks; j++) {
843
0
                            if (s->block[j]->content_type == EXTERNAL &&
844
0
                                s->block[j]->content_id == bnum1) {
845
0
                                if (block_used[j]) {
846
                                    //printf(" + data series %08x:\n",
847
                                    //       CRAM_aux);
848
0
                                    s->data_series |= CRAM_aux;
849
0
                                }
850
0
                            }
851
0
                        }
852
0
                        break;
853
0
                    }
854
855
0
                    if (bnum2 == -2 || bnum1 == bnum2)
856
0
                        break;
857
858
0
                    bnum1 = bnum2; // 2nd pass
859
0
                }
860
861
0
                m = m->next;
862
0
            }
863
0
        }
864
0
    } while (orig_ds != s->data_series);
865
866
0
    free(block_used);
867
0
    return 0;
868
0
}
869
870
/*
871
 * Checks whether an external block is used solely by a single data series.
872
 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
873
 *         or 0 if not (E_NULL).
874
 */
875
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
876
0
                          int id) {
877
0
    int i, n_id = 0;
878
0
    enum cram_encoding e_type = 0;
879
880
0
    for (i = 0; i < DS_END; i++) {
881
0
        cram_codec *c;
882
0
        int bnum1, bnum2, old_n_id;
883
884
0
        if (!(c = hdr->codecs[i]))
885
0
            continue;
886
887
0
        bnum1 = cram_codec_to_id(c, &bnum2);
888
889
0
        old_n_id = n_id;
890
0
        if (bnum1 == id) {
891
0
            n_id++;
892
0
            e_type = c->codec;
893
0
        }
894
0
        if (bnum2 == id) {
895
0
            n_id++;
896
0
            e_type = c->codec;
897
0
        }
898
899
0
        if (n_id == old_n_id+2)
900
0
            n_id--; // len/val in same place counts once only.
901
0
    }
902
903
0
    return n_id == 1 ? e_type : 0;
904
0
}
905
906
/*
907
 * Attempts to estimate the size of some blocks so we can preallocate them
908
 * before decoding.  Although decoding will automatically grow the blocks,
909
 * it is typically more efficient to preallocate.
910
 */
911
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
912
                                int *qual_size, int *name_size,
913
0
                                int *q_id) {
914
0
    int bnum1, bnum2;
915
0
    cram_codec *cd;
916
917
0
    *qual_size = 0;
918
0
    *name_size = 0;
919
920
    /* Qual */
921
0
    cd = hdr->codecs[DS_QS];
922
0
    if (cd == NULL) return;
923
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
924
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
925
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
926
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
927
0
        if (b) *qual_size = b->uncomp_size;
928
0
        if (q_id && cd->codec == E_EXTERNAL)
929
0
            *q_id = bnum1;
930
0
    }
931
932
    /* Name */
933
0
    cd = hdr->codecs[DS_RN];
934
0
    if (cd == NULL) return;
935
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
936
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
937
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
938
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
939
0
        if (b) *name_size = b->uncomp_size;
940
0
    }
941
0
}
942
943
944
/* ----------------------------------------------------------------------
945
 * CRAM slices
946
 */
947
948
/*
949
 * Decodes a CRAM (un)mapped slice header block.
950
 * Returns slice header ptr on success
951
 *         NULL on failure
952
 */
953
773
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
954
773
    cram_block_slice_hdr *hdr;
955
773
    unsigned char *cp;
956
773
    unsigned char *cp_end;
957
773
    int i, err = 0;
958
959
773
    if (b->method != RAW) {
960
        /* Spec. says slice header should be RAW, but we can future-proof
961
           by trying to decode it if it isn't. */
962
8
        if (cram_uncompress_block(b) < 0)
963
5
            return NULL;
964
8
    }
965
768
    cp =  (unsigned char *)BLOCK_DATA(b);
966
768
    cp_end = cp + b->uncomp_size;
967
968
768
    if (b->content_type != MAPPED_SLICE &&
969
768
        b->content_type != UNMAPPED_SLICE)
970
0
        return NULL;
971
972
768
    if (!(hdr  = calloc(1, sizeof(*hdr))))
973
0
        return NULL;
974
975
768
    hdr->content_type = b->content_type;
976
977
768
    if (b->content_type == MAPPED_SLICE) {
978
760
        hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
979
760
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
980
0
            hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
981
0
            hdr->ref_seq_span  = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
982
760
        } else {
983
760
            hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
984
760
            hdr->ref_seq_span  = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
985
760
        }
986
760
        if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
987
16
            free(hdr);
988
16
            hts_log_error("Negative values not permitted for header "
989
16
                          "sequence start or span fields");
990
16
            return NULL;
991
16
        }
992
760
    }
993
752
    hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
994
752
    hdr->record_counter = 0;
995
752
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
996
0
        hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
997
752
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
998
0
        hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
999
0
    }
1000
752
    hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1001
752
    hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1002
752
    if (hdr->num_content_ids < 1 ||
1003
752
        hdr->num_content_ids >= 10000) {
1004
        // Slice must have at least one data block, and there is no need
1005
        // for more than 2 per possible aux-tag plus ancillary.
1006
18
        free(hdr);
1007
18
        return NULL;
1008
18
    }
1009
734
    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1010
734
    if (!hdr->block_content_ids) {
1011
0
        free(hdr);
1012
0
        return NULL;
1013
0
    }
1014
1015
34.5k
    for (i = 0; i < hdr->num_content_ids; i++)
1016
33.8k
        hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
1017
33.8k
                                                         (char *)cp_end,
1018
33.8k
                                                         &err);
1019
734
    if (err) {
1020
8
        free(hdr->block_content_ids);
1021
8
        free(hdr);
1022
8
        return NULL;
1023
8
    }
1024
1025
726
    if (b->content_type == MAPPED_SLICE)
1026
720
        hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
1027
1028
726
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
1029
0
        if (cp_end - cp < 16) {
1030
0
            free(hdr->block_content_ids);
1031
0
            free(hdr);
1032
0
            return NULL;
1033
0
        }
1034
0
        memcpy(hdr->md5, cp, 16);
1035
726
    } else {
1036
726
        memset(hdr->md5, 0, 16);
1037
726
    }
1038
1039
726
    if (!err)
1040
720
        return hdr;
1041
1042
6
    free(hdr->block_content_ids);
1043
6
    free(hdr);
1044
6
    return NULL;
1045
726
}
1046
1047
1048
#if 0
1049
/* Returns the number of bits set in val; it the highest bit used */
1050
static int nbits(int v) {
1051
    static const int MultiplyDeBruijnBitPosition[32] = {
1052
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1053
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1054
    };
1055
1056
    v |= v >> 1; // first up to set all bits 1 after the first 1 */
1057
    v |= v >> 2;
1058
    v |= v >> 4;
1059
    v |= v >> 8;
1060
    v |= v >> 16;
1061
1062
    // DeBruijn magic to find top bit
1063
    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1064
}
1065
#endif
1066
1067
#if 0
1068
static int sort_freqs(const void *vp1, const void *vp2) {
1069
    const int i1 = *(const int *)vp1;
1070
    const int i2 = *(const int *)vp2;
1071
    return i1-i2;
1072
}
1073
#endif
1074
1075
/* ----------------------------------------------------------------------
1076
 * Primary CRAM sequence decoder
1077
 */
1078
1079
0
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1080
0
    if (decode_md) {
1081
0
        BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1082
0
        BLOCK_APPEND_CHAR(s->aux_blk, c);
1083
0
        *md_dist = 0;
1084
0
    }
1085
0
    return 0;
1086
1087
0
 block_err:
1088
0
    return -1;
1089
0
}
1090
1091
/*
1092
 * Internal part of cram_decode_slice().
1093
 * Generates the sequence, quality and cigar components.
1094
 */
1095
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1096
                           cram_block *blk, cram_record *cr, sam_hdr_t *sh,
1097
                           int cf, char *seq, char *qual,
1098
0
                           int has_MD, int has_NM) {
1099
0
    int prev_pos = 0, f, r = 0, out_sz = 1;
1100
0
    int seq_pos = 1;
1101
0
    int cig_len = 0;
1102
0
    int64_t ref_pos = cr->apos;
1103
0
    int32_t fn, i32;
1104
0
    enum cigar_op cig_op = BAM_CMATCH;
1105
0
    uint32_t *cigar = s->cigar;
1106
0
    uint32_t ncigar = s->ncigar;
1107
0
    uint32_t cigar_alloc = s->cigar_alloc;
1108
0
    uint32_t nm = 0;
1109
0
    int32_t md_dist = 0;
1110
0
    int orig_aux = 0;
1111
    // CRAM <  4.0 decode_md is off/on
1112
    // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
1113
0
    int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
1114
0
        ? (s->decode_md > 0)
1115
0
        : (s->decode_md != 0);
1116
0
    int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
1117
0
    int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
1118
0
    uint32_t ds = s->data_series;
1119
0
    sam_hrecs_t *bfd = sh->hrecs;
1120
1121
0
    cram_codec **codecs = c->comp_hdr->codecs;
1122
1123
0
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1124
0
        memset(qual, 255, cr->len);
1125
0
    }
1126
1127
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1128
0
        decode_md = decode_nm = 0;
1129
1130
0
    if (decode_md) {
1131
0
        orig_aux = BLOCK_SIZE(s->aux_blk);
1132
0
        if (has_MD == 0)
1133
0
            BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1134
0
    }
1135
1136
0
    if (ds & CRAM_FN) {
1137
0
        if (!codecs[DS_FN]) return -1;
1138
0
        r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
1139
0
                                   blk, (char *)&fn, &out_sz);
1140
0
        if (r) return r;
1141
0
    } else {
1142
0
        fn = 0;
1143
0
    }
1144
1145
0
    ref_pos--; // count from 0
1146
0
    cr->cigar = ncigar;
1147
1148
0
    if (!(ds & (CRAM_FC | CRAM_FP)))
1149
0
        goto skip_cigar;
1150
1151
0
    if (fn) {
1152
0
        if ((ds & CRAM_FC) && !codecs[DS_FC])
1153
0
            return -1;
1154
0
        if ((ds & CRAM_FP) && !codecs[DS_FP])
1155
0
            return -1;
1156
0
    }
1157
1158
0
    for (f = 0; f < fn; f++) {
1159
0
        int32_t pos = 0;
1160
0
        char op;
1161
1162
0
        if (ncigar+2 >= cigar_alloc) {
1163
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1164
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1165
0
                return -1;
1166
0
            s->cigar = cigar;
1167
0
        }
1168
1169
0
        if (ds & CRAM_FC) {
1170
0
            r |= codecs[DS_FC]->decode(s,
1171
0
                                       codecs[DS_FC],
1172
0
                                       blk,
1173
0
                                       &op,  &out_sz);
1174
0
            if (r) return r;
1175
0
        }
1176
1177
0
        if (!(ds & CRAM_FP))
1178
0
            continue;
1179
1180
0
        r |= codecs[DS_FP]->decode(s,
1181
0
                                   codecs[DS_FP],
1182
0
                                   blk,
1183
0
                                   (char *)&pos, &out_sz);
1184
0
        if (r) return r;
1185
0
        pos += prev_pos;
1186
1187
0
        if (pos <= 0) {
1188
0
            hts_log_error("Feature position %d before start of read", pos);
1189
0
            return -1;
1190
0
        }
1191
1192
0
        if (pos > seq_pos) {
1193
0
            if (pos > cr->len+1)
1194
0
                return -1;
1195
1196
0
            if (s->ref && cr->ref_id >= 0) {
1197
0
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1198
0
                    static int whinged = 0;
1199
0
                    int rlen;
1200
0
                    if (!whinged)
1201
0
                        hts_log_warning("Ref pos outside of ref sequence boundary");
1202
0
                    whinged = 1;
1203
0
                    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1204
                    // May miss MD/NM cases where both seq/ref are N, but this is a
1205
                    // malformed cram file anyway.
1206
0
                    if (rlen > 0) {
1207
0
                        if (ref_pos + rlen > s->ref_end)
1208
0
                            goto beyond_slice;
1209
1210
0
                        memcpy(&seq[seq_pos-1],
1211
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1212
0
                        if ((pos - seq_pos) - rlen > 0)
1213
0
                            memset(&seq[seq_pos-1+rlen], 'N',
1214
0
                                   (pos - seq_pos) - rlen);
1215
0
                    } else {
1216
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1217
0
                    }
1218
0
                    if (md_dist >= 0)
1219
0
                        md_dist += pos - seq_pos;
1220
0
                } else {
1221
                    // 'N' in both ref and seq is also mismatch for NM/MD
1222
0
                    if (ref_pos + pos-seq_pos > s->ref_end)
1223
0
                        goto beyond_slice;
1224
1225
0
                    const char *refp = s->ref + ref_pos - s->ref_start + 1;
1226
0
                    const int frag_len = pos - seq_pos;
1227
0
                    int do_cpy = 1;
1228
0
                    if (decode_md || decode_nm) {
1229
0
                        char *N = memchr(refp, 'N', frag_len);
1230
0
                        if (N) {
1231
0
                            int i;
1232
0
                            for (i = 0; i < frag_len; i++) {
1233
0
                                char base = refp[i];
1234
0
                                if (base == 'N') {
1235
0
                                    if (add_md_char(s, decode_md,
1236
0
                                                    'N', &md_dist) < 0)
1237
0
                                        return -1;
1238
0
                                    nm++;
1239
0
                                } else {
1240
0
                                    md_dist++;
1241
0
                                }
1242
0
                                seq[seq_pos-1+i] = base;
1243
0
                            }
1244
0
                            do_cpy = 0;
1245
0
                        } else {
1246
0
                            md_dist += frag_len;
1247
0
                        }
1248
0
                    }
1249
0
                    if (do_cpy)
1250
0
                        memcpy(&seq[seq_pos-1], refp, frag_len);
1251
0
                }
1252
0
            }
1253
#ifdef USE_X
1254
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
1255
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1256
                cig_len = 0;
1257
            }
1258
            cig_op = BAM_CBASE_MATCH;
1259
#else
1260
0
            if (cig_len && cig_op != BAM_CMATCH) {
1261
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1262
0
                cig_len = 0;
1263
0
            }
1264
0
            cig_op = BAM_CMATCH;
1265
0
#endif
1266
0
            cig_len += pos - seq_pos;
1267
0
            ref_pos += pos - seq_pos;
1268
0
            seq_pos = pos;
1269
0
        }
1270
1271
0
        prev_pos = pos;
1272
1273
0
        if (!(ds & CRAM_FC))
1274
0
            goto skip_cigar;
1275
1276
0
        switch(op) {
1277
0
        case 'S': { // soft clip: IN
1278
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1279
0
            int have_sc = 0;
1280
1281
0
            if (cig_len) {
1282
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1283
0
                cig_len = 0;
1284
0
            }
1285
0
            switch (CRAM_MAJOR_VERS(fd->version)) {
1286
0
            case 1:
1287
0
                if (ds & CRAM_IN) {
1288
0
                    r |= codecs[DS_IN]
1289
0
                        ? codecs[DS_IN]->decode(s, codecs[DS_IN],
1290
0
                                                blk,
1291
0
                                                cr->len ? &seq[pos-1] : NULL,
1292
0
                                                &out_sz2)
1293
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1294
0
                    have_sc = 1;
1295
0
                }
1296
0
                break;
1297
0
            case 2:
1298
0
            default:
1299
0
                if (ds & CRAM_SC) {
1300
0
                    r |= codecs[DS_SC]
1301
0
                        ? codecs[DS_SC]->decode(s, codecs[DS_SC],
1302
0
                                                blk,
1303
0
                                                cr->len ? &seq[pos-1] : NULL,
1304
0
                                                &out_sz2)
1305
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1306
0
                    have_sc = 1;
1307
0
                }
1308
0
                break;
1309
1310
                //default:
1311
                //    r |= codecs[DS_BB]
1312
                //        ? codecs[DS_BB]->decode(s, codecs[DS_BB],
1313
                //                                blk, &seq[pos-1], &out_sz2)
1314
                //        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1315
0
            }
1316
0
            if (have_sc) {
1317
0
                if (r) return r;
1318
0
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1319
0
                cig_op = BAM_CSOFT_CLIP;
1320
0
                seq_pos += out_sz2;
1321
0
            }
1322
0
            break;
1323
0
        }
1324
1325
0
        case 'X': { // Substitution; BS
1326
0
            unsigned char base;
1327
#ifdef USE_X
1328
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1329
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1330
                cig_len = 0;
1331
            }
1332
            if (ds & CRAM_BS) {
1333
                if (!codecs[DS_BS]) return -1;
1334
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1335
                                           (char *)&base, &out_sz);
1336
                if (pos-1 < cr->len)
1337
                    seq[pos-1] = 'N'; // FIXME look up BS=base value
1338
            }
1339
            cig_op = BAM_CBASE_MISMATCH;
1340
#else
1341
0
            int ref_base;
1342
0
            if (cig_len && cig_op != BAM_CMATCH) {
1343
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1344
0
                cig_len = 0;
1345
0
            }
1346
0
            if (ds & CRAM_BS) {
1347
0
                if (!codecs[DS_BS]) return -1;
1348
0
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1349
0
                                           (char *)&base, &out_sz);
1350
0
                if (r) return -1;
1351
0
                if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1352
0
                    if (pos-1 < cr->len)
1353
0
                        seq[pos-1] = c->comp_hdr->
1354
0
                            substitution_matrix[fd->L1['N']][base];
1355
0
                    if (decode_md || decode_nm) {
1356
0
                        if (md_dist >= 0 && decode_md)
1357
0
                            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1358
0
                        md_dist = -1;
1359
0
                        nm--;
1360
0
                    }
1361
0
                } else {
1362
0
                    unsigned char ref_call = ref_pos < s->ref_end
1363
0
                        ? (uc)s->ref[ref_pos - s->ref_start +1]
1364
0
                        : 'N';
1365
0
                    ref_base = fd->L1[ref_call];
1366
0
                    if (pos-1 < cr->len)
1367
0
                        seq[pos-1] = c->comp_hdr->
1368
0
                            substitution_matrix[ref_base][base];
1369
0
                    if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
1370
0
                        return -1;
1371
0
                }
1372
0
            }
1373
0
            cig_op = BAM_CMATCH;
1374
0
#endif
1375
0
            nm++;
1376
0
            cig_len++;
1377
0
            seq_pos++;
1378
0
            ref_pos++;
1379
0
            break;
1380
0
        }
1381
1382
0
        case 'D': { // Deletion; DL
1383
0
            if (cig_len && cig_op != BAM_CDEL) {
1384
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1385
0
                cig_len = 0;
1386
0
            }
1387
0
            if (ds & CRAM_DL) {
1388
0
                if (!codecs[DS_DL]) return -1;
1389
0
                r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
1390
0
                                           (char *)&i32, &out_sz);
1391
0
                if (r) return r;
1392
0
                if (decode_md || decode_nm) {
1393
0
                    if (ref_pos + i32 > s->ref_end)
1394
0
                        goto beyond_slice;
1395
0
                    if (md_dist >= 0 && decode_md)
1396
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1397
0
                    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1398
0
                        if (decode_md) {
1399
0
                            BLOCK_APPEND_CHAR(s->aux_blk, '^');
1400
0
                            BLOCK_APPEND(s->aux_blk,
1401
0
                                         &s->ref[ref_pos - s->ref_start +1],
1402
0
                                         i32);
1403
0
                            md_dist = 0;
1404
0
                        }
1405
0
                        nm += i32;
1406
0
                    } else {
1407
0
                        uint32_t dlen;
1408
0
                        if (bfd->ref[cr->ref_id].len >= ref_pos) {
1409
0
                            if (decode_md) {
1410
0
                                BLOCK_APPEND_CHAR(s->aux_blk, '^');
1411
0
                                BLOCK_APPEND(s->aux_blk,
1412
0
                                             &s->ref[ref_pos - s->ref_start+1],
1413
0
                                             bfd->ref[cr->ref_id].len-ref_pos);
1414
0
                                BLOCK_APPEND_UINT(s->aux_blk, 0);
1415
0
                            }
1416
0
                            dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1417
0
                            nm += i32 - dlen;
1418
0
                        } else {
1419
0
                            dlen = i32;
1420
0
                        }
1421
1422
0
                        md_dist = -1;
1423
0
                    }
1424
0
                }
1425
0
                cig_op = BAM_CDEL;
1426
0
                cig_len += i32;
1427
0
                ref_pos += i32;
1428
                //printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1429
0
            }
1430
0
            break;
1431
0
        }
1432
1433
0
        case 'I': { // Insertion (several bases); IN
1434
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1435
1436
0
            if (cig_len && cig_op != BAM_CINS) {
1437
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1438
0
                cig_len = 0;
1439
0
            }
1440
1441
0
            if (ds & CRAM_IN) {
1442
0
                if (!codecs[DS_IN]) return -1;
1443
0
                r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
1444
0
                                           cr->len ? &seq[pos-1] : NULL,
1445
0
                                           &out_sz2);
1446
0
                if (r) return r;
1447
0
                cig_op = BAM_CINS;
1448
0
                cig_len += out_sz2;
1449
0
                seq_pos += out_sz2;
1450
0
                nm      += out_sz2;
1451
                //printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1452
0
            }
1453
0
            break;
1454
0
        }
1455
1456
0
        case 'i': { // Insertion (single base); BA
1457
0
            if (cig_len && cig_op != BAM_CINS) {
1458
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1459
0
                cig_len = 0;
1460
0
            }
1461
0
            if (ds & CRAM_BA) {
1462
0
                if (!codecs[DS_BA]) return -1;
1463
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1464
0
                                           cr->len ? &seq[pos-1] : NULL,
1465
0
                                           &out_sz);
1466
0
                if (r) return r;
1467
0
            }
1468
0
            cig_op = BAM_CINS;
1469
0
            cig_len++;
1470
0
            seq_pos++;
1471
0
            nm++;
1472
0
            break;
1473
0
        }
1474
1475
0
        case 'b': { // Several bases
1476
0
            int32_t len = cr->len ? cr->len-(pos-1) : 1;
1477
1478
0
            if (cig_len && cig_op != BAM_CMATCH) {
1479
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1480
0
                cig_len = 0;
1481
0
            }
1482
1483
0
            if (ds & CRAM_BB) {
1484
0
                if (!codecs[DS_BB]) return -1;
1485
0
                r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
1486
0
                                           cr->len ? &seq[pos-1] : NULL,
1487
0
                                           &len);
1488
0
                if (r) return r;
1489
1490
0
                if (decode_md || decode_nm) {
1491
0
                    int x;
1492
0
                    if (md_dist >= 0 && decode_md)
1493
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1494
1495
0
                    for (x = 0; x < len; x++) {
1496
0
                        if (x && decode_md)
1497
0
                            BLOCK_APPEND_UINT(s->aux_blk, 0);
1498
0
                        if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1499
0
                            md_dist = -1;
1500
0
                            break;
1501
0
                        } else {
1502
0
                            if (decode_md) {
1503
0
                                if (ref_pos + x > s->ref_end)
1504
0
                                    goto beyond_slice;
1505
0
                                char r = s->ref[ref_pos+x-s->ref_start +1];
1506
0
                                BLOCK_APPEND_CHAR(s->aux_blk, r);
1507
0
                            }
1508
0
                        }
1509
0
                    }
1510
1511
0
                    nm += x;
1512
0
                    md_dist = 0;
1513
0
                }
1514
0
            }
1515
1516
0
            cig_op = BAM_CMATCH;
1517
1518
0
            cig_len+=len;
1519
0
            seq_pos+=len;
1520
0
            ref_pos+=len;
1521
            //prev_pos+=len;
1522
0
            break;
1523
0
        }
1524
1525
0
        case 'q': { // Several quality values
1526
0
            int32_t len = cr->len ? cr->len - (pos-1) : 1;
1527
1528
0
            if (cig_len && cig_op != BAM_CMATCH) {
1529
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1530
0
                cig_len = 0;
1531
0
            }
1532
1533
0
            if (ds & CRAM_QQ) {
1534
0
                if (!codecs[DS_QQ]) return -1;
1535
0
                if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1536
0
                    && (unsigned char)*qual == 255)
1537
0
                    memset(qual, 30, cr->len); // ?
1538
0
                r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
1539
0
                                           (char *)&qual[pos-1], &len);
1540
0
                if (r) return r;
1541
0
            }
1542
1543
0
            cig_op = BAM_CMATCH;
1544
1545
            //prev_pos+=len;
1546
0
            break;
1547
0
        }
1548
1549
0
        case 'B': { // Read base; BA, QS
1550
#ifdef USE_X
1551
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1552
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1553
                cig_len = 0;
1554
            }
1555
#else
1556
0
            if (cig_len && cig_op != BAM_CMATCH) {
1557
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1558
0
                cig_len = 0;
1559
0
            }
1560
0
#endif
1561
0
            if (ds & CRAM_BA) {
1562
0
                if (!codecs[DS_BA]) return -1;
1563
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1564
0
                                           cr->len ? &seq[pos-1] : NULL,
1565
0
                                           &out_sz);
1566
1567
0
                if (decode_md || decode_nm) {
1568
0
                    if (md_dist >= 0 && decode_md)
1569
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1570
0
                    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1571
0
                        md_dist = -1;
1572
0
                    } else {
1573
0
                        if (decode_md) {
1574
0
                            if (ref_pos > s->ref_end)
1575
0
                                goto beyond_slice;
1576
0
                            BLOCK_APPEND_CHAR(s->aux_blk,
1577
0
                                              s->ref[ref_pos-s->ref_start +1]);
1578
0
                        }
1579
0
                        nm++;
1580
0
                        md_dist = 0;
1581
0
                    }
1582
0
                }
1583
0
            }
1584
0
            if (ds & CRAM_QS) {
1585
0
                if (!codecs[DS_QS]) return -1;
1586
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1587
0
                    && (unsigned char)*qual == 255)
1588
0
                    memset(qual, 30, cr->len); // ASCII ?.  Same as htsjdk
1589
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1590
0
                                           (char *)&qual[pos-1], &out_sz);
1591
0
            }
1592
#ifdef USE_X
1593
            cig_op = BAM_CBASE_MISMATCH;
1594
#else
1595
0
            cig_op = BAM_CMATCH;
1596
0
#endif
1597
0
            cig_len++;
1598
0
            seq_pos++;
1599
0
            ref_pos++;
1600
            //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1601
0
            break;
1602
0
        }
1603
1604
0
        case 'Q': { // Quality score; QS
1605
0
            if (ds & CRAM_QS) {
1606
0
                if (!codecs[DS_QS]) return -1;
1607
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
1608
0
                    (unsigned char)*qual == 255)
1609
0
                    memset(qual, 30, cr->len); // ?
1610
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1611
0
                                           (char *)&qual[pos-1], &out_sz);
1612
                //printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1613
0
            }
1614
0
            break;
1615
0
        }
1616
1617
0
        case 'H': { // hard clip; HC
1618
0
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
1619
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1620
0
                cig_len = 0;
1621
0
            }
1622
0
            if (ds & CRAM_HC) {
1623
0
                if (!codecs[DS_HC]) return -1;
1624
0
                r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
1625
0
                                           (char *)&i32, &out_sz);
1626
0
                if (r) return r;
1627
0
                cig_op = BAM_CHARD_CLIP;
1628
0
                cig_len += i32;
1629
0
            }
1630
0
            break;
1631
0
        }
1632
1633
0
        case 'P': { // padding; PD
1634
0
            if (cig_len && cig_op != BAM_CPAD) {
1635
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1636
0
                cig_len = 0;
1637
0
            }
1638
0
            if (ds & CRAM_PD) {
1639
0
                if (!codecs[DS_PD]) return -1;
1640
0
                r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
1641
0
                                           (char *)&i32, &out_sz);
1642
0
                if (r) return r;
1643
0
                cig_op = BAM_CPAD;
1644
0
                cig_len += i32;
1645
0
            }
1646
0
            break;
1647
0
        }
1648
1649
0
        case 'N': { // Ref skip; RS
1650
0
            if (cig_len && cig_op != BAM_CREF_SKIP) {
1651
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1652
0
                cig_len = 0;
1653
0
            }
1654
0
            if (ds & CRAM_RS) {
1655
0
                if (!codecs[DS_RS]) return -1;
1656
0
                r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
1657
0
                                           (char *)&i32, &out_sz);
1658
0
                if (r) return r;
1659
0
                cig_op = BAM_CREF_SKIP;
1660
0
                cig_len += i32;
1661
0
                ref_pos += i32;
1662
0
            }
1663
0
            break;
1664
0
        }
1665
1666
0
        default:
1667
0
            hts_log_error("Unknown feature code '%c'", op);
1668
0
            return -1;
1669
0
        }
1670
0
    }
1671
1672
0
    if (!(ds & CRAM_FC))
1673
0
        goto skip_cigar;
1674
1675
    /* An implicit match op for any unaccounted for bases */
1676
0
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1677
0
        if (s->ref && cr->ref_id >= 0) {
1678
0
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1679
0
                static int whinged = 0;
1680
0
                int rlen;
1681
0
                if (!whinged)
1682
0
                    hts_log_warning("Ref pos outside of ref sequence boundary");
1683
0
                whinged = 1;
1684
0
                rlen = bfd->ref[cr->ref_id].len - ref_pos;
1685
                // May miss MD/NM cases where both seq/ref are N, but this is a
1686
                // malformed cram file anyway.
1687
0
                if (rlen > 0) {
1688
0
                    if (seq_pos-1 + rlen < cr->len)
1689
0
                        memcpy(&seq[seq_pos-1],
1690
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1691
0
                    if ((cr->len - seq_pos + 1) - rlen > 0)
1692
0
                        memset(&seq[seq_pos-1+rlen], 'N',
1693
0
                               (cr->len - seq_pos + 1) - rlen);
1694
0
                } else {
1695
0
                    if (cr->len - seq_pos + 1 > 0)
1696
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1697
0
                }
1698
0
                if (md_dist >= 0)
1699
0
                    md_dist += cr->len - seq_pos + 1;
1700
0
            } else {
1701
0
                if (cr->len - seq_pos + 1 > 0) {
1702
0
                    if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
1703
0
                        goto beyond_slice;
1704
0
                    int remainder = cr->len - (seq_pos-1);
1705
0
                    int j = ref_pos - s->ref_start + 1;
1706
0
                    if (decode_md || decode_nm) {
1707
0
                        int i;
1708
0
                        char *N = memchr(&s->ref[j], 'N', remainder);
1709
0
                        if (!N) {
1710
                            // short cut the common case
1711
0
                            md_dist += cr->len - (seq_pos-1);
1712
0
                        } else {
1713
0
                            char *refp = &s->ref[j-(seq_pos-1)];
1714
0
                            md_dist += N-&s->ref[j];
1715
0
                            int i_start = seq_pos-1 + (N - &s->ref[j]);
1716
0
                            for (i = i_start; i < cr->len; i++) {
1717
0
                                char base = refp[i];
1718
0
                                if (base == 'N') {
1719
0
                                    if (add_md_char(s, decode_md, 'N',
1720
0
                                                    &md_dist) < 0)
1721
0
                                        return -1;
1722
0
                                    nm++;
1723
0
                                } else {
1724
0
                                    md_dist++;
1725
0
                                }
1726
0
                            }
1727
0
                        }
1728
0
                    }
1729
0
                    memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
1730
0
                }
1731
0
                ref_pos += cr->len - seq_pos + 1;
1732
0
            }
1733
0
        } else if (cr->ref_id >= 0) {
1734
            // So alignment end can be computed even when not decoding sequence
1735
0
            ref_pos += cr->len - seq_pos + 1;
1736
0
        }
1737
1738
0
        if (ncigar+1 >= cigar_alloc) {
1739
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1740
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1741
0
                return -1;
1742
0
            s->cigar = cigar;
1743
0
        }
1744
#ifdef USE_X
1745
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
1746
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1747
            cig_len = 0;
1748
        }
1749
        cig_op = BAM_CBASE_MATCH;
1750
#else
1751
0
        if (cig_len && cig_op != BAM_CMATCH) {
1752
0
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1753
0
            cig_len = 0;
1754
0
        }
1755
0
        cig_op = BAM_CMATCH;
1756
0
#endif
1757
0
        cig_len += cr->len - seq_pos+1;
1758
0
    }
1759
1760
0
 skip_cigar:
1761
1762
0
    if ((ds & CRAM_FN) && decode_md) {
1763
0
        if (md_dist >= 0)
1764
0
            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1765
0
    }
1766
1767
0
    if (cig_len) {
1768
0
        if (ncigar >= cigar_alloc) {
1769
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1770
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1771
0
                return -1;
1772
0
            s->cigar = cigar;
1773
0
        }
1774
1775
0
        cigar[ncigar++] = (cig_len<<4) + cig_op;
1776
0
    }
1777
1778
0
    cr->ncigar = ncigar - cr->cigar;
1779
0
    cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
1780
1781
    //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1782
1783
0
    if (ds & CRAM_MQ) {
1784
0
        if (!codecs[DS_MQ]) return -1;
1785
0
        r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
1786
0
                                   (char *)&cr->mqual, &out_sz);
1787
0
    } else {
1788
0
        cr->mqual = 40;
1789
0
    }
1790
1791
0
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1792
0
        int32_t out_sz2 = cr->len;
1793
1794
0
        if (!codecs[DS_QS]) return -1;
1795
0
        r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1796
0
                                   qual, &out_sz2);
1797
0
    }
1798
1799
0
    s->cigar = cigar;
1800
0
    s->cigar_alloc = cigar_alloc;
1801
0
    s->ncigar = ncigar;
1802
1803
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1804
0
        cr->len = 0;
1805
1806
0
    if (decode_md) {
1807
0
        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1808
0
        size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
1809
0
        if (has_MD < 0) {
1810
            // has_MD < 0; already have MDZ allocated in aux at -has_MD,
1811
            // but wrote MD to end of aux (at orig_aux).
1812
            // We need some memmoves to shuffle it around.
1813
0
            char tmp_MD_[1024], *tmp_MD = tmp_MD_;
1814
0
            unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
1815
0
            if (sz > 1024) {
1816
0
                tmp_MD = malloc(sz);
1817
0
                if (!tmp_MD)
1818
0
                    return -1;
1819
0
            }
1820
0
            memcpy(tmp_MD, orig_aux_p, sz);
1821
0
            memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
1822
0
                    &BLOCK_DATA(s->aux_blk)[-has_MD],
1823
0
                    orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
1824
0
            memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
1825
0
            if (tmp_MD != tmp_MD_)
1826
0
                free(tmp_MD);
1827
1828
0
            if (-has_NM > -has_MD)
1829
                // we inserted before NM, so move it up a bit
1830
0
                has_NM -= sz;
1831
0
        }
1832
        // else has_MD == 0 and we've already appended MD to the end.
1833
1834
0
        cr->aux_size += sz;
1835
0
    }
1836
1837
0
    if (decode_nm) {
1838
0
        if (has_NM == 0) {
1839
0
            char buf[7];
1840
0
            size_t buf_size;
1841
0
            buf[0] = 'N'; buf[1] = 'M';
1842
0
            if (nm <= UINT8_MAX) {
1843
0
                buf_size = 4;
1844
0
                buf[2] = 'C';
1845
0
                buf[3] = (nm>> 0) & 0xff;
1846
0
            } else if (nm <= UINT16_MAX) {
1847
0
                buf_size = 5;
1848
0
                buf[2] = 'S';
1849
0
                buf[3] = (nm>> 0) & 0xff;
1850
0
                buf[4] = (nm>> 8) & 0xff;
1851
0
            } else {
1852
0
                buf_size = 7;
1853
0
                buf[2] = 'I';
1854
0
                buf[3] = (nm>> 0) & 0xff;
1855
0
                buf[4] = (nm>> 8) & 0xff;
1856
0
                buf[5] = (nm>>16) & 0xff;
1857
0
                buf[6] = (nm>>24) & 0xff;
1858
0
            }
1859
0
            BLOCK_APPEND(s->aux_blk, buf, buf_size);
1860
0
            cr->aux_size += buf_size;
1861
0
        } else {
1862
            // Preallocated space for NM at -has_NM into aux block
1863
0
            unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
1864
0
            buf[0] = (nm>> 0) & 0xff;
1865
0
            buf[1] = (nm>> 8) & 0xff;
1866
0
            buf[2] = (nm>>16) & 0xff;
1867
0
            buf[3] = (nm>>24) & 0xff;
1868
0
        }
1869
0
    }
1870
1871
0
    return r;
1872
1873
0
 beyond_slice:
1874
    // Cramtools can create CRAMs that have sequence features outside the
1875
    // stated range of the container & slice reference extents (start + span).
1876
    // We have to check for these in many places, but for brevity have the
1877
    // error reporting in only one.
1878
0
    hts_log_error("CRAM CIGAR extends beyond slice reference extents");
1879
0
    return -1;
1880
1881
0
 block_err:
1882
0
    return -1;
1883
0
}
1884
1885
/*
1886
 * Quick and simple hash lookup for cram_map arrays
1887
 */
1888
0
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1889
0
    cram_map *m;
1890
1891
0
    m = map[CRAM_MAP(key[0],key[1])];
1892
0
    while (m && m->key != id)
1893
0
        m= m->next;
1894
1895
0
    return m;
1896
0
}
1897
1898
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1899
1900
1901
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1902
0
                               cram_block *blk, cram_record *cr) {
1903
0
    int i, r = 0, out_sz = 1;
1904
0
    unsigned char ntags;
1905
1906
0
    if (!c->comp_hdr->codecs[DS_TC]) return -1;
1907
0
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1908
0
                                            (char *)&ntags, &out_sz);
1909
0
    cr->ntags = ntags;
1910
1911
    //printf("TC=%d\n", cr->ntags);
1912
0
    cr->aux_size = 0;
1913
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1914
1915
0
    for (i = 0; i < cr->ntags; i++) {
1916
0
        int32_t id, out_sz = 1;
1917
0
        unsigned char tag_data[3];
1918
0
        cram_map *m;
1919
1920
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1921
0
        if (!c->comp_hdr->codecs[DS_TN]) return -1;
1922
0
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1923
0
                                                blk, (char *)&id, &out_sz);
1924
0
        if (out_sz == 3) {
1925
            // Tag name stored as 3 chars instead of an int?
1926
0
            memcpy(tag_data, &id, 3);
1927
0
        } else {
1928
0
            tag_data[0] = (id>>16) & 0xff;
1929
0
            tag_data[1] = (id>>8)  & 0xff;
1930
0
            tag_data[2] = id       & 0xff;
1931
0
        }
1932
1933
0
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1934
0
        if (!m)
1935
0
            return -1;
1936
0
        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1937
1938
0
        if (!m->codec) return -1;
1939
0
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1940
1941
0
        cr->aux_size += out_sz + 3;
1942
0
    }
1943
1944
0
    return r;
1945
1946
0
 block_err:
1947
0
    return -1;
1948
0
}
1949
1950
// has_MD and has_NM are filled out with 0 for none present,
1951
// 1 for present and verbatim, and -pos for present as placeholder
1952
// (MD*, NM*) to be generated and filled out at offset +pos.
1953
static int cram_decode_aux(cram_fd *fd,
1954
                           cram_container *c, cram_slice *s,
1955
                           cram_block *blk, cram_record *cr,
1956
0
                           int *has_MD, int *has_NM) {
1957
0
    int i, r = 0, out_sz = 1;
1958
0
    int32_t TL = 0;
1959
0
    unsigned char *TN;
1960
0
    uint32_t ds = s->data_series;
1961
1962
0
    if (!(ds & (CRAM_TL|CRAM_aux))) {
1963
0
        cr->aux = 0;
1964
0
        cr->aux_size = 0;
1965
0
        return 0;
1966
0
    }
1967
1968
0
    if (!c->comp_hdr->codecs[DS_TL]) return -1;
1969
0
    r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1970
0
                                            (char *)&TL, &out_sz);
1971
0
    if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1972
0
        return -1;
1973
1974
0
    TN = c->comp_hdr->TL[TL];
1975
0
    cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1976
1977
    //printf("TC=%d\n", cr->ntags);
1978
0
    cr->aux_size = 0;
1979
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1980
1981
0
    if (!(ds & CRAM_aux))
1982
0
        return 0;
1983
1984
0
    for (i = 0; i < cr->ntags; i++) {
1985
0
        int32_t id, out_sz = 1;
1986
0
        unsigned char tag_data[7];
1987
0
        cram_map *m;
1988
1989
0
        if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1990
0
            *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
1991
0
        if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1992
0
            *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
1993
1994
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1995
0
        tag_data[0] = TN[0];
1996
0
        tag_data[1] = TN[1];
1997
0
        tag_data[2] = TN[2];
1998
0
        id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
1999
2000
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
2001
            // Place holder, fill out contents later.
2002
0
            int tag_data_size;
2003
0
            if (TN[0] == 'N' && TN[1] == 'M') {
2004
                // Use a fixed size, so we can allocate room for it now.
2005
0
                memcpy(&tag_data[2], "I\0\0\0\0", 5);
2006
0
                tag_data_size = 7;
2007
0
            } else if (TN[0] == 'R' && TN[1] == 'G') {
2008
                // RG is variable size, but known already.  Insert now
2009
0
                TN += 3;
2010
                // Equiv to fd->header->hrecs->rg[cr->rg], but this is the
2011
                // new header API equivalent.
2012
0
                const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
2013
0
                if (!rg)
2014
0
                    continue;
2015
2016
0
                size_t rg_len = strlen(rg);
2017
0
                tag_data[2] = 'Z';
2018
0
                BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2019
0
                BLOCK_APPEND(s->aux_blk, rg, rg_len);
2020
0
                BLOCK_APPEND_CHAR(s->aux_blk, '\0');
2021
0
                cr->aux_size += 3 + rg_len + 1;
2022
0
                cr->rg = -1; // prevents auto-add later
2023
0
                continue;
2024
0
            } else {
2025
                // Unknown size.  We'll insert MD into stream later.
2026
0
                tag_data[2] = 'Z';
2027
0
                tag_data_size = 3;
2028
0
            }
2029
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
2030
0
            cr->aux_size += tag_data_size;
2031
0
            TN += 3;
2032
0
        } else {
2033
0
            TN += 3;
2034
0
            m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2035
0
            if (!m)
2036
0
                return -1;
2037
2038
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2039
2040
0
            if (!m->codec) return -1;
2041
0
            r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2042
0
            if (r) break;
2043
0
            cr->aux_size += out_sz + 3;
2044
2045
            // cF CRAM flags.
2046
0
            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
2047
                // Remove cF tag
2048
0
                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
2049
0
                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
2050
0
                cr->aux_size -= out_sz+3;
2051
2052
                // bit 1 => don't auto-decode MD.
2053
                // Pretend MD is present verbatim, so we don't auto-generate
2054
0
                if ((cF & 1) && has_MD && *has_MD == 0)
2055
0
                    *has_MD = 1;
2056
2057
                // bit 1 => don't auto-decode NM
2058
0
                if ((cF & 2) && has_NM && *has_NM == 0)
2059
0
                    *has_NM = 1;
2060
0
            }
2061
0
        }
2062
2063
        // We could go to 2^32 fine, but we shouldn't be hitting this anyway,
2064
        // and it's protecting against memory hogs too.
2065
0
        if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
2066
0
            hts_log_error("CRAM->BAM aux block size overflow");
2067
0
            goto block_err;
2068
0
        }
2069
0
    }
2070
2071
0
    return r;
2072
2073
0
 block_err:
2074
0
    return -1;
2075
0
}
2076
2077
/* Resolve mate pair cross-references between recs within this slice */
2078
567
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2079
567
    int rec;
2080
2081
567
    if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2082
0
        for (rec = 0; rec < s->hdr->num_records; rec++) {
2083
0
            cram_record *cr = &s->crecs[rec];
2084
2085
0
            cr->tlen = 0;
2086
0
            cr->mate_pos = 0;
2087
0
            cr->mate_ref_id = -1;
2088
0
        }
2089
2090
0
        return 0;
2091
0
    }
2092
2093
567
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2094
0
        cram_record *cr = &s->crecs[rec];
2095
2096
0
        if (cr->mate_line >= 0) {
2097
0
            if (cr->mate_line < s->hdr->num_records) {
2098
                /*
2099
                 * On the first read, loop through computing lengths.
2100
                 * It's not perfect as we have one slice per reference so we
2101
                 * cannot detect when TLEN should be zero due to seqs that
2102
                 * map to multiple references.
2103
                 *
2104
                 * We also cannot set tlen correct when it spans a slice for
2105
                 * other reasons. This may make tlen too small. Should we
2106
                 * fix this by forcing TLEN to be stored verbatim in such cases?
2107
                 *
2108
                 * Or do we just admit defeat and output 0 for tlen? It's the
2109
                 * safe option...
2110
                 */
2111
0
                if (cr->tlen == INT64_MIN) {
2112
0
                    int id1 = rec, id2 = rec;
2113
0
                    int64_t aleft = cr->apos, aright = cr->aend;
2114
0
                    int64_t tlen;
2115
0
                    int ref = cr->ref_id;
2116
2117
                    // number of segments starting at the same point.
2118
0
                    int left_cnt = 0;
2119
2120
0
                    do {
2121
0
                        if (aleft > s->crecs[id2].apos)
2122
0
                            aleft = s->crecs[id2].apos, left_cnt = 1;
2123
0
                        else if (aleft == s->crecs[id2].apos)
2124
0
                            left_cnt++;
2125
0
                        if (aright < s->crecs[id2].aend)
2126
0
                            aright = s->crecs[id2].aend;
2127
0
                        if (s->crecs[id2].mate_line == -1) {
2128
0
                            s->crecs[id2].mate_line = rec;
2129
0
                            break;
2130
0
                        }
2131
0
                        if (s->crecs[id2].mate_line <= id2 ||
2132
0
                            s->crecs[id2].mate_line >= s->hdr->num_records)
2133
0
                            return -1;
2134
0
                        id2 = s->crecs[id2].mate_line;
2135
2136
0
                        if (s->crecs[id2].ref_id != ref)
2137
0
                            ref = -1;
2138
0
                    } while (id2 != id1);
2139
2140
0
                    if (ref != -1) {
2141
0
                        tlen = aright - aleft + 1;
2142
0
                        id1 = id2 = rec;
2143
2144
                        /*
2145
                         * When we have two seqs with identical start and
2146
                         * end coordinates, set +/- tlen based on 1st/last
2147
                         * bit flags instead, as a tie breaker.
2148
                         */
2149
0
                        if (s->crecs[id2].apos == aleft) {
2150
0
                            if (left_cnt == 1 ||
2151
0
                                (s->crecs[id2].flags & BAM_FREAD1))
2152
0
                                s->crecs[id2].tlen = tlen;
2153
0
                            else
2154
0
                                s->crecs[id2].tlen = -tlen;
2155
0
                        } else {
2156
0
                            s->crecs[id2].tlen = -tlen;
2157
0
                        }
2158
2159
0
                        id2 = s->crecs[id2].mate_line;
2160
0
                        while (id2 != id1) {
2161
0
                            if (s->crecs[id2].apos == aleft) {
2162
0
                                if (left_cnt == 1 ||
2163
0
                                    (s->crecs[id2].flags & BAM_FREAD1))
2164
0
                                    s->crecs[id2].tlen = tlen;
2165
0
                                else
2166
0
                                    s->crecs[id2].tlen = -tlen;
2167
0
                            } else {
2168
0
                                s->crecs[id2].tlen = -tlen;
2169
0
                            }
2170
0
                            id2 = s->crecs[id2].mate_line;
2171
0
                        }
2172
0
                    } else {
2173
0
                        id1 = id2 = rec;
2174
2175
0
                        s->crecs[id2].tlen = 0;
2176
0
                        id2 = s->crecs[id2].mate_line;
2177
0
                        while (id2 != id1) {
2178
0
                            s->crecs[id2].tlen = 0;
2179
0
                            id2 = s->crecs[id2].mate_line;
2180
0
                        }
2181
0
                    }
2182
0
                }
2183
2184
0
                cr->mate_pos = s->crecs[cr->mate_line].apos;
2185
0
                cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2186
2187
                // paired
2188
0
                cr->flags |= BAM_FPAIRED;
2189
2190
                // set mate unmapped if needed
2191
0
                if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2192
0
                    cr->flags |= BAM_FMUNMAP;
2193
0
                    cr->tlen = 0;
2194
0
                }
2195
0
                if (cr->flags & BAM_FUNMAP) {
2196
0
                    cr->tlen = 0;
2197
0
                }
2198
2199
                // set mate reversed if needed
2200
0
                if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2201
0
                    cr->flags |= BAM_FMREVERSE;
2202
0
            } else {
2203
0
                hts_log_error("Mate line out of bounds: %d vs [0, %d]",
2204
0
                              cr->mate_line, s->hdr->num_records-1);
2205
0
            }
2206
2207
            /* FIXME: construct read names here too if needed */
2208
0
        } else {
2209
0
            if (cr->mate_flags & CRAM_M_REVERSE) {
2210
0
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2211
0
            }
2212
0
            if (cr->mate_flags & CRAM_M_UNMAP) {
2213
0
                cr->flags |= BAM_FMUNMAP;
2214
                //cr->mate_ref_id = -1;
2215
0
            }
2216
0
            if (!(cr->flags & BAM_FPAIRED))
2217
0
                cr->mate_ref_id = -1;
2218
0
        }
2219
2220
0
        if (cr->tlen == INT64_MIN)
2221
0
            cr->tlen = 0; // Just incase
2222
0
    }
2223
2224
567
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2225
0
        cram_record *cr = &s->crecs[rec];
2226
0
        if (cr->explicit_tlen != INT64_MIN)
2227
0
            cr->tlen = cr->explicit_tlen;
2228
0
    }
2229
2230
567
    return 0;
2231
567
}
2232
2233
0
static char *md5_print(unsigned char *md5, char *out) {
2234
0
    int i;
2235
0
    for (i = 0; i < 16; i++) {
2236
0
        out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2237
0
        out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2238
0
    }
2239
0
    out[32] = 0;
2240
2241
0
    return out;
2242
0
}
2243
2244
/*
2245
 * Utility function to decode tlen (ISIZE), as it's called
2246
 * in multiple places.
2247
 *
2248
 * Returns codec return value (0 on success).
2249
 */
2250
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
2251
0
                            cram_block *blk, int64_t *tlen) {
2252
0
    int out_sz = 1, r = 0;
2253
2254
0
    if (!c->comp_hdr->codecs[DS_TS]) return -1;
2255
0
    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2256
0
        int32_t i32;
2257
0
        r |= c->comp_hdr->codecs[DS_TS]
2258
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2259
0
                     (char *)&i32, &out_sz);
2260
0
        *tlen = i32;
2261
0
    } else {
2262
0
        r |= c->comp_hdr->codecs[DS_TS]
2263
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2264
0
                     (char *)tlen, &out_sz);
2265
0
    }
2266
0
    return r;
2267
0
}
2268
2269
/*
2270
 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2271
 * Returns 0 on success
2272
 *        -1 on failure
2273
 */
2274
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2275
660
                      sam_hdr_t *sh) {
2276
660
    cram_block *blk = s->block[0];
2277
660
    int32_t bf, ref_id;
2278
660
    unsigned char cf;
2279
660
    int out_sz, r = 0;
2280
660
    int rec;
2281
660
    char *seq = NULL, *qual = NULL;
2282
660
    int unknown_rg = -1;
2283
660
    int embed_ref;
2284
660
    char **refs = NULL;
2285
660
    uint32_t ds;
2286
660
    sam_hrecs_t *bfd = sh->hrecs;
2287
2288
660
    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2289
7
        return -1;
2290
2291
653
    ds = s->data_series;
2292
2293
653
    blk->bit = 7; // MSB first
2294
2295
    // Study the blocks and estimate approx sizes to preallocate.
2296
    // This looks to speed up decoding by around 8-9%.
2297
    // We can always shrink back down at the end if we overestimated.
2298
    // However it's likely that this also saves memory as own growth
2299
    // factor (*=1.5) is never applied.
2300
653
    {
2301
653
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2302
653
        int qsize=0, nsize=0, q_id=0;
2303
#else
2304
        int qsize, nsize, q_id;
2305
        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2306
        //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2307
#endif
2308
2309
653
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2310
653
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2311
653
        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2312
2313
        // To do - consider using q_id here to usurp the quality block and
2314
        // avoid a memcpy during decode.
2315
        // Specifically when quality is an external block uniquely used by
2316
        // DS_QS only, then we can set s->qual_blk directly to this
2317
        // block and save the codec->decode() calls. (Approx 3% cpu saving)
2318
653
    }
2319
2320
    /* Look for unknown RG, added as last by Java CRAM? */
2321
653
    if (bfd->nrg > 0 &&
2322
653
        bfd->rg[bfd->nrg-1].name != NULL &&
2323
653
        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2324
0
        unknown_rg = bfd->nrg-1;
2325
2326
653
    if (blk->content_type != CORE)
2327
11
        return -1;
2328
2329
642
    if (s->crecs)
2330
0
        free(s->crecs);
2331
642
    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2332
3
        return -1;
2333
2334
639
    ref_id = s->hdr->ref_seq_id;
2335
639
    if (CRAM_MAJOR_VERS(fd->version) < 4)
2336
639
       embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2337
0
    else
2338
0
       embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
2339
2340
639
    if (ref_id >= 0) {
2341
591
        if (embed_ref) {
2342
568
            cram_block *b;
2343
568
            if (s->hdr->ref_base_id < 0) {
2344
0
                hts_log_error("No reference specified and no embedded reference is available"
2345
0
                              " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2346
0
                              s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2347
0
                return -1;
2348
0
            }
2349
568
            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2350
568
            if (!b)
2351
21
                return -1;
2352
547
            if (cram_uncompress_block(b) != 0)
2353
0
                return -1;
2354
547
            s->ref = (char *)BLOCK_DATA(b);
2355
547
            s->ref_start = s->hdr->ref_seq_start;
2356
547
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2357
547
            if (s->hdr->ref_seq_span > b->uncomp_size) {
2358
5
                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2359
5
                              ref_id, s->ref_start, s->ref_end);
2360
5
                return -1;
2361
5
            }
2362
547
        } else if (!c->comp_hdr->no_ref) {
2363
            //// Avoid Java cramtools bug by loading entire reference seq
2364
            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2365
            //s->ref_start = 1;
2366
2367
23
            if (fd->required_fields & SAM_SEQ) {
2368
23
                s->ref =
2369
23
                cram_get_ref(fd, s->hdr->ref_seq_id,
2370
23
                             s->hdr->ref_seq_start,
2371
23
                             s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2372
23
            }
2373
23
            s->ref_start = s->hdr->ref_seq_start;
2374
23
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2375
2376
            /* Sanity check */
2377
23
            if (s->ref_start < 0) {
2378
0
                hts_log_warning("Slice starts before base 1"
2379
0
                                " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2380
0
                                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2381
0
                s->ref_start = 0;
2382
0
            }
2383
23
            pthread_mutex_lock(&fd->ref_lock);
2384
23
            pthread_mutex_lock(&fd->refs->lock);
2385
23
            if ((fd->required_fields & SAM_SEQ) &&
2386
23
                ref_id < fd->refs->nref && fd->refs->ref_id &&
2387
23
                s->ref_end > fd->refs->ref_id[ref_id]->length) {
2388
0
                s->ref_end = fd->refs->ref_id[ref_id]->length;
2389
0
            }
2390
23
            pthread_mutex_unlock(&fd->refs->lock);
2391
23
            pthread_mutex_unlock(&fd->ref_lock);
2392
23
        }
2393
591
    }
2394
2395
613
    if ((fd->required_fields & SAM_SEQ) &&
2396
613
        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2397
23
        hts_log_error("Unable to fetch reference %s:%"PRId64"-%"PRId64,
2398
23
                      fd->refs->ref_id && ref_id >= 0 && ref_id < fd->refs->nref
2399
23
                      ? fd->refs->ref_id[ref_id]->name
2400
23
                      : "unknown",
2401
23
                      s->hdr->ref_seq_start,
2402
23
                      s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2403
23
        return -1;
2404
23
    }
2405
2406
590
    if (CRAM_MAJOR_VERS(fd->version) != 1
2407
590
        && (fd->required_fields & SAM_SEQ)
2408
590
        && s->hdr->ref_seq_id >= 0
2409
590
        && !fd->ignore_md5
2410
590
        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2411
0
        hts_md5_context *md5;
2412
0
        unsigned char digest[16];
2413
2414
0
        if (s->ref && s->hdr->ref_seq_id >= 0) {
2415
0
            int start, len;
2416
2417
0
            if (s->hdr->ref_seq_start >= s->ref_start) {
2418
0
                start = s->hdr->ref_seq_start - s->ref_start;
2419
0
            } else {
2420
0
                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2421
0
                                ref_id, s->ref_start, s->ref_end);
2422
0
                start = 0;
2423
0
            }
2424
2425
0
            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2426
0
                len = s->hdr->ref_seq_span;
2427
0
            } else {
2428
0
                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2429
0
                                ref_id, s->ref_start, s->ref_end);
2430
0
                len = s->ref_end - s->ref_start + 1;
2431
0
            }
2432
2433
0
            if (!(md5 = hts_md5_init()))
2434
0
                return -1;
2435
0
            if (start + len > s->ref_end - s->ref_start + 1)
2436
0
                len = s->ref_end - s->ref_start + 1 - start;
2437
0
            if (len >= 0)
2438
0
                hts_md5_update(md5, s->ref + start, len);
2439
0
            hts_md5_final(digest, md5);
2440
0
            hts_md5_destroy(md5);
2441
0
        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2442
0
            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2443
0
            if (b) {
2444
0
                if (!(md5 = hts_md5_init()))
2445
0
                    return -1;
2446
0
                hts_md5_update(md5, b->data, b->uncomp_size);
2447
0
                hts_md5_final(digest, md5);
2448
0
                hts_md5_destroy(md5);
2449
0
            }
2450
0
        }
2451
2452
0
        if (!c->comp_hdr->no_ref &&
2453
0
            ((!s->ref && s->hdr->ref_base_id < 0)
2454
0
             || memcmp(digest, s->hdr->md5, 16) != 0)) {
2455
0
            char M[33];
2456
0
            const char *rname = sam_hdr_tid2name(sh, ref_id);
2457
0
            if (!rname) rname="?"; // cannot happen normally
2458
0
            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
2459
0
                          rname, s->ref_start, s->ref_end);
2460
0
            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
2461
0
            hts_log_error("Ref   : %s", md5_print(digest, M));
2462
0
            kstring_t ks = KS_INITIALIZE;
2463
0
            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
2464
0
                hts_log_error("@SQ M5: %s", ks.s);
2465
0
            hts_log_error("Please check the reference given is correct");
2466
0
            ks_free(&ks);
2467
0
            return -1;
2468
0
        }
2469
0
    }
2470
2471
590
    if (ref_id == -2) {
2472
33
        pthread_mutex_lock(&fd->ref_lock);
2473
33
        pthread_mutex_lock(&fd->refs->lock);
2474
33
        refs = calloc(fd->refs->nref, sizeof(char *));
2475
33
        pthread_mutex_unlock(&fd->refs->lock);
2476
33
        pthread_mutex_unlock(&fd->ref_lock);
2477
33
        if (!refs)
2478
0
            return -1;
2479
33
    }
2480
2481
590
    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2482
590
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2483
23
        cram_record *cr = &s->crecs[rec];
2484
23
        int has_MD, has_NM;
2485
2486
        //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2487
2488
23
        cr->s = s;
2489
2490
23
        out_sz = 1; /* decode 1 item */
2491
23
        if (ds & CRAM_BF) {
2492
23
            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
2493
0
            r |= c->comp_hdr->codecs[DS_BF]
2494
0
                            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2495
0
                                     (char *)&bf, &out_sz);
2496
0
            if (r || bf < 0 ||
2497
0
                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2498
0
                goto block_err;
2499
0
            bf = fd->bam_flag_swap[bf];
2500
0
            cr->flags = bf;
2501
0
        } else {
2502
0
            cr->flags = bf = 0x4; // unmapped
2503
0
        }
2504
2505
0
        if (ds & CRAM_CF) {
2506
0
            if (CRAM_MAJOR_VERS(fd->version) == 1) {
2507
                /* CF is byte in 1.0, int32 in 2.0 */
2508
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2509
0
                r |= c->comp_hdr->codecs[DS_CF]
2510
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2511
0
                                         (char *)&cf, &out_sz);
2512
0
                if (r) goto block_err;
2513
0
                cr->cram_flags = cf;
2514
0
            } else {
2515
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2516
0
                r |= c->comp_hdr->codecs[DS_CF]
2517
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2518
0
                                         (char *)&cr->cram_flags, &out_sz);
2519
0
                if (r) goto block_err;
2520
0
                cf = cr->cram_flags;
2521
0
            }
2522
0
        } else {
2523
0
            cf = cr->cram_flags = 0;
2524
0
        }
2525
2526
0
        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2527
0
            if (ds & CRAM_RI) {
2528
0
                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
2529
0
                r |= c->comp_hdr->codecs[DS_RI]
2530
0
                                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2531
0
                                         (char *)&cr->ref_id, &out_sz);
2532
0
                if (r) goto block_err;
2533
0
                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2534
0
                    && cr->ref_id >= 0
2535
0
                    && cr->ref_id != last_ref_id) {
2536
0
                    if (!c->comp_hdr->no_ref) {
2537
                        // Range(fd):  seq >= 0, unmapped -1, unspecified   -2
2538
                        // Slice(s):   seq >= 0, unmapped -1, multiple refs -2
2539
                        // Record(cr): seq >= 0, unmapped -1
2540
0
                        pthread_mutex_lock(&fd->range_lock);
2541
0
                        int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
2542
0
                        pthread_mutex_unlock(&fd->range_lock);
2543
0
                        if  (need_ref) {
2544
0
                            if (!refs[cr->ref_id])
2545
0
                                refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
2546
0
                            if (!(s->ref = refs[cr->ref_id]))
2547
0
                                goto block_err;
2548
0
                        } else {
2549
                            // For multi-ref containers, we don't need to fetch all
2550
                            // refs if we're only querying one.
2551
0
                            s->ref = NULL;
2552
0
                        }
2553
2554
0
                        pthread_mutex_lock(&fd->range_lock);
2555
0
                        int discard_last_ref = (last_ref_id >= 0 &&
2556
0
                                                refs[last_ref_id] &&
2557
0
                                                (fd->range.refid == -2 ||
2558
0
                                                 last_ref_id == fd->range.refid));
2559
0
                        pthread_mutex_unlock(&fd->range_lock);
2560
0
                        if (discard_last_ref) {
2561
0
                            pthread_mutex_lock(&fd->ref_lock);
2562
0
                            discard_last_ref = !fd->unsorted;
2563
0
                            pthread_mutex_unlock(&fd->ref_lock);
2564
0
                        }
2565
0
                        if (discard_last_ref) {
2566
0
                            cram_ref_decr(fd->refs, last_ref_id);
2567
0
                            refs[last_ref_id] = NULL;
2568
0
                        }
2569
0
                    }
2570
0
                    s->ref_start = 1;
2571
0
                    pthread_mutex_lock(&fd->ref_lock);
2572
0
                    pthread_mutex_lock(&fd->refs->lock);
2573
0
                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2574
0
                    pthread_mutex_unlock(&fd->refs->lock);
2575
0
                    pthread_mutex_unlock(&fd->ref_lock);
2576
2577
0
                    last_ref_id = cr->ref_id;
2578
0
                }
2579
0
            } else {
2580
0
                cr->ref_id = -1;
2581
0
            }
2582
0
        } else {
2583
0
            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2584
0
        }
2585
0
        if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
2586
0
            hts_log_error("Requested unknown reference ID %d", cr->ref_id);
2587
0
            goto block_err;
2588
0
        }
2589
2590
0
        if (ds & CRAM_RL) {
2591
0
            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
2592
0
            r |= c->comp_hdr->codecs[DS_RL]
2593
0
                            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2594
0
                                     (char *)&cr->len, &out_sz);
2595
0
            if (r) goto block_err;
2596
0
            if (cr->len < 0) {
2597
0
                hts_log_error("Read has negative length");
2598
0
                goto block_err;
2599
0
            }
2600
0
        }
2601
2602
0
        if (ds & CRAM_AP) {
2603
0
            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
2604
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
2605
0
                r |= c->comp_hdr->codecs[DS_AP]
2606
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2607
0
                                         (char *)&cr->apos, &out_sz);
2608
0
            } else  {
2609
0
                int32_t i32;
2610
0
                r |= c->comp_hdr->codecs[DS_AP]
2611
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2612
0
                                         (char *)&i32, &out_sz);
2613
0
                cr->apos = i32;
2614
0
            }
2615
0
            if (r) goto block_err;;
2616
0
            if (c->comp_hdr->AP_delta) {
2617
0
                if (cr->apos < 0 && c->unsorted == 0) {
2618
                    // cache locally in c->unsorted so we don't have an
2619
                    // excessive number of locks
2620
0
                    pthread_mutex_lock(&fd->ref_lock);
2621
0
                    c->unsorted = fd->unsorted = 1;
2622
0
                    pthread_mutex_unlock(&fd->ref_lock);
2623
0
                }
2624
0
                cr->apos += s->last_apos;
2625
0
            }
2626
0
            s->last_apos=  cr->apos;
2627
0
        } else {
2628
0
            cr->apos = c->ref_seq_start;
2629
0
        }
2630
2631
0
        if (ds & CRAM_RG) {
2632
0
            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
2633
0
            r |= c->comp_hdr->codecs[DS_RG]
2634
0
                           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2635
0
                                    (char *)&cr->rg, &out_sz);
2636
0
            if (r) goto block_err;
2637
0
            if (cr->rg == unknown_rg)
2638
0
                cr->rg = -1;
2639
0
        } else {
2640
0
            cr->rg = -1;
2641
0
        }
2642
2643
0
        cr->name_len = 0;
2644
2645
0
        if (c->comp_hdr->read_names_included) {
2646
0
            int32_t out_sz2 = 1; // block auto grows in decode()
2647
2648
            // Read directly into name cram_block
2649
0
            cr->name = BLOCK_SIZE(s->name_blk);
2650
0
            if (ds & CRAM_RN) {
2651
0
                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2652
0
                r |= c->comp_hdr->codecs[DS_RN]
2653
0
                                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2654
0
                                         (char *)s->name_blk, &out_sz2);
2655
0
                if (r) goto block_err;
2656
0
                cr->name_len = out_sz2;
2657
0
            }
2658
0
        }
2659
2660
0
        cr->mate_pos = 0;
2661
0
        cr->mate_line = -1;
2662
0
        cr->mate_ref_id = -1;
2663
0
        cr->explicit_tlen = INT64_MIN;
2664
0
        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2665
0
            if (ds & CRAM_MF) {
2666
0
                if (CRAM_MAJOR_VERS(fd->version) == 1) {
2667
                    /* MF is byte in 1.0, int32 in 2.0 */
2668
0
                    unsigned char mf;
2669
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2670
0
                    r |= c->comp_hdr->codecs[DS_MF]
2671
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2672
0
                                             blk, (char *)&mf, &out_sz);
2673
0
                    if (r) goto block_err;
2674
0
                    cr->mate_flags = mf;
2675
0
                } else {
2676
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2677
0
                    r |= c->comp_hdr->codecs[DS_MF]
2678
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2679
0
                                             blk,
2680
0
                                             (char *)&cr->mate_flags,
2681
0
                                             &out_sz);
2682
0
                    if (r) goto block_err;
2683
0
                }
2684
0
            } else {
2685
0
                cr->mate_flags = 0;
2686
0
            }
2687
2688
0
            if (!c->comp_hdr->read_names_included) {
2689
0
                int32_t out_sz2 = 1;
2690
2691
                // Read directly into name cram_block
2692
0
                cr->name = BLOCK_SIZE(s->name_blk);
2693
0
                if (ds & CRAM_RN) {
2694
0
                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2695
0
                    r |= c->comp_hdr->codecs[DS_RN]
2696
0
                                    ->decode(s, c->comp_hdr->codecs[DS_RN],
2697
0
                                             blk, (char *)s->name_blk,
2698
0
                                             &out_sz2);
2699
0
                    if (r) goto block_err;
2700
0
                    cr->name_len = out_sz2;
2701
0
                }
2702
0
            }
2703
2704
0
            if (ds & CRAM_NS) {
2705
0
                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
2706
0
                r |= c->comp_hdr->codecs[DS_NS]
2707
0
                                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2708
0
                                         (char *)&cr->mate_ref_id, &out_sz);
2709
0
                if (r) goto block_err;
2710
0
            }
2711
2712
            // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2713
            // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2714
            //     /* Paired, but unmapped */
2715
            //     cr->flags |= BAM_FMUNMAP;
2716
            // }
2717
2718
0
            if (ds & CRAM_NP) {
2719
0
                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
2720
0
                if (CRAM_MAJOR_VERS(fd->version) < 4) {
2721
0
                    int32_t i32;
2722
0
                    r |= c->comp_hdr->codecs[DS_NP]
2723
0
                                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2724
0
                                             (char *)&i32, &out_sz);
2725
0
                    cr->mate_pos = i32;
2726
0
                } else {
2727
0
                    r |= c->comp_hdr->codecs[DS_NP]
2728
0
                        ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2729
0
                                 (char *)&cr->mate_pos, &out_sz);
2730
0
                }
2731
0
                if (r) goto block_err;
2732
0
            }
2733
2734
0
            if (ds & CRAM_TS) {
2735
0
                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
2736
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
2737
0
                if (r) goto block_err;
2738
0
            } else {
2739
0
                cr->tlen = INT64_MIN;
2740
0
            }
2741
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2742
            // else not detached
2743
0
            if (ds & CRAM_NF) {
2744
0
                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
2745
0
                r |= c->comp_hdr->codecs[DS_NF]
2746
0
                                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2747
0
                                         (char *)&cr->mate_line, &out_sz);
2748
0
                if (r) goto block_err;
2749
0
                cr->mate_line += rec + 1;
2750
2751
                //cr->name_len = sprintf(name, "%d", name_id++);
2752
                //cr->name = DSTRING_LEN(name_ds);
2753
                //dstring_nappend(name_ds, name, cr->name_len);
2754
2755
0
                cr->mate_ref_id = -1;
2756
0
                cr->tlen = INT64_MIN;
2757
0
                cr->mate_pos = 0;
2758
0
            } else  {
2759
0
                cr->mate_flags = 0;
2760
0
                cr->tlen = INT64_MIN;
2761
0
            }
2762
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2763
0
                if (ds & CRAM_TS) {
2764
0
                    r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2765
0
                    if (r) return r;
2766
0
                } else {
2767
0
                    cr->mate_flags = 0;
2768
0
                    cr->tlen = INT64_MIN;
2769
0
                }
2770
0
            }
2771
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2772
0
            if (ds & CRAM_TS) {
2773
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2774
0
                if (r) return r;
2775
0
            } else {
2776
0
                cr->mate_flags = 0;
2777
0
                cr->tlen = INT64_MIN;
2778
0
            }
2779
0
        } else {
2780
0
            cr->mate_flags = 0;
2781
0
            cr->tlen = INT64_MIN;
2782
0
        }
2783
        /*
2784
        else if (!name[0]) {
2785
            //name[0] = '?'; name[1] = 0;
2786
            //cr->name_len = 1;
2787
            //cr->name=  DSTRING_LEN(s->name_ds);
2788
            //dstring_nappend(s->name_ds, "?", 1);
2789
2790
            cr->mate_ref_id = -1;
2791
            cr->tlen = 0;
2792
            cr->mate_pos = 0;
2793
        }
2794
        */
2795
2796
        /* Auxiliary tags */
2797
0
        has_MD = has_NM = 0;
2798
0
        if (CRAM_MAJOR_VERS(fd->version) == 1)
2799
0
            r |= cram_decode_aux_1_0(c, s, blk, cr);
2800
0
        else
2801
0
            r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
2802
0
        if (r) goto block_err;
2803
2804
        /* Fake up dynamic string growth and appending */
2805
0
        if (ds & CRAM_RL) {
2806
0
            cr->seq = BLOCK_SIZE(s->seqs_blk);
2807
0
            BLOCK_RESIZE(s->seqs_blk, cr->seq + cr->len);
2808
0
            seq = (char *)BLOCK_END(s->seqs_blk);
2809
0
            BLOCK_SIZE(s->seqs_blk) += cr->len;
2810
2811
0
            if (!seq)
2812
0
                goto block_err;
2813
2814
0
            cr->qual = BLOCK_SIZE(s->qual_blk);
2815
0
            BLOCK_RESIZE(s->qual_blk, cr->qual + cr->len);
2816
0
            qual = (char *)BLOCK_END(s->qual_blk);
2817
0
            BLOCK_SIZE(s->qual_blk) += cr->len;
2818
2819
0
            if (!s->ref)
2820
0
                memset(seq, '=', cr->len);
2821
0
        }
2822
2823
0
        if (!(bf & BAM_FUNMAP)) {
2824
0
            if ((ds & CRAM_AP) && cr->apos <= 0) {
2825
0
                hts_log_error("Read has alignment position %"PRId64
2826
0
                              " but no unmapped flag",
2827
0
                              cr->apos);
2828
0
                goto block_err;
2829
0
            }
2830
            /* Decode sequence and generate CIGAR */
2831
0
            if (ds & (CRAM_SEQ | CRAM_MQ)) {
2832
0
                r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
2833
0
                                     has_MD, has_NM);
2834
0
                if (r) goto block_err;
2835
0
            } else {
2836
0
                cr->cigar = 0;
2837
0
                cr->ncigar = 0;
2838
0
                cr->aend = cr->apos;
2839
0
                cr->mqual = 0;
2840
0
            }
2841
0
        } else {
2842
0
            int out_sz2 = cr->len;
2843
2844
            //puts("Unmapped");
2845
0
            cr->cigar = 0;
2846
0
            cr->ncigar = 0;
2847
0
            cr->aend = cr->apos;
2848
0
            cr->mqual = 0;
2849
2850
0
            if (ds & CRAM_BA && cr->len) {
2851
0
                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
2852
0
                r |= c->comp_hdr->codecs[DS_BA]
2853
0
                                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2854
0
                                         (char *)seq, &out_sz2);
2855
0
                if (r) goto block_err;
2856
0
            }
2857
2858
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2859
0
                out_sz2 = cr->len;
2860
0
                if (ds & CRAM_QS && cr->len >= 0) {
2861
0
                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
2862
0
                    r |= c->comp_hdr->codecs[DS_QS]
2863
0
                                    ->decode(s, c->comp_hdr->codecs[DS_QS],
2864
0
                                             blk, qual, &out_sz2);
2865
0
                    if (r) goto block_err;
2866
0
                }
2867
0
            } else {
2868
0
                if (ds & CRAM_RL)
2869
0
                    memset(qual, 255, cr->len);
2870
0
            }
2871
0
        }
2872
2873
0
        if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
2874
0
            int i, j;
2875
0
            for (i = 0, j = cr->len-1; i < j; i++, j--) {
2876
0
                unsigned char c;
2877
0
                c = qual[i];
2878
0
                qual[i] = qual[j];
2879
0
                qual[j] = c;
2880
0
            }
2881
0
        }
2882
0
    }
2883
2884
567
    pthread_mutex_lock(&fd->ref_lock);
2885
567
    if (refs) {
2886
18
        int i;
2887
36
        for (i = 0; i < fd->refs->nref; i++) {
2888
18
            if (refs[i])
2889
0
                cram_ref_decr(fd->refs, i);
2890
18
        }
2891
18
        free(refs);
2892
18
        refs = NULL;
2893
549
    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2894
0
        cram_ref_decr(fd->refs, ref_id);
2895
0
    }
2896
567
    pthread_mutex_unlock(&fd->ref_lock);
2897
2898
    /* Resolve mate pair cross-references between recs within this slice */
2899
567
    r |= cram_decode_slice_xref(s, fd->required_fields);
2900
2901
    // Free the original blocks as we no longer need these.
2902
567
    {
2903
567
        int i;
2904
2.97k
        for (i = 0; i < s->hdr->num_blocks; i++) {
2905
2.40k
            cram_block *b = s->block[i];
2906
2.40k
            cram_free_block(b);
2907
2.40k
            s->block[i] = NULL;
2908
2.40k
        }
2909
567
    }
2910
2911
    // Also see initial BLOCK_RESIZE_EXACT at top of function.
2912
    // As we grow blocks we overallocate by up to 50%. So shrink
2913
    // back to their final sizes here.
2914
    //
2915
    //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2916
    //      (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2917
    //      (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2918
    //      (int)s->name_blk->byte, (int)s->name_blk->alloc,
2919
    //      (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2920
567
    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2921
567
    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2922
567
    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2923
567
    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2924
2925
567
    return r;
2926
2927
23
 block_err:
2928
23
    if (refs) {
2929
15
        int i;
2930
15
        pthread_mutex_lock(&fd->ref_lock);
2931
45
        for (i = 0; i < fd->refs->nref; i++) {
2932
30
            if (refs[i])
2933
0
                cram_ref_decr(fd->refs, i);
2934
30
        }
2935
15
        free(refs);
2936
15
        pthread_mutex_unlock(&fd->ref_lock);
2937
15
    }
2938
2939
23
    return -1;
2940
567
}
2941
2942
typedef struct {
2943
    cram_fd *fd;
2944
    cram_container *c;
2945
    cram_slice *s;
2946
    sam_hdr_t *h;
2947
    int exit_code;
2948
} cram_decode_job;
2949
2950
0
void *cram_decode_slice_thread(void *arg) {
2951
0
    cram_decode_job *j = (cram_decode_job *)arg;
2952
2953
0
    j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2954
2955
0
    return j;
2956
0
}
2957
2958
/*
2959
 * Spawn a multi-threaded version of cram_decode_slice().
2960
 */
2961
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2962
660
                         sam_hdr_t *bfd) {
2963
660
    cram_decode_job *j;
2964
660
    int nonblock;
2965
2966
660
    if (!fd->pool)
2967
660
        return cram_decode_slice(fd, c, s, bfd);
2968
2969
0
    if (!(j = malloc(sizeof(*j))))
2970
0
        return -1;
2971
2972
0
    j->fd = fd;
2973
0
    j->c  = c;
2974
0
    j->s  = s;
2975
0
    j->h  = bfd;
2976
2977
0
    nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2978
2979
0
    int saved_errno = errno;
2980
0
    errno = 0;
2981
0
    if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2982
0
                                  j, nonblock)) {
2983
        /* Would block */
2984
0
        if (errno != EAGAIN)
2985
0
            return -1;
2986
0
        fd->job_pending = j;
2987
0
    } else {
2988
0
        fd->job_pending = NULL;
2989
0
    }
2990
0
    errno = saved_errno;
2991
2992
    // flush too
2993
0
    return 0;
2994
0
}
2995
2996
2997
/* ----------------------------------------------------------------------
2998
 * CRAM sequence iterators.
2999
 */
3000
3001
/*
3002
 * Converts a cram in-memory record into a bam in-memory record. We
3003
 * pass a pointer to a bam_seq_t pointer along with the a pointer to
3004
 * the allocated size. These can initially be pointers to NULL and zero.
3005
 *
3006
 * This function will reallocate the bam buffer as required and update
3007
 * (*bam)->alloc accordingly, allowing it to be used within a loop
3008
 * efficiently without needing to allocate new bam objects over and
3009
 * over again.
3010
 *
3011
 * Returns the used size of the bam record on success
3012
 *         -1 on failure.
3013
 */
3014
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
3015
0
                cram_record *cr, int rec, bam_seq_t **bam) {
3016
0
    int ret, rg_len;
3017
0
    char name_a[1024], *name;
3018
0
    int name_len;
3019
0
    char *aux;
3020
0
    char *seq, *qual;
3021
0
    sam_hrecs_t *bfd = sh->hrecs;
3022
3023
    /* Assign names if not explicitly set */
3024
0
    if (fd->required_fields & SAM_QNAME) {
3025
0
        if (cr->name_len) {
3026
0
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3027
0
            name_len = cr->name_len;
3028
0
        } else {
3029
0
            name = name_a;
3030
0
            if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
3031
0
                s->crecs[cr->mate_line].name_len > 0) {
3032
                // Copy our mate if non-zero.
3033
0
                memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
3034
0
                       s->crecs[cr->mate_line].name_len);
3035
0
                name = name_a + s->crecs[cr->mate_line].name_len;
3036
0
            } else {
3037
                // Otherwise generate a name based on prefix
3038
0
                name_len = strlen(fd->prefix);
3039
0
                memcpy(name, fd->prefix, name_len);
3040
0
                name += name_len;
3041
0
                *name++ = ':';
3042
0
                if (cr->mate_line >= 0 && cr->mate_line < rec) {
3043
0
                    name = (char *)append_uint64((unsigned char *)name,
3044
0
                                                 s->hdr->record_counter +
3045
0
                                                 cr->mate_line + 1);
3046
0
                } else {
3047
0
                    name = (char *)append_uint64((unsigned char *)name,
3048
0
                                                 s->hdr->record_counter +
3049
0
                                                 rec + 1);
3050
0
                }
3051
0
            }
3052
0
            name_len = name - name_a;
3053
0
            name = name_a;
3054
0
        }
3055
0
    } else {
3056
0
        name = "?";
3057
0
        name_len = 1;
3058
0
    }
3059
3060
    /* Generate BAM record */
3061
0
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
3062
0
        return -1;
3063
0
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3064
3065
0
    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3066
0
        if (!BLOCK_DATA(s->seqs_blk))
3067
0
            return -1;
3068
0
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3069
0
    } else {
3070
0
        seq = "*";
3071
0
        cr->len = 0;
3072
0
    }
3073
3074
0
    if (fd->required_fields & SAM_QUAL) {
3075
0
        if (!BLOCK_DATA(s->qual_blk))
3076
0
            return -1;
3077
0
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3078
0
    } else {
3079
0
        qual = NULL;
3080
0
    }
3081
3082
0
    ret = bam_set1(*bam,
3083
0
                   name_len, name,
3084
0
                   cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
3085
0
                   cr->ncigar, &s->cigar[cr->cigar],
3086
0
                   cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
3087
0
                   cr->len, seq, qual,
3088
0
                   cr->aux_size + rg_len);
3089
0
    if (ret < 0) {
3090
0
        return ret;
3091
0
    }
3092
3093
0
    aux = (char *)bam_aux(*bam);
3094
3095
    /* Auxiliary strings */
3096
0
    if (cr->aux_size != 0) {
3097
0
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3098
0
        aux += cr->aux_size;
3099
0
        (*bam)->l_data += cr->aux_size;
3100
0
    }
3101
3102
    /* RG:Z: */
3103
0
    if (rg_len > 0) {
3104
0
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3105
0
        int len = bfd->rg[cr->rg].name_len;
3106
0
        memcpy(aux, bfd->rg[cr->rg].name, len);
3107
0
        aux += len;
3108
0
        *aux++ = 0;
3109
0
        (*bam)->l_data += rg_len;
3110
0
    }
3111
3112
0
    return (*bam)->l_data;
3113
0
}
3114
3115
/*
3116
 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3117
 */
3118
3119
/*
3120
 * Load first container.
3121
 * Called when fd->ctr is NULL>
3122
 *
3123
 * Returns container on success
3124
 *        NULL on failure.
3125
 */
3126
7.57k
static cram_container *cram_first_slice(cram_fd *fd) {
3127
7.57k
    cram_container *c;
3128
3129
20.0k
    do {
3130
20.0k
        if (fd->ctr)
3131
12.4k
            cram_free_container(fd->ctr);
3132
3133
20.0k
        if (!(c = fd->ctr = cram_read_container(fd)))
3134
639
            return NULL;
3135
19.4k
        c->curr_slice_mt = c->curr_slice;
3136
19.4k
    } while (c->length == 0);
3137
3138
    /*
3139
     * The first container may be a result of a sub-range query.
3140
     * In which case it may still not be the optimal starting point
3141
     * due to skipped containers/slices in the index.
3142
     */
3143
    // No need for locks here as we're in the main thread.
3144
6.93k
    if (fd->range.refid != -2) {
3145
0
        while (c->ref_seq_id != -2 &&
3146
0
               (c->ref_seq_id < fd->range.refid ||
3147
0
                (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3148
0
                 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3149
0
            if (0 != cram_seek(fd, c->length, SEEK_CUR))
3150
0
                return NULL;
3151
0
            cram_free_container(fd->ctr);
3152
0
            do {
3153
0
                if (!(c = fd->ctr = cram_read_container(fd)))
3154
0
                    return NULL;
3155
0
            } while (c->length == 0);
3156
0
        }
3157
3158
0
        if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3159
0
            fd->eof = 1;
3160
0
            return NULL;
3161
0
        }
3162
0
    }
3163
3164
6.93k
    if (!(c->comp_hdr_block = cram_read_block(fd)))
3165
219
        return NULL;
3166
6.72k
    if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3167
87
        return NULL;
3168
3169
6.63k
    c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3170
6.63k
    if (!c->comp_hdr)
3171
6.27k
        return NULL;
3172
362
    if (!c->comp_hdr->AP_delta &&
3173
362
        sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3174
0
        pthread_mutex_lock(&fd->ref_lock);
3175
0
        fd->unsorted = 1;
3176
0
        pthread_mutex_unlock(&fd->ref_lock);
3177
0
    }
3178
3179
362
    return c;
3180
6.63k
}
3181
3182
8.14k
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3183
8.14k
    cram_container *c_curr;  // container being consumed via cram_get_seq()
3184
8.14k
    cram_slice *s_curr = NULL;
3185
3186
    // Populate the first container if unknown.
3187
8.14k
    if (!(c_curr = fd->ctr)) {
3188
7.57k
        if (!(c_curr = cram_first_slice(fd)))
3189
7.21k
            return NULL;
3190
7.57k
    }
3191
3192
    // Discard previous slice
3193
929
    if ((s_curr = c_curr->slice)) {
3194
567
        c_curr->slice = NULL;
3195
567
        cram_free_slice(s_curr);
3196
567
        s_curr = NULL;
3197
567
    }
3198
3199
    // If we've consumed all slices in this container, also discard
3200
    // the container too.
3201
929
    if (c_curr->curr_slice == c_curr->max_slice) {
3202
106
        if (fd->ctr == c_curr)
3203
106
            fd->ctr = NULL;
3204
106
        if (fd->ctr_mt == c_curr)
3205
39
            fd->ctr_mt = NULL;
3206
106
        cram_free_container(c_curr);
3207
106
        c_curr = NULL;
3208
106
    }
3209
3210
929
    if (!fd->ctr_mt)
3211
401
        fd->ctr_mt = c_curr;
3212
3213
    // Fetch the next slice (and the container if necessary).
3214
    //
3215
    // If single threaded this loop bails out as soon as it finds
3216
    // a slice in range.  In this case c_next and c_curr end up being
3217
    // the same thing.
3218
    //
3219
    // If multi-threaded, we loop until we have filled out
3220
    // thread pool input queue.  Here c_next and c_curr *may* differ, as
3221
    // can fd->ctr and fd->ctr_mt.
3222
929
    for (;;) {
3223
929
        cram_container *c_next = fd->ctr_mt;
3224
929
        cram_slice *s_next = NULL;
3225
3226
        // Next slice; either from the last job we failed to push
3227
        // to the input queue or via more I/O.
3228
929
        if (fd->job_pending) {
3229
0
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3230
0
            c_next = j->c;
3231
0
            s_next = j->s;
3232
0
            free(fd->job_pending);
3233
0
            fd->job_pending = NULL;
3234
929
        } else if (!fd->ooc) {
3235
1.05k
        empty_container:
3236
1.05k
            if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3237
                // new container
3238
6.28k
                for(;;) {
3239
6.28k
                    if (!(c_next = cram_read_container(fd))) {
3240
34
                        if (fd->pool) {
3241
0
                            fd->ooc = 1;
3242
0
                            break;
3243
0
                        }
3244
3245
34
                        return NULL;
3246
34
                    }
3247
6.25k
                    c_next->curr_slice_mt = c_next->curr_slice;
3248
3249
6.25k
                    if (c_next->length != 0)
3250
194
                        break;
3251
3252
6.06k
                    cram_free_container(c_next);
3253
6.06k
                }
3254
194
                if (fd->ooc)
3255
0
                    break;
3256
3257
//                printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd,
3258
//                       c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1,
3259
//                       fd->range.refid, fd->range.start, fd->range.end);
3260
3261
                /* Skip containers not yet spanning our range */
3262
194
                if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3263
                    // ref_id beyond end of range; bail out
3264
0
                    if (c_next->ref_seq_id != fd->range.refid) {
3265
0
                        cram_free_container(c_next);
3266
0
                        fd->ctr_mt = NULL;
3267
0
                        fd->ooc = 1;
3268
0
                        break;
3269
0
                    }
3270
3271
                    // position beyond end of range; bail out
3272
0
                    if (fd->range.refid != -1 &&
3273
0
                        c_next->ref_seq_start > fd->range.end) {
3274
0
                        cram_free_container(c_next);
3275
0
                        fd->ctr_mt = NULL;
3276
0
                        fd->ooc = 1;
3277
0
                        break;
3278
0
                    }
3279
3280
                    // before start of range; skip to next container
3281
0
                    if (fd->range.refid != -1 &&
3282
0
                        c_next->ref_seq_start + c_next->ref_seq_span-1 <
3283
0
                        fd->range.start) {
3284
0
                        c_next->curr_slice_mt = c_next->max_slice;
3285
0
                        cram_seek(fd, c_next->length, SEEK_CUR);
3286
0
                        cram_free_container(c_next);
3287
0
                        c_next = NULL;
3288
0
                        continue;
3289
0
                    }
3290
0
                }
3291
3292
                // Container is valid range, so remember it for restarting
3293
                // this function.
3294
194
                fd->ctr_mt = c_next;
3295
3296
194
                if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3297
13
                    return NULL;
3298
181
                if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3299
23
                    return NULL;
3300
3301
158
                c_next->comp_hdr =
3302
158
                    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3303
158
                if (!c_next->comp_hdr)
3304
23
                    return NULL;
3305
3306
135
                if (!c_next->comp_hdr->AP_delta &&
3307
135
                    sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3308
0
                    pthread_mutex_lock(&fd->ref_lock);
3309
0
                    fd->unsorted = 1;
3310
0
                    pthread_mutex_unlock(&fd->ref_lock);
3311
0
                }
3312
135
            }
3313
3314
958
            if (c_next->num_records == 0) {
3315
122
                if (fd->ctr == c_next)
3316
11
                    fd->ctr = NULL;
3317
122
                if (c_curr == c_next)
3318
11
                    c_curr = NULL;
3319
122
                if (fd->ctr_mt == c_next)
3320
122
                    fd->ctr_mt = NULL;
3321
122
                cram_free_container(c_next);
3322
122
                c_next = NULL;
3323
122
                goto empty_container;
3324
122
            }
3325
3326
836
            if (!(s_next = c_next->slice = cram_read_slice(fd)))
3327
176
                return NULL;
3328
3329
660
            s_next->slice_num = ++c_next->curr_slice_mt;
3330
660
            s_next->curr_rec = 0;
3331
660
            s_next->max_rec = s_next->hdr->num_records;
3332
3333
660
            s_next->last_apos = s_next->hdr->ref_seq_start;
3334
3335
            // We know the container overlaps our range, but with multi-slice
3336
            // containers we may have slices that do not.  Skip these also.
3337
660
            if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3338
                // ref_id beyond end of range; bail out
3339
0
                if (s_next->hdr->ref_seq_id != fd->range.refid) {
3340
0
                    fd->ooc = 1;
3341
0
                    cram_free_slice(s_next);
3342
0
                    c_next->slice = s_next = NULL;
3343
0
                    break;
3344
0
                }
3345
3346
                // position beyond end of range; bail out
3347
0
                if (fd->range.refid != -1 &&
3348
0
                    s_next->hdr->ref_seq_start > fd->range.end) {
3349
0
                    fd->ooc = 1;
3350
0
                    cram_free_slice(s_next);
3351
0
                    c_next->slice = s_next = NULL;
3352
0
                    break;
3353
0
                }
3354
3355
                // before start of range; skip to next slice
3356
0
                if (fd->range.refid != -1 &&
3357
0
                    s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3358
0
                    fd->range.start) {
3359
0
                    cram_free_slice(s_next);
3360
0
                    c_next->slice = s_next = NULL;
3361
0
                    continue;
3362
0
                }
3363
0
            }
3364
660
        } // end: if (!fd->ooc)
3365
3366
660
        if (!c_next || !s_next)
3367
0
            break;
3368
3369
        // Decode the slice, either right now (non-threaded) or by pushing
3370
        // it to the a decode queue (threaded).
3371
660
        if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3372
93
            hts_log_error("Failure to decode slice");
3373
93
            cram_free_slice(s_next);
3374
93
            c_next->slice = NULL;
3375
93
            return NULL;
3376
93
        }
3377
3378
        // No thread pool, so don't loop again
3379
567
        if (!fd->pool) {
3380
567
            c_curr = c_next;
3381
567
            s_curr = s_next;
3382
567
            break;
3383
567
        }
3384
3385
        // With thread pool, but we have a job pending so our decode queue
3386
        // is full.
3387
0
        if (fd->job_pending)
3388
0
            break;
3389
3390
        // Otherwise we're threaded with room in the decode input queue, so
3391
        // keep reading slices for decode.
3392
        // Push it a bit far, to qsize in queue rather than pending arrival,
3393
        // as cram tends to be a bit bursty in decode timings.
3394
0
        if (hts_tpool_process_len(fd->rqueue) >
3395
0
            hts_tpool_process_qsize(fd->rqueue))
3396
0
            break;
3397
0
    } // end of for(;;)
3398
3399
3400
    // When not threaded we've already have c_curr and s_curr.
3401
    // Otherwise we need get them by pulling off the decode output queue.
3402
567
    if (fd->pool) {
3403
0
        hts_tpool_result *res;
3404
0
        cram_decode_job *j;
3405
3406
0
        if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
3407
0
            fd->eof = 1;
3408
0
            return NULL;
3409
0
        }
3410
3411
0
        res = hts_tpool_next_result_wait(fd->rqueue);
3412
3413
0
        if (!res || !hts_tpool_result_data(res)) {
3414
0
            hts_log_error("Call to hts_tpool_next_result failed");
3415
0
            return NULL;
3416
0
        }
3417
3418
0
        j = (cram_decode_job *)hts_tpool_result_data(res);
3419
0
        c_curr = j->c;
3420
0
        s_curr = j->s;
3421
3422
0
        if (j->exit_code != 0) {
3423
0
            hts_log_error("Slice decode failure");
3424
0
            fd->eof = 0;
3425
0
            hts_tpool_delete_result(res, 1);
3426
0
            return NULL;
3427
0
        }
3428
3429
0
        hts_tpool_delete_result(res, 1);
3430
0
    }
3431
3432
567
    *cp = c_curr;
3433
3434
    // Update current slice being processed (as opposed to current
3435
    // slice in the multi-threaded reahead.
3436
567
    fd->ctr = c_curr;
3437
567
    if (c_curr) {
3438
567
        c_curr->slice = s_curr;
3439
567
        if (s_curr)
3440
567
            c_curr->curr_slice = s_curr->slice_num;
3441
567
    }
3442
567
    if (s_curr)
3443
567
        s_curr->curr_rec = 0;
3444
0
    else
3445
0
        fd->eof = 1;
3446
3447
567
    return s_curr;
3448
567
}
3449
3450
/*
3451
 * Read the next cram record and return it.
3452
 * Note that to decode cram_record the caller will need to look up some data
3453
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3454
 * the next call to cram_get_seq (which may invalidate it).
3455
 *
3456
 * Returns record pointer on success (do not free)
3457
 *        NULL on failure
3458
 */
3459
7.57k
cram_record *cram_get_seq(cram_fd *fd) {
3460
7.57k
    cram_container *c;
3461
7.57k
    cram_slice *s;
3462
3463
8.14k
    for (;;) {
3464
8.14k
        c = fd->ctr;
3465
8.14k
        if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3466
0
            s = c->slice;
3467
8.14k
        } else {
3468
8.14k
            if (!(s = cram_next_slice(fd, &c)))
3469
7.57k
                return NULL;
3470
567
            continue; /* In case slice contains no records */
3471
8.14k
        }
3472
3473
        // No need to lock here as get_seq is running in the main thread,
3474
        // which is also the same one that does the range modifications.
3475
0
        if (fd->range.refid != -2) {
3476
0
            if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3477
                // Special case when looking for unmapped blocks at end.
3478
                // If these are mixed in with mapped data (c->ref_id == -2)
3479
                // then we need skip until we find the unmapped data, if at all
3480
0
                s->curr_rec++;
3481
0
                continue;
3482
0
            }
3483
0
            if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3484
0
                s->crecs[s->curr_rec].ref_id != -1) {
3485
                // Looking for a mapped read, but not there yet.  Special case
3486
                // as -1 (unmapped) shouldn't be considered < refid.
3487
0
                s->curr_rec++;
3488
0
                continue;
3489
0
            }
3490
3491
0
            if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3492
0
                fd->eof = 1;
3493
0
                cram_free_slice(s);
3494
0
                c->slice = NULL;
3495
0
                return NULL;
3496
0
            }
3497
3498
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3499
0
                fd->eof = 1;
3500
0
                cram_free_slice(s);
3501
0
                c->slice = NULL;
3502
0
                return NULL;
3503
0
            }
3504
3505
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3506
0
                s->curr_rec++;
3507
0
                continue;
3508
0
            }
3509
0
        }
3510
3511
0
        break;
3512
0
    }
3513
3514
0
    fd->ctr = c;
3515
0
    c->slice = s;
3516
0
    return &s->crecs[s->curr_rec++];
3517
7.57k
}
3518
3519
/*
3520
 * Read the next cram record and convert it to a bam_seq_t struct.
3521
 *
3522
 * Returns >= 0 success (number of bytes written to *bam)
3523
 *        -1 on EOF or failure (check fd->err)
3524
 */
3525
7.57k
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3526
7.57k
    cram_record *cr;
3527
7.57k
    cram_container *c;
3528
7.57k
    cram_slice *s;
3529
3530
7.57k
    if (!(cr = cram_get_seq(fd)))
3531
7.57k
        return -1;
3532
3533
0
    c = fd->ctr;
3534
0
    s = c->slice;
3535
3536
0
    return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
3537
7.57k
}
3538
3539
/*
3540
 * Drains and frees the decode read-queue for a multi-threaded reader.
3541
 */
3542
10.3k
void cram_drain_rqueue(cram_fd *fd) {
3543
10.3k
    cram_container *lc = NULL;
3544
3545
10.3k
    if (!fd->pool || !fd->rqueue)
3546
10.3k
        return;
3547
3548
    // drain queue of any in-flight decode jobs
3549
0
    while (!hts_tpool_process_empty(fd->rqueue)) {
3550
0
        hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
3551
0
        if (!r)
3552
0
            break;
3553
0
        cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
3554
0
        if (j->c->slice == j->s)
3555
0
            j->c->slice = NULL;
3556
0
        if (j->c != lc) {
3557
0
            if (lc) {
3558
0
                if (fd->ctr == lc)
3559
0
                    fd->ctr = NULL;
3560
0
                if (fd->ctr_mt == lc)
3561
0
                    fd->ctr_mt = NULL;
3562
0
                cram_free_container(lc);
3563
0
            }
3564
0
            lc = j->c;
3565
0
        }
3566
0
        cram_free_slice(j->s);
3567
0
        hts_tpool_delete_result(r, 1);
3568
0
    }
3569
3570
    // Also tidy up any pending decode job that we didn't submit to the workers
3571
    // due to the input queue being full.
3572
0
    if (fd->job_pending) {
3573
0
        cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3574
0
        if (j->c->slice == j->s)
3575
0
            j->c->slice = NULL;
3576
0
        if (j->c != lc) {
3577
0
            if (lc) {
3578
0
                if (fd->ctr == lc)
3579
0
                    fd->ctr = NULL;
3580
0
                if (fd->ctr_mt == lc)
3581
0
                    fd->ctr_mt = NULL;
3582
0
                cram_free_container(lc);
3583
0
            }
3584
0
            lc = j->c;
3585
0
        }
3586
0
        cram_free_slice(j->s);
3587
0
        free(j);
3588
0
        fd->job_pending = NULL;
3589
0
    }
3590
3591
0
    if (lc) {
3592
0
        if (fd->ctr == lc)
3593
0
            fd->ctr = NULL;
3594
0
        if (fd->ctr_mt == lc)
3595
0
            fd->ctr_mt = NULL;
3596
0
        cram_free_container(lc);
3597
0
    }
3598
0
}