Coverage Report

Created: 2025-12-20 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_decode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * - In-memory decoding of CRAM data structures.
33
 * - Iterator for reading CRAM record by record.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdio.h>
40
#include <errno.h>
41
#include <assert.h>
42
#include <stdlib.h>
43
#include <string.h>
44
#include <zlib.h>
45
#include <sys/types.h>
46
#include <sys/stat.h>
47
#include <math.h>
48
#include <stdint.h>
49
#include <inttypes.h>
50
51
#include "cram.h"
52
#include "os.h"
53
#include "../htslib/hts.h"
54
#include "../htslib/hfile.h"
55
56
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
57
//#define USE_X
58
59
/* ----------------------------------------------------------------------
60
 * CRAM compression headers
61
 */
62
63
/*
64
 * Decodes the Tag Dictionary record in the preservation map
65
 * Updates the cram compression header.
66
 *
67
 * Returns number of bytes decoded on success
68
 *        -1 on failure
69
 */
70
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
71
308
                   cram_block_compression_hdr *h) {
72
308
    char *op = cp;
73
308
    unsigned char *dat;
74
308
    cram_block *b;
75
308
    int32_t blk_size = 0;
76
308
    int nTL, i, sz, err = 0;
77
78
308
    if (!(b = cram_new_block(0, 0)))
79
0
        return -1;
80
81
308
    if (h->TD_blk || h->TL) {
82
174
        hts_log_warning("More than one TD block found in compression header");
83
174
        cram_free_block(h->TD_blk);
84
174
        free(h->TL);
85
174
        h->TD_blk = NULL;
86
174
        h->TL = NULL;
87
174
    }
88
89
    /* Decode */
90
308
    blk_size = fd->vv.varint_get32(&cp, endp, &err);
91
308
    if (!blk_size) {
92
104
        h->nTL = 0;
93
104
        cram_free_block(b);
94
104
        return cp - op;
95
104
    }
96
97
204
    if (err || blk_size < 0 || endp - cp < blk_size) {
98
12
        cram_free_block(b);
99
12
        return -1;
100
12
    }
101
102
192
    BLOCK_APPEND(b, cp, blk_size);
103
192
    cp += blk_size;
104
192
    sz = cp - op;
105
    // Force nul termination if missing
106
192
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
107
177
        BLOCK_APPEND_CHAR(b, '\0');
108
109
    /* Set up TL lookup table */
110
192
    dat = BLOCK_DATA(b);
111
112
    // Count
113
4.31k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
114
4.12k
        nTL++;
115
22.9k
        while (dat[i])
116
18.8k
            i++;
117
4.12k
    }
118
119
    // Copy
120
192
    if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
121
0
        cram_free_block(b);
122
0
        return -1;
123
0
    }
124
4.31k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
125
4.12k
        h->TL[nTL++] = &dat[i];
126
22.9k
        while (dat[i])
127
18.8k
            i++;
128
4.12k
    }
129
192
    h->TD_blk = b;
130
192
    h->nTL = nTL;
131
132
192
    return sz;
133
134
0
 block_err:
135
0
    cram_free_block(b);
136
0
    return -1;
137
192
}
138
139
/*
140
 * Decodes a CRAM block compression header.
141
 * Returns header ptr on success
142
 *         NULL on failure
143
 */
144
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
145
752
                                                           cram_block *b) {
146
752
    char *cp, *endp, *cp_copy;
147
752
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
148
752
    int i, err = 0;
149
752
    int32_t map_size = 0, map_count = 0;
150
151
752
    if (!hdr)
152
0
        return NULL;
153
154
752
    if (b->method != RAW) {
155
323
        if (cram_uncompress_block(b)) {
156
313
            free(hdr);
157
313
            return NULL;
158
313
        }
159
323
    }
160
161
439
    cp = (char *)b->data;
162
439
    endp = cp + b->uncomp_size;
163
164
439
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
165
439
        hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
166
439
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
167
0
            hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
168
0
            hdr->ref_seq_span  = fd->vv.varint_get64(&cp, endp, &err);
169
439
        } else {
170
439
            hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
171
439
            hdr->ref_seq_span  = fd->vv.varint_get32(&cp, endp, &err);
172
439
        }
173
439
        hdr->num_records   = fd->vv.varint_get32(&cp, endp, &err);
174
439
        hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
175
439
        if (hdr->num_landmarks < 0 ||
176
437
            hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
177
437
            endp - cp < hdr->num_landmarks) {
178
4
            free(hdr);
179
4
            return NULL;
180
4
        }
181
435
        if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
182
0
            free(hdr);
183
0
            return NULL;
184
0
        }
185
17.1k
        for (i = 0; i < hdr->num_landmarks; i++)
186
16.6k
            hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
187
435
    }
188
189
435
    hdr->preservation_map = kh_init(map);
190
191
435
    memset(hdr->rec_encoding_map, 0,
192
435
           CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
193
435
    memset(hdr->tag_encoding_map, 0,
194
435
           CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
195
196
435
    if (!hdr->preservation_map) {
197
0
        cram_free_compression_header(hdr);
198
0
        return NULL;
199
0
    }
200
201
    /* Initialise defaults for preservation map */
202
435
    hdr->read_names_included = 0;
203
435
    hdr->AP_delta = 1;
204
435
    hdr->qs_seq_orient = 1;
205
435
    memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
206
207
    /* Preservation map */
208
435
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
209
435
    map_count = fd->vv.varint_get32(&cp, endp, &err);
210
44.8k
    for (i = 0; i < map_count; i++) {
211
44.4k
        pmap_t hd;
212
44.4k
        khint_t k;
213
44.4k
        int r;
214
215
44.4k
        if (endp - cp < 3) {
216
62
            cram_free_compression_header(hdr);
217
62
            return NULL;
218
62
        }
219
44.4k
        cp += 2;
220
44.4k
        switch(CRAM_KEY(cp[-2],cp[-1])) {
221
9
        case CRAM_KEY('M','I'): // was mapped QS included in V1.0
222
78
        case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
223
111
        case CRAM_KEY('P','I'): // was unmapped placed in V1.0
224
111
            hd.i = *cp++;
225
111
            break;
226
227
60
        case CRAM_KEY('R','N'):
228
60
            hd.i = *cp++;
229
60
            k = kh_put(map, hdr->preservation_map, "RN", &r);
230
60
            if (-1 == r) {
231
0
                cram_free_compression_header(hdr);
232
0
                return NULL;
233
0
            }
234
235
60
            kh_val(hdr->preservation_map, k) = hd;
236
60
            hdr->read_names_included = hd.i;
237
60
            break;
238
239
129
        case CRAM_KEY('A','P'):
240
129
            hd.i = *cp++;
241
129
            k = kh_put(map, hdr->preservation_map, "AP", &r);
242
129
            if (-1 == r) {
243
0
                cram_free_compression_header(hdr);
244
0
                return NULL;
245
0
            }
246
247
129
            kh_val(hdr->preservation_map, k) = hd;
248
129
            hdr->AP_delta = hd.i;
249
129
            break;
250
251
1.56k
        case CRAM_KEY('R','R'):
252
1.56k
            hd.i = *cp++;
253
1.56k
            k = kh_put(map, hdr->preservation_map, "RR", &r);
254
1.56k
            if (-1 == r) {
255
0
                cram_free_compression_header(hdr);
256
0
                return NULL;
257
0
            }
258
259
1.56k
            kh_val(hdr->preservation_map, k) = hd;
260
1.56k
            hdr->no_ref = !hd.i;
261
1.56k
            break;
262
263
45
        case CRAM_KEY('Q','O'):
264
45
            hd.i = *cp++;
265
45
            k = kh_put(map, hdr->preservation_map, "QO", &r);
266
45
            if (-1 == r) {
267
0
                cram_free_compression_header(hdr);
268
0
                return NULL;
269
0
            }
270
271
45
            kh_val(hdr->preservation_map, k) = hd;
272
45
            hdr->qs_seq_orient = hd.i;
273
45
            break;
274
275
621
        case CRAM_KEY('S','M'):
276
621
            if (endp - cp < 5) {
277
0
                cram_free_compression_header(hdr);
278
0
                return NULL;
279
0
            }
280
621
            hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
281
621
            hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
282
621
            hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
283
621
            hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
284
285
621
            hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
286
621
            hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
287
621
            hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
288
621
            hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
289
290
621
            hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
291
621
            hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
292
621
            hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
293
621
            hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
294
295
621
            hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
296
621
            hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
297
621
            hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
298
621
            hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
299
300
621
            hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
301
621
            hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
302
621
            hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
303
621
            hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
304
305
621
            hd.p = cp;
306
621
            cp += 5;
307
308
621
            k = kh_put(map, hdr->preservation_map, "SM", &r);
309
621
            if (-1 == r) {
310
0
                cram_free_compression_header(hdr);
311
0
                return NULL;
312
0
            }
313
621
            kh_val(hdr->preservation_map, k) = hd;
314
621
            break;
315
316
308
        case CRAM_KEY('T','D'): {
317
308
            int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
318
308
            if (sz < 0) {
319
12
                cram_free_compression_header(hdr);
320
12
                return NULL;
321
12
            }
322
323
296
            hd.p = cp;
324
296
            cp += sz;
325
326
296
            k = kh_put(map, hdr->preservation_map, "TD", &r);
327
296
            if (-1 == r) {
328
0
                cram_free_compression_header(hdr);
329
0
                return NULL;
330
0
            }
331
296
            kh_val(hdr->preservation_map, k) = hd;
332
296
            break;
333
296
        }
334
335
41.5k
        default:
336
41.5k
            hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
337
            // guess byte;
338
41.5k
            cp++;
339
41.5k
            break;
340
44.4k
        }
341
44.4k
    }
342
361
    if (cp - cp_copy != map_size) {
343
15
        cram_free_compression_header(hdr);
344
15
        return NULL;
345
15
    }
346
347
    /* Record encoding map */
348
346
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
349
346
    map_count = fd->vv.varint_get32(&cp, endp, &err);
350
346
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
351
21.9k
    for (i = 0; i < map_count; i++) {
352
21.8k
        char *key = cp;
353
21.8k
        int32_t encoding = E_NULL;
354
21.8k
        int32_t size = 0;
355
21.8k
        ptrdiff_t offset;
356
21.8k
        cram_map *m;
357
21.8k
        enum cram_DS_ID ds_id;
358
21.8k
        enum cram_external_type type;
359
360
21.8k
        if (endp - cp < 4) {
361
6
            cram_free_compression_header(hdr);
362
6
            return NULL;
363
6
        }
364
365
21.8k
        cp += 2;
366
21.8k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
367
21.8k
        size     = fd->vv.varint_get32(&cp, endp, &err);
368
369
21.8k
        offset = cp - (char *)b->data;
370
371
21.8k
        if (encoding == E_NULL)
372
8.11k
            continue;
373
374
13.7k
        if (size < 0 || endp - cp < size) {
375
106
            cram_free_compression_header(hdr);
376
106
            return NULL;
377
106
        }
378
379
        //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
380
381
        /*
382
         * For CRAM1.0 CF and BF are Byte and not Int.
383
         * Practically speaking it makes no difference unless we have a
384
         * 1.0 format file that stores these in EXTERNAL as only then
385
         * does Byte vs Int matter.
386
         *
387
         * Neither this C code nor Java reference implementations did this,
388
         * so we gloss over it and treat them as int.
389
         */
390
13.6k
        ds_id = DS_CORE;
391
13.6k
        if (key[0] == 'B' && key[1] == 'F') {
392
36
            ds_id = DS_BF; type = E_INT;
393
13.5k
        } else if (key[0] == 'C' && key[1] == 'F') {
394
195
            ds_id = DS_CF; type = E_INT;
395
13.4k
        } else if (key[0] == 'R' && key[1] == 'I') {
396
119
            ds_id = DS_RI; type = E_INT;
397
13.2k
        } else if (key[0] == 'R' && key[1] == 'L') {
398
15
            ds_id = DS_RL; type = E_INT;
399
13.2k
        } else if (key[0] == 'A' && key[1] == 'P') {
400
231
            ds_id = DS_AP;
401
231
            type = is_v4 ? E_SLONG : E_INT;
402
13.0k
        } else if (key[0] == 'R' && key[1] == 'G') {
403
0
            ds_id = DS_RG;
404
0
            type = E_INT;
405
13.0k
        } else if (key[0] == 'M' && key[1] == 'F') {
406
38
            ds_id = DS_MF; type = E_INT;
407
12.9k
        } else if (key[0] == 'N' && key[1] == 'S') {
408
84
            ds_id = DS_NS; type = E_INT;
409
12.9k
        } else if (key[0] == 'N' && key[1] == 'P') {
410
366
            ds_id = DS_NP;
411
366
            type = is_v4 ? E_LONG : E_INT;
412
12.5k
        } else if (key[0] == 'T' && key[1] == 'S') {
413
36
            ds_id = DS_TS;
414
36
            type = is_v4 ? E_SLONG : E_INT;
415
12.5k
        } else if (key[0] == 'N' && key[1] == 'F') {
416
120
            ds_id = DS_NF; type = E_INT;
417
12.3k
        } else if (key[0] == 'T' && key[1] == 'C') {
418
63
            ds_id = DS_TC; type = E_BYTE;
419
12.3k
        } else if (key[0] == 'T' && key[1] == 'N') {
420
0
            ds_id = DS_TN; type = E_INT;
421
12.3k
        } else if (key[0] == 'F' && key[1] == 'N') {
422
0
            ds_id = DS_FN; type = E_INT;
423
12.3k
        } else if (key[0] == 'F' && key[1] == 'C') {
424
207
            ds_id = DS_FC; type = E_BYTE;
425
12.1k
        } else if (key[0] == 'F' && key[1] == 'P') {
426
0
            ds_id = DS_FP; type = E_INT;
427
12.1k
        } else if (key[0] == 'B' && key[1] == 'S') {
428
0
            ds_id = DS_BS; type = E_BYTE;
429
12.1k
        } else if (key[0] == 'I' && key[1] == 'N') {
430
12
            ds_id = DS_IN; type = E_BYTE_ARRAY;
431
12.1k
        } else if (key[0] == 'S' && key[1] == 'C') {
432
0
            ds_id = DS_SC; type = E_BYTE_ARRAY;
433
12.1k
        } else if (key[0] == 'D' && key[1] == 'L') {
434
63
            ds_id = DS_DL; type = E_INT;
435
12.0k
        } else if (key[0] == 'B' && key[1] == 'A') {
436
27
            ds_id = DS_BA; type = E_BYTE;
437
12.0k
        } else if (key[0] == 'B' && key[1] == 'B') {
438
18
            ds_id = DS_BB; type = E_BYTE_ARRAY;
439
12.0k
        } else if (key[0] == 'R' && key[1] == 'S') {
440
117
            ds_id = DS_RS; type = E_INT;
441
11.8k
        } else if (key[0] == 'P' && key[1] == 'D') {
442
0
            ds_id = DS_PD; type = E_INT;
443
11.8k
        } else if (key[0] == 'H' && key[1] == 'C') {
444
0
            ds_id = DS_HC; type = E_INT;
445
11.8k
        } else if (key[0] == 'M' && key[1] == 'Q') {
446
1.41k
            ds_id = DS_MQ; type = E_INT;
447
10.4k
        } else if (key[0] == 'R' && key[1] == 'N') {
448
18
            ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
449
10.4k
        } else if (key[0] == 'Q' && key[1] == 'S') {
450
132
            ds_id = DS_QS; type = E_BYTE;
451
10.3k
        } else if (key[0] == 'Q' && key[1] == 'Q') {
452
0
            ds_id = DS_QQ; type = E_BYTE_ARRAY;
453
10.3k
        } else if (key[0] == 'T' && key[1] == 'L') {
454
0
            ds_id = DS_TL; type = E_INT;
455
10.3k
        } else if (key[0] == 'T' && key[1] == 'M') {
456
10.3k
        } else if (key[0] == 'T' && key[1] == 'V') {
457
10.3k
        } else {
458
10.3k
            hts_log_warning("Unrecognised key: %.2s", key);
459
10.3k
        }
460
461
13.6k
        if (ds_id != DS_CORE) {
462
3.31k
            if (hdr->codecs[ds_id] != NULL) {
463
2.88k
                hts_log_warning("Codec for key %.2s defined more than once",
464
2.88k
                                key);
465
2.88k
                hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
466
2.88k
            }
467
3.31k
            hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
468
3.31k
                                                   type, fd->version, &fd->vv);
469
3.31k
            if (!hdr->codecs[ds_id]) {
470
106
                cram_free_compression_header(hdr);
471
106
                return NULL;
472
106
            }
473
3.31k
        }
474
475
13.5k
        cp += size;
476
477
        // Fill out cram_map purely for cram_dump to dump out.
478
13.5k
        m = malloc(sizeof(*m));
479
13.5k
        if (!m) {
480
0
            cram_free_compression_header(hdr);
481
0
            return NULL;
482
0
        }
483
13.5k
        m->key = CRAM_KEY(key[0], key[1]);
484
13.5k
        m->encoding = encoding;
485
13.5k
        m->size     = size;
486
13.5k
        m->offset   = offset;
487
13.5k
        m->codec = NULL;
488
489
13.5k
        m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
490
13.5k
        hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
491
13.5k
    }
492
128
    if (cp - cp_copy != map_size) {
493
2
        cram_free_compression_header(hdr);
494
2
        return NULL;
495
2
    }
496
497
    /* Tag encoding map */
498
126
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
499
126
    map_count = fd->vv.varint_get32(&cp, endp, &err);
500
1.05k
    for (i = 0; i < map_count; i++) {
501
970
        int32_t encoding = E_NULL;
502
970
        int32_t size = 0;
503
970
        cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
504
970
        uint8_t key[3];
505
506
970
        if (!m || endp - cp < 6) {
507
0
            free(m);
508
0
            cram_free_compression_header(hdr);
509
0
            return NULL;
510
0
        }
511
512
970
        m->key = fd->vv.varint_get32(&cp, endp, &err);
513
970
        key[0] = m->key>>16;
514
970
        key[1] = m->key>>8;
515
970
        key[2] = m->key;
516
970
        encoding = fd->vv.varint_get32(&cp, endp, &err);
517
970
        size     = fd->vv.varint_get32(&cp, endp, &err);
518
519
970
        m->encoding = encoding;
520
970
        m->size     = size;
521
970
        m->offset   = cp - (char *)b->data;
522
970
        if (size < 0 || endp - cp < size ||
523
964
            !(m->codec = cram_decoder_init(hdr, encoding, cp, size,
524
964
                                           E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
525
37
            cram_free_compression_header(hdr);
526
37
            free(m);
527
37
            return NULL;
528
37
        }
529
530
933
        cp += size;
531
532
933
        m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
533
933
        hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
534
933
    }
535
89
    if (err || cp - cp_copy != map_size) {
536
2
        cram_free_compression_header(hdr);
537
2
        return NULL;
538
2
    }
539
540
87
    return hdr;
541
89
}
542
543
/*
544
 * Note we also need to scan through the record encoding map to
545
 * see which data series share the same block, either external or
546
 * CORE. For example if we need the BF data series but MQ and CF
547
 * are also encoded in the same block then we need to add those in
548
 * as a dependency in order to correctly decode BF.
549
 *
550
 * Returns 0 on success
551
 *        -1 on failure
552
 */
553
int cram_dependent_data_series(cram_fd *fd,
554
                               cram_block_compression_hdr *hdr,
555
603
                               cram_slice *s) {
556
603
    int *block_used;
557
603
    int core_used = 0;
558
603
    int i;
559
603
    static int i_to_id[] = {
560
603
        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
561
603
        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
562
603
        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
563
603
        DS_HC, DS_SC, DS_BB, DS_QQ,
564
603
    };
565
603
    uint32_t orig_ds;
566
567
    /*
568
     * Set the data_series bit field based on fd->required_fields
569
     * contents.
570
     */
571
603
    if (fd->required_fields && fd->required_fields != INT_MAX) {
572
0
        s->data_series = 0;
573
574
0
        if (fd->required_fields & SAM_QNAME)
575
0
            s->data_series |= CRAM_RN;
576
577
0
        if (fd->required_fields & SAM_FLAG)
578
0
            s->data_series |= CRAM_BF;
579
580
0
        if (fd->required_fields & SAM_RNAME)
581
0
            s->data_series |= CRAM_RI | CRAM_BF;
582
583
0
        if (fd->required_fields & SAM_POS)
584
0
            s->data_series |= CRAM_AP | CRAM_BF;
585
586
0
        if (fd->required_fields & SAM_MAPQ)
587
0
            s->data_series |= CRAM_MQ;
588
589
0
        if (fd->required_fields & SAM_CIGAR)
590
0
            s->data_series |= CRAM_CIGAR;
591
592
0
        if (fd->required_fields & SAM_RNEXT)
593
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
594
595
0
        if (fd->required_fields & SAM_PNEXT)
596
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
597
598
0
        if (fd->required_fields & SAM_TLEN)
599
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
600
0
                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
601
602
0
        if (fd->required_fields & SAM_SEQ)
603
0
            s->data_series |= CRAM_SEQ;
604
605
0
        if (!(fd->required_fields & SAM_AUX))
606
            // No easy way to get MD/NM without other tags at present
607
0
            s->decode_md = 0;
608
609
0
        if (fd->required_fields & SAM_QUAL)
610
0
            s->data_series |= CRAM_QUAL;
611
612
0
        if (fd->required_fields & SAM_AUX)
613
0
            s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
614
615
0
        if (fd->required_fields & SAM_RGAUX)
616
0
            s->data_series |= CRAM_RG | CRAM_BF;
617
618
        // Always uncompress CORE block
619
0
        if (cram_uncompress_block(s->block[0]))
620
0
            return -1;
621
603
    } else {
622
603
        s->data_series = CRAM_ALL;
623
624
3.20k
        for (i = 0; i < s->hdr->num_blocks; i++) {
625
2.59k
            if (cram_uncompress_block(s->block[i]))
626
0
                return -1;
627
2.59k
        }
628
629
603
        return 0;
630
603
    }
631
632
0
    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
633
0
    if (!block_used)
634
0
        return -1;
635
636
0
    do {
637
        /*
638
         * Also set data_series based on code prerequisites. Eg if we need
639
         * CRAM_QS then we also need to know CRAM_RL so we know how long it
640
         * is, or if we need FC/FP then we also need FN (number of features).
641
         *
642
         * It's not reciprocal though. We may be needing to decode FN
643
         * but have no need to decode FC, FP and cigar ops.
644
         */
645
0
        if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
646
0
        if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
647
0
        if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
648
0
        if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
649
0
        if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
650
0
        if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
651
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
652
0
        if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
653
0
        if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
654
0
        if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
655
0
        if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
656
657
        // cram_decode_seq() needs seq[] array
658
0
        if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
659
660
0
        if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
661
0
        if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
662
0
        if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
663
0
        if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
664
0
        if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
665
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
666
0
        if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
667
0
            s->data_series |= CRAM_CF;
668
0
        if (!hdr->read_names_included && s->data_series & CRAM_RN)
669
0
            s->data_series |= CRAM_CF | CRAM_NF;
670
0
        if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
671
0
            s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
672
0
        if (s->data_series & CRAM_FN) {
673
            // The CRAM_FN loop checks for reference length boundaries,
674
            // which needs a working seq_pos.  Some fields are fixed size
675
            // irrespective of if we decode (BS), but others need to know
676
            // the size of the string fetched back (SC, IN, BB).
677
0
            s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
678
0
        }
679
680
0
        orig_ds = s->data_series;
681
682
        // Find which blocks are in use.
683
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
684
0
            int bnum1, bnum2, j;
685
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
686
687
0
            if (!(s->data_series & (1<<i)))
688
0
                continue;
689
690
0
            if (!c)
691
0
                continue;
692
693
0
            bnum1 = cram_codec_to_id(c, &bnum2);
694
695
0
            for (;;) {
696
0
                switch (bnum1) {
697
0
                case -2:
698
0
                    break;
699
700
0
                case -1:
701
0
                    core_used = 1;
702
0
                    break;
703
704
0
                default:
705
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
706
0
                        if (s->block[j]->content_type == EXTERNAL &&
707
0
                            s->block[j]->content_id == bnum1) {
708
0
                            block_used[j] = 1;
709
0
                            if (cram_uncompress_block(s->block[j])) {
710
0
                                free(block_used);
711
0
                                return -1;
712
0
                            }
713
0
                        }
714
0
                    }
715
0
                    break;
716
0
                }
717
718
0
                if (bnum2 == -2 || bnum1 == bnum2)
719
0
                    break;
720
721
0
                bnum1 = bnum2; // 2nd pass
722
0
            }
723
0
        }
724
725
        // Tags too
726
0
        if ((fd->required_fields & SAM_AUX) ||
727
0
            (s->data_series & CRAM_aux)) {
728
0
            for (i = 0; i < CRAM_MAP_HASH; i++) {
729
0
                int bnum1, bnum2, j;
730
0
                cram_map *m = hdr->tag_encoding_map[i];
731
732
0
                while (m) {
733
0
                    cram_codec *c = m->codec;
734
0
                    if (!c) {
735
0
                        m = m->next;
736
0
                        continue;
737
0
                    }
738
739
0
                    bnum1 = cram_codec_to_id(c, &bnum2);
740
741
0
                    for (;;) {
742
0
                        switch (bnum1) {
743
0
                        case -2:
744
0
                            break;
745
746
0
                        case -1:
747
0
                            core_used = 1;
748
0
                            break;
749
750
0
                        default:
751
0
                            for (j = 0; j < s->hdr->num_blocks; j++) {
752
0
                                if (s->block[j]->content_type == EXTERNAL &&
753
0
                                    s->block[j]->content_id == bnum1) {
754
0
                                    block_used[j] = 1;
755
0
                                    if (cram_uncompress_block(s->block[j])) {
756
0
                                        free(block_used);
757
0
                                        return -1;
758
0
                                    }
759
0
                                }
760
0
                            }
761
0
                            break;
762
0
                        }
763
764
0
                        if (bnum2 == -2 || bnum1 == bnum2)
765
0
                            break;
766
767
0
                        bnum1 = bnum2; // 2nd pass
768
0
                    }
769
770
0
                    m = m->next;
771
0
                }
772
0
            }
773
0
        }
774
775
        // We now know which blocks are in used, so repeat and find
776
        // which other data series need to be added.
777
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
778
0
            int bnum1, bnum2, j;
779
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
780
781
0
            if (!c)
782
0
                continue;
783
784
0
            bnum1 = cram_codec_to_id(c, &bnum2);
785
786
0
            for (;;) {
787
0
                switch (bnum1) {
788
0
                case -2:
789
0
                    break;
790
791
0
                case -1:
792
0
                    if (core_used) {
793
                        //printf(" + data series %08x:\n", 1<<i);
794
0
                        s->data_series |= 1<<i;
795
0
                    }
796
0
                    break;
797
798
0
                default:
799
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
800
0
                        if (s->block[j]->content_type == EXTERNAL &&
801
0
                            s->block[j]->content_id == bnum1) {
802
0
                            if (block_used[j]) {
803
                                //printf(" + data series %08x:\n", 1<<i);
804
0
                                s->data_series |= 1<<i;
805
0
                            }
806
0
                        }
807
0
                    }
808
0
                    break;
809
0
                }
810
811
0
                if (bnum2 == -2 || bnum1 == bnum2)
812
0
                    break;
813
814
0
                bnum1 = bnum2; // 2nd pass
815
0
            }
816
0
        }
817
818
        // Tags too
819
0
        for (i = 0; i < CRAM_MAP_HASH; i++) {
820
0
            int bnum1, bnum2, j;
821
0
            cram_map *m = hdr->tag_encoding_map[i];
822
823
0
            while (m) {
824
0
                cram_codec *c = m->codec;
825
0
                if (!c) {
826
0
                    m = m->next;
827
0
                    continue;
828
0
                }
829
830
0
                bnum1 = cram_codec_to_id(c, &bnum2);
831
832
0
                for (;;) {
833
0
                    switch (bnum1) {
834
0
                    case -2:
835
0
                        break;
836
837
0
                    case -1:
838
                        //printf(" + data series %08x:\n", CRAM_aux);
839
0
                        s->data_series |= CRAM_aux;
840
0
                        break;
841
842
0
                    default:
843
0
                        for (j = 0; j < s->hdr->num_blocks; j++) {
844
0
                            if (s->block[j]->content_type == EXTERNAL &&
845
0
                                s->block[j]->content_id == bnum1) {
846
0
                                if (block_used[j]) {
847
                                    //printf(" + data series %08x:\n",
848
                                    //       CRAM_aux);
849
0
                                    s->data_series |= CRAM_aux;
850
0
                                }
851
0
                            }
852
0
                        }
853
0
                        break;
854
0
                    }
855
856
0
                    if (bnum2 == -2 || bnum1 == bnum2)
857
0
                        break;
858
859
0
                    bnum1 = bnum2; // 2nd pass
860
0
                }
861
862
0
                m = m->next;
863
0
            }
864
0
        }
865
0
    } while (orig_ds != s->data_series);
866
867
0
    free(block_used);
868
0
    return 0;
869
0
}
870
871
/*
872
 * Checks whether an external block is used solely by a single data series.
873
 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
874
 *         or 0 if not (E_NULL).
875
 */
876
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
877
0
                          int id) {
878
0
    int i, n_id = 0;
879
0
    enum cram_encoding e_type = 0;
880
881
0
    for (i = 0; i < DS_END; i++) {
882
0
        cram_codec *c;
883
0
        int bnum1, bnum2, old_n_id;
884
885
0
        if (!(c = hdr->codecs[i]))
886
0
            continue;
887
888
0
        bnum1 = cram_codec_to_id(c, &bnum2);
889
890
0
        old_n_id = n_id;
891
0
        if (bnum1 == id) {
892
0
            n_id++;
893
0
            e_type = c->codec;
894
0
        }
895
0
        if (bnum2 == id) {
896
0
            n_id++;
897
0
            e_type = c->codec;
898
0
        }
899
900
0
        if (n_id == old_n_id+2)
901
0
            n_id--; // len/val in same place counts once only.
902
0
    }
903
904
0
    return n_id == 1 ? e_type : 0;
905
0
}
906
907
/*
908
 * Attempts to estimate the size of some blocks so we can preallocate them
909
 * before decoding.  Although decoding will automatically grow the blocks,
910
 * it is typically more efficient to preallocate.
911
 */
912
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
913
                                int *qual_size, int *name_size,
914
0
                                int *q_id) {
915
0
    int bnum1, bnum2;
916
0
    cram_codec *cd;
917
918
0
    *qual_size = 0;
919
0
    *name_size = 0;
920
921
    /* Qual */
922
0
    cd = hdr->codecs[DS_QS];
923
0
    if (cd == NULL) return;
924
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
925
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
926
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
927
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
928
0
        if (b) *qual_size = b->uncomp_size;
929
0
        if (q_id && cd->codec == E_EXTERNAL)
930
0
            *q_id = bnum1;
931
0
    }
932
933
    /* Name */
934
0
    cd = hdr->codecs[DS_RN];
935
0
    if (cd == NULL) return;
936
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
937
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
938
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
939
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
940
0
        if (b) *name_size = b->uncomp_size;
941
0
    }
942
0
}
943
944
945
/* ----------------------------------------------------------------------
946
 * CRAM slices
947
 */
948
949
/*
950
 * Decodes a CRAM (un)mapped slice header block.
951
 * Returns slice header ptr on success
952
 *         NULL on failure
953
 */
954
615
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
955
615
    cram_block_slice_hdr *hdr;
956
615
    unsigned char *cp;
957
615
    unsigned char *cp_end;
958
615
    int i, err = 0;
959
960
615
    if (b->method != RAW) {
961
        /* Spec. says slice header should be RAW, but we can future-proof
962
           by trying to decode it if it isn't. */
963
0
        if (cram_uncompress_block(b) < 0)
964
0
            return NULL;
965
0
    }
966
615
    cp =  (unsigned char *)BLOCK_DATA(b);
967
615
    cp_end = cp + b->uncomp_size;
968
969
615
    if (b->content_type != MAPPED_SLICE &&
970
0
        b->content_type != UNMAPPED_SLICE)
971
0
        return NULL;
972
973
615
    if (!(hdr  = calloc(1, sizeof(*hdr))))
974
0
        return NULL;
975
976
615
    hdr->content_type = b->content_type;
977
978
615
    if (b->content_type == MAPPED_SLICE) {
979
615
        hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
980
615
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
981
0
            hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
982
0
            hdr->ref_seq_span  = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
983
615
        } else {
984
615
            hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
985
615
            hdr->ref_seq_span  = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
986
615
        }
987
615
        if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
988
3
            free(hdr);
989
3
            hts_log_error("Negative values not permitted for header "
990
3
                          "sequence start or span fields");
991
3
            return NULL;
992
3
        }
993
615
    }
994
612
    hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
995
612
    hdr->record_counter = 0;
996
612
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
997
0
        hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
998
612
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
999
0
        hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
1000
0
    }
1001
612
    hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1002
612
    hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1003
612
    if (hdr->num_content_ids < 1 ||
1004
612
        hdr->num_content_ids >= 10000) {
1005
        // Slice must have at least one data block, and there is no need
1006
        // for more than 2 per possible aux-tag plus ancillary.
1007
0
        free(hdr);
1008
0
        return NULL;
1009
0
    }
1010
612
    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1011
612
    if (!hdr->block_content_ids) {
1012
0
        free(hdr);
1013
0
        return NULL;
1014
0
    }
1015
1016
12.2k
    for (i = 0; i < hdr->num_content_ids; i++)
1017
11.6k
        hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
1018
11.6k
                                                         (char *)cp_end,
1019
11.6k
                                                         &err);
1020
612
    if (err) {
1021
0
        free(hdr->block_content_ids);
1022
0
        free(hdr);
1023
0
        return NULL;
1024
0
    }
1025
1026
612
    if (b->content_type == MAPPED_SLICE)
1027
612
        hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
1028
1029
612
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
1030
0
        if (cp_end - cp < 16) {
1031
0
            free(hdr->block_content_ids);
1032
0
            free(hdr);
1033
0
            return NULL;
1034
0
        }
1035
0
        memcpy(hdr->md5, cp, 16);
1036
612
    } else {
1037
612
        memset(hdr->md5, 0, 16);
1038
612
    }
1039
1040
612
    if (!err)
1041
612
        return hdr;
1042
1043
0
    free(hdr->block_content_ids);
1044
0
    free(hdr);
1045
0
    return NULL;
1046
612
}
1047
1048
1049
#if 0
1050
/* Returns the number of bits set in val; it the highest bit used */
1051
static int nbits(int v) {
1052
    static const int MultiplyDeBruijnBitPosition[32] = {
1053
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1054
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1055
    };
1056
1057
    v |= v >> 1; // first up to set all bits 1 after the first 1 */
1058
    v |= v >> 2;
1059
    v |= v >> 4;
1060
    v |= v >> 8;
1061
    v |= v >> 16;
1062
1063
    // DeBruijn magic to find top bit
1064
    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1065
}
1066
#endif
1067
1068
#if 0
1069
static int sort_freqs(const void *vp1, const void *vp2) {
1070
    const int i1 = *(const int *)vp1;
1071
    const int i2 = *(const int *)vp2;
1072
    return i1-i2;
1073
}
1074
#endif
1075
1076
/* ----------------------------------------------------------------------
1077
 * Primary CRAM sequence decoder
1078
 */
1079
1080
0
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1081
0
    if (decode_md) {
1082
0
        BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1083
0
        BLOCK_APPEND_CHAR(s->aux_blk, c);
1084
0
        *md_dist = 0;
1085
0
    }
1086
0
    return 0;
1087
1088
0
 block_err:
1089
0
    return -1;
1090
0
}
1091
1092
/*
1093
 * Internal part of cram_decode_slice().
1094
 * Generates the sequence, quality and cigar components.
1095
 */
1096
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1097
                           cram_block *blk, cram_record *cr, sam_hdr_t *sh,
1098
                           int cf, char *seq, char *qual,
1099
0
                           int has_MD, int has_NM) {
1100
0
    int prev_pos = 0, f, r = 0, out_sz = 1;
1101
0
    int seq_pos = 1;
1102
0
    int cig_len = 0;
1103
0
    int64_t ref_pos = cr->apos;
1104
0
    int32_t fn, i32;
1105
0
    enum cigar_op cig_op = BAM_CMATCH;
1106
0
    uint32_t *cigar = s->cigar;
1107
0
    uint32_t ncigar = s->ncigar;
1108
0
    uint32_t cigar_alloc = s->cigar_alloc;
1109
0
    uint32_t nm = 0;
1110
0
    int32_t md_dist = 0;
1111
0
    int orig_aux = 0;
1112
    // CRAM <  4.0 decode_md is off/on
1113
    // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
1114
0
    int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
1115
0
        ? (s->decode_md > 0)
1116
0
        : (s->decode_md != 0);
1117
0
    int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
1118
0
    int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
1119
0
    uint32_t ds = s->data_series;
1120
0
    sam_hrecs_t *bfd = sh->hrecs;
1121
1122
0
    cram_codec **codecs = c->comp_hdr->codecs;
1123
1124
0
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1125
0
        memset(qual, 255, cr->len);
1126
0
    }
1127
1128
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1129
0
        decode_md = decode_nm = 0;
1130
1131
0
    if (decode_md) {
1132
0
        orig_aux = BLOCK_SIZE(s->aux_blk);
1133
0
        if (has_MD == 0)
1134
0
            BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1135
0
    }
1136
1137
0
    if (ds & CRAM_FN) {
1138
0
        if (!codecs[DS_FN]) return -1;
1139
0
        r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
1140
0
                                   blk, (char *)&fn, &out_sz);
1141
0
        if (r) return r;
1142
0
    } else {
1143
0
        fn = 0;
1144
0
    }
1145
1146
0
    ref_pos--; // count from 0
1147
0
    cr->cigar = ncigar;
1148
1149
0
    if (!(ds & (CRAM_FC | CRAM_FP)))
1150
0
        goto skip_cigar;
1151
1152
0
    if (fn) {
1153
0
        if ((ds & CRAM_FC) && !codecs[DS_FC])
1154
0
            return -1;
1155
0
        if ((ds & CRAM_FP) && !codecs[DS_FP])
1156
0
            return -1;
1157
0
    }
1158
1159
0
    for (f = 0; f < fn; f++) {
1160
0
        int32_t pos = 0;
1161
0
        char op;
1162
1163
0
        if (ncigar+2 >= cigar_alloc) {
1164
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1165
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1166
0
                return -1;
1167
0
            s->cigar = cigar;
1168
0
        }
1169
1170
0
        if (ds & CRAM_FC) {
1171
0
            r |= codecs[DS_FC]->decode(s,
1172
0
                                       codecs[DS_FC],
1173
0
                                       blk,
1174
0
                                       &op,  &out_sz);
1175
0
            if (r) return r;
1176
0
        }
1177
1178
0
        if (!(ds & CRAM_FP))
1179
0
            continue;
1180
1181
0
        r |= codecs[DS_FP]->decode(s,
1182
0
                                   codecs[DS_FP],
1183
0
                                   blk,
1184
0
                                   (char *)&pos, &out_sz);
1185
0
        if (r) return r;
1186
0
        pos += prev_pos;
1187
1188
0
        if (pos <= 0) {
1189
0
            hts_log_error("Feature position %d before start of read", pos);
1190
0
            return -1;
1191
0
        }
1192
1193
0
        if (pos > seq_pos) {
1194
0
            if (pos > cr->len+1)
1195
0
                return -1;
1196
1197
0
            if (s->ref && cr->ref_id >= 0) {
1198
0
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1199
0
                    static int whinged = 0;
1200
0
                    int rlen;
1201
0
                    if (!whinged)
1202
0
                        hts_log_warning("Ref pos outside of ref sequence boundary");
1203
0
                    whinged = 1;
1204
0
                    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1205
                    // May miss MD/NM cases where both seq/ref are N, but this is a
1206
                    // malformed cram file anyway.
1207
0
                    if (rlen > 0) {
1208
0
                        if (ref_pos + rlen > s->ref_end)
1209
0
                            goto beyond_slice;
1210
1211
0
                        memcpy(&seq[seq_pos-1],
1212
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1213
0
                        if ((pos - seq_pos) - rlen > 0)
1214
0
                            memset(&seq[seq_pos-1+rlen], 'N',
1215
0
                                   (pos - seq_pos) - rlen);
1216
0
                    } else {
1217
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1218
0
                    }
1219
0
                    if (md_dist >= 0)
1220
0
                        md_dist += pos - seq_pos;
1221
0
                } else {
1222
                    // 'N' in both ref and seq is also mismatch for NM/MD
1223
0
                    if (ref_pos + pos-seq_pos > s->ref_end)
1224
0
                        goto beyond_slice;
1225
1226
0
                    const char *refp = s->ref + ref_pos - s->ref_start + 1;
1227
0
                    const int frag_len = pos - seq_pos;
1228
0
                    int do_cpy = 1;
1229
0
                    if (decode_md || decode_nm) {
1230
0
                        char *N = memchr(refp, 'N', frag_len);
1231
0
                        if (N) {
1232
0
                            int i;
1233
0
                            for (i = 0; i < frag_len; i++) {
1234
0
                                char base = refp[i];
1235
0
                                if (base == 'N') {
1236
0
                                    if (add_md_char(s, decode_md,
1237
0
                                                    'N', &md_dist) < 0)
1238
0
                                        return -1;
1239
0
                                    nm++;
1240
0
                                } else {
1241
0
                                    md_dist++;
1242
0
                                }
1243
0
                                seq[seq_pos-1+i] = base;
1244
0
                            }
1245
0
                            do_cpy = 0;
1246
0
                        } else {
1247
0
                            md_dist += frag_len;
1248
0
                        }
1249
0
                    }
1250
0
                    if (do_cpy)
1251
0
                        memcpy(&seq[seq_pos-1], refp, frag_len);
1252
0
                }
1253
0
            }
1254
#ifdef USE_X
1255
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
1256
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1257
                cig_len = 0;
1258
            }
1259
            cig_op = BAM_CBASE_MATCH;
1260
#else
1261
0
            if (cig_len && cig_op != BAM_CMATCH) {
1262
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1263
0
                cig_len = 0;
1264
0
            }
1265
0
            cig_op = BAM_CMATCH;
1266
0
#endif
1267
0
            cig_len += pos - seq_pos;
1268
0
            ref_pos += pos - seq_pos;
1269
0
            seq_pos = pos;
1270
0
        }
1271
1272
0
        prev_pos = pos;
1273
1274
0
        if (!(ds & CRAM_FC))
1275
0
            goto skip_cigar;
1276
1277
0
        switch(op) {
1278
0
        case 'S': { // soft clip: IN
1279
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1280
0
            int have_sc = 0;
1281
1282
0
            if (cig_len) {
1283
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1284
0
                cig_len = 0;
1285
0
            }
1286
0
            switch (CRAM_MAJOR_VERS(fd->version)) {
1287
0
            case 1:
1288
0
                if (ds & CRAM_IN) {
1289
0
                    r |= codecs[DS_IN]
1290
0
                        ? codecs[DS_IN]->decode(s, codecs[DS_IN],
1291
0
                                                blk,
1292
0
                                                cr->len ? &seq[pos-1] : NULL,
1293
0
                                                &out_sz2)
1294
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1295
0
                    have_sc = 1;
1296
0
                }
1297
0
                break;
1298
0
            case 2:
1299
0
            default:
1300
0
                if (ds & CRAM_SC) {
1301
0
                    r |= codecs[DS_SC]
1302
0
                        ? codecs[DS_SC]->decode(s, codecs[DS_SC],
1303
0
                                                blk,
1304
0
                                                cr->len ? &seq[pos-1] : NULL,
1305
0
                                                &out_sz2)
1306
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1307
0
                    have_sc = 1;
1308
0
                }
1309
0
                break;
1310
1311
                //default:
1312
                //    r |= codecs[DS_BB]
1313
                //        ? codecs[DS_BB]->decode(s, codecs[DS_BB],
1314
                //                                blk, &seq[pos-1], &out_sz2)
1315
                //        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1316
0
            }
1317
0
            if (have_sc) {
1318
0
                if (r) return r;
1319
0
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1320
0
                cig_op = BAM_CSOFT_CLIP;
1321
0
                seq_pos += out_sz2;
1322
0
            }
1323
0
            break;
1324
0
        }
1325
1326
0
        case 'X': { // Substitution; BS
1327
0
            unsigned char base;
1328
#ifdef USE_X
1329
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1330
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1331
                cig_len = 0;
1332
            }
1333
            if (ds & CRAM_BS) {
1334
                if (!codecs[DS_BS]) return -1;
1335
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1336
                                           (char *)&base, &out_sz);
1337
                if (pos-1 < cr->len)
1338
                    seq[pos-1] = 'N'; // FIXME look up BS=base value
1339
            }
1340
            cig_op = BAM_CBASE_MISMATCH;
1341
#else
1342
0
            int ref_base;
1343
0
            if (cig_len && cig_op != BAM_CMATCH) {
1344
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1345
0
                cig_len = 0;
1346
0
            }
1347
0
            if (ds & CRAM_BS) {
1348
0
                if (!codecs[DS_BS]) return -1;
1349
0
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1350
0
                                           (char *)&base, &out_sz);
1351
0
                if (r) return -1;
1352
0
                if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1353
0
                    if (pos-1 < cr->len)
1354
0
                        seq[pos-1] = c->comp_hdr->
1355
0
                            substitution_matrix[fd->L1['N']][base];
1356
0
                    if (decode_md || decode_nm) {
1357
0
                        if (md_dist >= 0 && decode_md)
1358
0
                            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1359
0
                        md_dist = -1;
1360
0
                        nm--;
1361
0
                    }
1362
0
                } else {
1363
0
                    unsigned char ref_call = ref_pos < s->ref_end
1364
0
                        ? (uc)s->ref[ref_pos - s->ref_start +1]
1365
0
                        : 'N';
1366
0
                    ref_base = fd->L1[ref_call];
1367
0
                    if (pos-1 < cr->len)
1368
0
                        seq[pos-1] = c->comp_hdr->
1369
0
                            substitution_matrix[ref_base][base];
1370
0
                    if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
1371
0
                        return -1;
1372
0
                }
1373
0
            }
1374
0
            cig_op = BAM_CMATCH;
1375
0
#endif
1376
0
            nm++;
1377
0
            cig_len++;
1378
0
            seq_pos++;
1379
0
            ref_pos++;
1380
0
            break;
1381
0
        }
1382
1383
0
        case 'D': { // Deletion; DL
1384
0
            if (cig_len && cig_op != BAM_CDEL) {
1385
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1386
0
                cig_len = 0;
1387
0
            }
1388
0
            if (ds & CRAM_DL) {
1389
0
                if (!codecs[DS_DL]) return -1;
1390
0
                r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
1391
0
                                           (char *)&i32, &out_sz);
1392
0
                if (r) return r;
1393
0
                if (decode_md || decode_nm) {
1394
0
                    if (ref_pos + i32 > s->ref_end)
1395
0
                        goto beyond_slice;
1396
0
                    if (md_dist >= 0 && decode_md)
1397
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1398
0
                    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1399
0
                        if (decode_md) {
1400
0
                            BLOCK_APPEND_CHAR(s->aux_blk, '^');
1401
0
                            BLOCK_APPEND(s->aux_blk,
1402
0
                                         &s->ref[ref_pos - s->ref_start +1],
1403
0
                                         i32);
1404
0
                            md_dist = 0;
1405
0
                        }
1406
0
                        nm += i32;
1407
0
                    } else {
1408
0
                        uint32_t dlen;
1409
0
                        if (bfd->ref[cr->ref_id].len >= ref_pos) {
1410
0
                            if (decode_md) {
1411
0
                                BLOCK_APPEND_CHAR(s->aux_blk, '^');
1412
0
                                BLOCK_APPEND(s->aux_blk,
1413
0
                                             &s->ref[ref_pos - s->ref_start+1],
1414
0
                                             bfd->ref[cr->ref_id].len-ref_pos);
1415
0
                                BLOCK_APPEND_UINT(s->aux_blk, 0);
1416
0
                            }
1417
0
                            dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1418
0
                            nm += i32 - dlen;
1419
0
                        } else {
1420
0
                            dlen = i32;
1421
0
                        }
1422
1423
0
                        md_dist = -1;
1424
0
                    }
1425
0
                }
1426
0
                cig_op = BAM_CDEL;
1427
0
                cig_len += i32;
1428
0
                ref_pos += i32;
1429
                //printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1430
0
            }
1431
0
            break;
1432
0
        }
1433
1434
0
        case 'I': { // Insertion (several bases); IN
1435
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1436
1437
0
            if (cig_len && cig_op != BAM_CINS) {
1438
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1439
0
                cig_len = 0;
1440
0
            }
1441
1442
0
            if (ds & CRAM_IN) {
1443
0
                if (!codecs[DS_IN]) return -1;
1444
0
                r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
1445
0
                                           cr->len ? &seq[pos-1] : NULL,
1446
0
                                           &out_sz2);
1447
0
                if (r) return r;
1448
0
                cig_op = BAM_CINS;
1449
0
                cig_len += out_sz2;
1450
0
                seq_pos += out_sz2;
1451
0
                nm      += out_sz2;
1452
                //printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1453
0
            }
1454
0
            break;
1455
0
        }
1456
1457
0
        case 'i': { // Insertion (single base); BA
1458
0
            if (cig_len && cig_op != BAM_CINS) {
1459
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1460
0
                cig_len = 0;
1461
0
            }
1462
0
            if (ds & CRAM_BA) {
1463
0
                if (!codecs[DS_BA]) return -1;
1464
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1465
0
                                           cr->len ? &seq[pos-1] : NULL,
1466
0
                                           &out_sz);
1467
0
                if (r) return r;
1468
0
            }
1469
0
            cig_op = BAM_CINS;
1470
0
            cig_len++;
1471
0
            seq_pos++;
1472
0
            nm++;
1473
0
            break;
1474
0
        }
1475
1476
0
        case 'b': { // Several bases
1477
0
            int32_t len = cr->len ? cr->len-(pos-1) : 1;
1478
1479
0
            if (cig_len && cig_op != BAM_CMATCH) {
1480
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1481
0
                cig_len = 0;
1482
0
            }
1483
1484
0
            if (ds & CRAM_BB) {
1485
0
                if (!codecs[DS_BB]) return -1;
1486
0
                r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
1487
0
                                           cr->len ? &seq[pos-1] : NULL,
1488
0
                                           &len);
1489
0
                if (r) return r;
1490
1491
0
                if (decode_md || decode_nm) {
1492
0
                    int x;
1493
0
                    if (md_dist >= 0 && decode_md)
1494
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1495
1496
0
                    for (x = 0; x < len; x++) {
1497
0
                        if (x && decode_md)
1498
0
                            BLOCK_APPEND_UINT(s->aux_blk, 0);
1499
0
                        if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1500
0
                            md_dist = -1;
1501
0
                            break;
1502
0
                        } else {
1503
0
                            if (decode_md) {
1504
0
                                if (ref_pos + x > s->ref_end)
1505
0
                                    goto beyond_slice;
1506
0
                                char r = s->ref[ref_pos+x-s->ref_start +1];
1507
0
                                BLOCK_APPEND_CHAR(s->aux_blk, r);
1508
0
                            }
1509
0
                        }
1510
0
                    }
1511
1512
0
                    nm += x;
1513
0
                    md_dist = 0;
1514
0
                }
1515
0
            }
1516
1517
0
            cig_op = BAM_CMATCH;
1518
1519
0
            cig_len+=len;
1520
0
            seq_pos+=len;
1521
0
            ref_pos+=len;
1522
            //prev_pos+=len;
1523
0
            break;
1524
0
        }
1525
1526
0
        case 'q': { // Several quality values
1527
0
            int32_t len = cr->len ? cr->len - (pos-1) : 1;
1528
1529
0
            if (cig_len && cig_op != BAM_CMATCH) {
1530
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1531
0
                cig_len = 0;
1532
0
            }
1533
1534
0
            if (ds & CRAM_QQ) {
1535
0
                if (!codecs[DS_QQ]) return -1;
1536
0
                if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1537
0
                    && (unsigned char)*qual == 255)
1538
0
                    memset(qual, 30, cr->len); // ?
1539
0
                r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
1540
0
                                           (char *)&qual[pos-1], &len);
1541
0
                if (r) return r;
1542
0
            }
1543
1544
0
            cig_op = BAM_CMATCH;
1545
1546
            //prev_pos+=len;
1547
0
            break;
1548
0
        }
1549
1550
0
        case 'B': { // Read base; BA, QS
1551
#ifdef USE_X
1552
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1553
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1554
                cig_len = 0;
1555
            }
1556
#else
1557
0
            if (cig_len && cig_op != BAM_CMATCH) {
1558
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1559
0
                cig_len = 0;
1560
0
            }
1561
0
#endif
1562
0
            if (ds & CRAM_BA) {
1563
0
                if (!codecs[DS_BA]) return -1;
1564
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1565
0
                                           cr->len ? &seq[pos-1] : NULL,
1566
0
                                           &out_sz);
1567
1568
0
                if (decode_md || decode_nm) {
1569
0
                    if (md_dist >= 0 && decode_md)
1570
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1571
0
                    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1572
0
                        md_dist = -1;
1573
0
                    } else {
1574
0
                        if (decode_md) {
1575
0
                            if (ref_pos > s->ref_end)
1576
0
                                goto beyond_slice;
1577
0
                            BLOCK_APPEND_CHAR(s->aux_blk,
1578
0
                                              s->ref[ref_pos-s->ref_start +1]);
1579
0
                        }
1580
0
                        nm++;
1581
0
                        md_dist = 0;
1582
0
                    }
1583
0
                }
1584
0
            }
1585
0
            if (ds & CRAM_QS) {
1586
0
                if (!codecs[DS_QS]) return -1;
1587
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1588
0
                    && (unsigned char)*qual == 255)
1589
0
                    memset(qual, 30, cr->len); // ASCII ?.  Same as htsjdk
1590
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1591
0
                                           (char *)&qual[pos-1], &out_sz);
1592
0
            }
1593
#ifdef USE_X
1594
            cig_op = BAM_CBASE_MISMATCH;
1595
#else
1596
0
            cig_op = BAM_CMATCH;
1597
0
#endif
1598
0
            cig_len++;
1599
0
            seq_pos++;
1600
0
            ref_pos++;
1601
            //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1602
0
            break;
1603
0
        }
1604
1605
0
        case 'Q': { // Quality score; QS
1606
0
            if (ds & CRAM_QS) {
1607
0
                if (!codecs[DS_QS]) return -1;
1608
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
1609
0
                    (unsigned char)*qual == 255)
1610
0
                    memset(qual, 30, cr->len); // ?
1611
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1612
0
                                           (char *)&qual[pos-1], &out_sz);
1613
                //printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1614
0
            }
1615
0
            break;
1616
0
        }
1617
1618
0
        case 'H': { // hard clip; HC
1619
0
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
1620
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1621
0
                cig_len = 0;
1622
0
            }
1623
0
            if (ds & CRAM_HC) {
1624
0
                if (!codecs[DS_HC]) return -1;
1625
0
                r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
1626
0
                                           (char *)&i32, &out_sz);
1627
0
                if (r) return r;
1628
0
                cig_op = BAM_CHARD_CLIP;
1629
0
                cig_len += i32;
1630
0
            }
1631
0
            break;
1632
0
        }
1633
1634
0
        case 'P': { // padding; PD
1635
0
            if (cig_len && cig_op != BAM_CPAD) {
1636
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1637
0
                cig_len = 0;
1638
0
            }
1639
0
            if (ds & CRAM_PD) {
1640
0
                if (!codecs[DS_PD]) return -1;
1641
0
                r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
1642
0
                                           (char *)&i32, &out_sz);
1643
0
                if (r) return r;
1644
0
                cig_op = BAM_CPAD;
1645
0
                cig_len += i32;
1646
0
            }
1647
0
            break;
1648
0
        }
1649
1650
0
        case 'N': { // Ref skip; RS
1651
0
            if (cig_len && cig_op != BAM_CREF_SKIP) {
1652
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1653
0
                cig_len = 0;
1654
0
            }
1655
0
            if (ds & CRAM_RS) {
1656
0
                if (!codecs[DS_RS]) return -1;
1657
0
                r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
1658
0
                                           (char *)&i32, &out_sz);
1659
0
                if (r) return r;
1660
0
                cig_op = BAM_CREF_SKIP;
1661
0
                cig_len += i32;
1662
0
                ref_pos += i32;
1663
0
            }
1664
0
            break;
1665
0
        }
1666
1667
0
        default:
1668
0
            hts_log_error("Unknown feature code '%c'", op);
1669
0
            return -1;
1670
0
        }
1671
0
    }
1672
1673
0
    if (!(ds & CRAM_FC))
1674
0
        goto skip_cigar;
1675
1676
    /* An implicit match op for any unaccounted for bases */
1677
0
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1678
0
        if (s->ref && cr->ref_id >= 0) {
1679
0
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1680
0
                static int whinged = 0;
1681
0
                int rlen;
1682
0
                if (!whinged)
1683
0
                    hts_log_warning("Ref pos outside of ref sequence boundary");
1684
0
                whinged = 1;
1685
0
                rlen = bfd->ref[cr->ref_id].len - ref_pos;
1686
                // May miss MD/NM cases where both seq/ref are N, but this is a
1687
                // malformed cram file anyway.
1688
0
                if (rlen > 0) {
1689
0
                    if (seq_pos-1 + rlen < cr->len)
1690
0
                        memcpy(&seq[seq_pos-1],
1691
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1692
0
                    if ((cr->len - seq_pos + 1) - rlen > 0)
1693
0
                        memset(&seq[seq_pos-1+rlen], 'N',
1694
0
                               (cr->len - seq_pos + 1) - rlen);
1695
0
                } else {
1696
0
                    if (cr->len - seq_pos + 1 > 0)
1697
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1698
0
                }
1699
0
                if (md_dist >= 0)
1700
0
                    md_dist += cr->len - seq_pos + 1;
1701
0
            } else {
1702
0
                if (cr->len - seq_pos + 1 > 0) {
1703
0
                    if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
1704
0
                        goto beyond_slice;
1705
0
                    int remainder = cr->len - (seq_pos-1);
1706
0
                    int j = ref_pos - s->ref_start + 1;
1707
0
                    if (decode_md || decode_nm) {
1708
0
                        int i;
1709
0
                        char *N = memchr(&s->ref[j], 'N', remainder);
1710
0
                        if (!N) {
1711
                            // short cut the common case
1712
0
                            md_dist += cr->len - (seq_pos-1);
1713
0
                        } else {
1714
0
                            char *refp = &s->ref[j-(seq_pos-1)];
1715
0
                            md_dist += N-&s->ref[j];
1716
0
                            int i_start = seq_pos-1 + (N - &s->ref[j]);
1717
0
                            for (i = i_start; i < cr->len; i++) {
1718
0
                                char base = refp[i];
1719
0
                                if (base == 'N') {
1720
0
                                    if (add_md_char(s, decode_md, 'N',
1721
0
                                                    &md_dist) < 0)
1722
0
                                        return -1;
1723
0
                                    nm++;
1724
0
                                } else {
1725
0
                                    md_dist++;
1726
0
                                }
1727
0
                            }
1728
0
                        }
1729
0
                    }
1730
0
                    memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
1731
0
                }
1732
0
                ref_pos += cr->len - seq_pos + 1;
1733
0
            }
1734
0
        } else if (cr->ref_id >= 0) {
1735
            // So alignment end can be computed even when not decoding sequence
1736
0
            ref_pos += cr->len - seq_pos + 1;
1737
0
        }
1738
1739
0
        if (ncigar+1 >= cigar_alloc) {
1740
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1741
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1742
0
                return -1;
1743
0
            s->cigar = cigar;
1744
0
        }
1745
#ifdef USE_X
1746
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
1747
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1748
            cig_len = 0;
1749
        }
1750
        cig_op = BAM_CBASE_MATCH;
1751
#else
1752
0
        if (cig_len && cig_op != BAM_CMATCH) {
1753
0
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1754
0
            cig_len = 0;
1755
0
        }
1756
0
        cig_op = BAM_CMATCH;
1757
0
#endif
1758
0
        cig_len += cr->len - seq_pos+1;
1759
0
    }
1760
1761
0
 skip_cigar:
1762
1763
0
    if ((ds & CRAM_FN) && decode_md) {
1764
0
        if (md_dist >= 0)
1765
0
            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1766
0
    }
1767
1768
0
    if (cig_len) {
1769
0
        if (ncigar >= cigar_alloc) {
1770
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1771
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1772
0
                return -1;
1773
0
            s->cigar = cigar;
1774
0
        }
1775
1776
0
        cigar[ncigar++] = (cig_len<<4) + cig_op;
1777
0
    }
1778
1779
0
    cr->ncigar = ncigar - cr->cigar;
1780
0
    cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
1781
1782
    //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1783
1784
0
    if (ds & CRAM_MQ) {
1785
0
        if (!codecs[DS_MQ]) return -1;
1786
0
        r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
1787
0
                                   (char *)&cr->mqual, &out_sz);
1788
0
    } else {
1789
0
        cr->mqual = 40;
1790
0
    }
1791
1792
0
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1793
0
        int32_t out_sz2 = cr->len;
1794
1795
0
        if (!codecs[DS_QS]) return -1;
1796
0
        r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1797
0
                                   qual, &out_sz2);
1798
0
    }
1799
1800
0
    s->cigar = cigar;
1801
0
    s->cigar_alloc = cigar_alloc;
1802
0
    s->ncigar = ncigar;
1803
1804
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1805
0
        cr->len = 0;
1806
1807
0
    if (decode_md) {
1808
0
        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1809
0
        size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
1810
0
        if (has_MD < 0) {
1811
            // has_MD < 0; already have MDZ allocated in aux at -has_MD,
1812
            // but wrote MD to end of aux (at orig_aux).
1813
            // We need some memmoves to shuffle it around.
1814
0
            char tmp_MD_[1024], *tmp_MD = tmp_MD_;
1815
0
            unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
1816
0
            if (sz > 1024) {
1817
0
                tmp_MD = malloc(sz);
1818
0
                if (!tmp_MD)
1819
0
                    return -1;
1820
0
            }
1821
0
            memcpy(tmp_MD, orig_aux_p, sz);
1822
0
            memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
1823
0
                    &BLOCK_DATA(s->aux_blk)[-has_MD],
1824
0
                    orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
1825
0
            memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
1826
0
            if (tmp_MD != tmp_MD_)
1827
0
                free(tmp_MD);
1828
1829
0
            if (-has_NM > -has_MD)
1830
                // we inserted before NM, so move it up a bit
1831
0
                has_NM -= sz;
1832
0
        }
1833
        // else has_MD == 0 and we've already appended MD to the end.
1834
1835
0
        cr->aux_size += sz;
1836
0
    }
1837
1838
0
    if (decode_nm) {
1839
0
        if (has_NM == 0) {
1840
0
            char buf[7];
1841
0
            size_t buf_size;
1842
0
            buf[0] = 'N'; buf[1] = 'M';
1843
0
            if (nm <= UINT8_MAX) {
1844
0
                buf_size = 4;
1845
0
                buf[2] = 'C';
1846
0
                buf[3] = (nm>> 0) & 0xff;
1847
0
            } else if (nm <= UINT16_MAX) {
1848
0
                buf_size = 5;
1849
0
                buf[2] = 'S';
1850
0
                buf[3] = (nm>> 0) & 0xff;
1851
0
                buf[4] = (nm>> 8) & 0xff;
1852
0
            } else {
1853
0
                buf_size = 7;
1854
0
                buf[2] = 'I';
1855
0
                buf[3] = (nm>> 0) & 0xff;
1856
0
                buf[4] = (nm>> 8) & 0xff;
1857
0
                buf[5] = (nm>>16) & 0xff;
1858
0
                buf[6] = (nm>>24) & 0xff;
1859
0
            }
1860
0
            BLOCK_APPEND(s->aux_blk, buf, buf_size);
1861
0
            cr->aux_size += buf_size;
1862
0
        } else {
1863
            // Preallocated space for NM at -has_NM into aux block
1864
0
            unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
1865
0
            buf[0] = (nm>> 0) & 0xff;
1866
0
            buf[1] = (nm>> 8) & 0xff;
1867
0
            buf[2] = (nm>>16) & 0xff;
1868
0
            buf[3] = (nm>>24) & 0xff;
1869
0
        }
1870
0
    }
1871
1872
0
    return r;
1873
1874
0
 beyond_slice:
1875
    // Cramtools can create CRAMs that have sequence features outside the
1876
    // stated range of the container & slice reference extents (start + span).
1877
    // We have to check for these in many places, but for brevity have the
1878
    // error reporting in only one.
1879
0
    hts_log_error("CRAM CIGAR extends beyond slice reference extents");
1880
0
    return -1;
1881
1882
0
 block_err:
1883
0
    return -1;
1884
0
}
1885
1886
/*
1887
 * Quick and simple hash lookup for cram_map arrays
1888
 */
1889
0
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1890
0
    cram_map *m;
1891
1892
0
    m = map[CRAM_MAP(key[0],key[1])];
1893
0
    while (m && m->key != id)
1894
0
        m= m->next;
1895
1896
0
    return m;
1897
0
}
1898
1899
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1900
1901
1902
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1903
0
                               cram_block *blk, cram_record *cr) {
1904
0
    int i, r = 0, out_sz = 1;
1905
0
    unsigned char ntags;
1906
1907
0
    if (!c->comp_hdr->codecs[DS_TC]) return -1;
1908
0
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1909
0
                                            (char *)&ntags, &out_sz);
1910
0
    cr->ntags = ntags;
1911
1912
    //printf("TC=%d\n", cr->ntags);
1913
0
    cr->aux_size = 0;
1914
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1915
1916
0
    for (i = 0; i < cr->ntags; i++) {
1917
0
        int32_t id, out_sz = 1;
1918
0
        unsigned char tag_data[3];
1919
0
        cram_map *m;
1920
1921
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1922
0
        if (!c->comp_hdr->codecs[DS_TN]) return -1;
1923
0
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1924
0
                                                blk, (char *)&id, &out_sz);
1925
0
        if (out_sz == 3) {
1926
            // Tag name stored as 3 chars instead of an int?
1927
0
            memcpy(tag_data, &id, 3);
1928
0
        } else {
1929
0
            tag_data[0] = (id>>16) & 0xff;
1930
0
            tag_data[1] = (id>>8)  & 0xff;
1931
0
            tag_data[2] = id       & 0xff;
1932
0
        }
1933
1934
0
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1935
0
        if (!m)
1936
0
            return -1;
1937
0
        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1938
1939
0
        if (!m->codec) return -1;
1940
0
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1941
1942
0
        cr->aux_size += out_sz + 3;
1943
0
    }
1944
1945
0
    return r;
1946
1947
0
 block_err:
1948
0
    return -1;
1949
0
}
1950
1951
// has_MD and has_NM are filled out with 0 for none present,
1952
// 1 for present and verbatim, and -pos for present as placeholder
1953
// (MD*, NM*) to be generated and filled out at offset +pos.
1954
static int cram_decode_aux(cram_fd *fd,
1955
                           cram_container *c, cram_slice *s,
1956
                           cram_block *blk, cram_record *cr,
1957
0
                           int *has_MD, int *has_NM) {
1958
0
    int i, r = 0, out_sz = 1;
1959
0
    int32_t TL = 0;
1960
0
    unsigned char *TN;
1961
0
    uint32_t ds = s->data_series;
1962
1963
0
    if (!(ds & (CRAM_TL|CRAM_aux))) {
1964
0
        cr->aux = 0;
1965
0
        cr->aux_size = 0;
1966
0
        return 0;
1967
0
    }
1968
1969
0
    if (!c->comp_hdr->codecs[DS_TL]) return -1;
1970
0
    r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1971
0
                                            (char *)&TL, &out_sz);
1972
0
    if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1973
0
        return -1;
1974
1975
0
    TN = c->comp_hdr->TL[TL];
1976
0
    cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1977
1978
    //printf("TC=%d\n", cr->ntags);
1979
0
    cr->aux_size = 0;
1980
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1981
1982
0
    if (!(ds & CRAM_aux))
1983
0
        return 0;
1984
1985
0
    for (i = 0; i < cr->ntags; i++) {
1986
0
        int32_t id, out_sz = 1;
1987
0
        unsigned char tag_data[7];
1988
0
        cram_map *m;
1989
1990
0
        if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1991
0
            *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
1992
0
        if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1993
0
            *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
1994
1995
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1996
0
        tag_data[0] = TN[0];
1997
0
        tag_data[1] = TN[1];
1998
0
        tag_data[2] = TN[2];
1999
0
        id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2000
2001
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
2002
            // Place holder, fill out contents later.
2003
0
            int tag_data_size;
2004
0
            if (TN[0] == 'N' && TN[1] == 'M') {
2005
                // Use a fixed size, so we can allocate room for it now.
2006
0
                memcpy(&tag_data[2], "I\0\0\0\0", 5);
2007
0
                tag_data_size = 7;
2008
0
            } else if (TN[0] == 'R' && TN[1] == 'G') {
2009
                // RG is variable size, but known already.  Insert now
2010
0
                TN += 3;
2011
                // Equiv to fd->header->hrecs->rg[cr->rg], but this is the
2012
                // new header API equivalent.
2013
0
                const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
2014
0
                if (!rg)
2015
0
                    continue;
2016
2017
0
                size_t rg_len = strlen(rg);
2018
0
                tag_data[2] = 'Z';
2019
0
                BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2020
0
                BLOCK_APPEND(s->aux_blk, rg, rg_len);
2021
0
                BLOCK_APPEND_CHAR(s->aux_blk, '\0');
2022
0
                cr->aux_size += 3 + rg_len + 1;
2023
0
                cr->rg = -1; // prevents auto-add later
2024
0
                continue;
2025
0
            } else {
2026
                // Unknown size.  We'll insert MD into stream later.
2027
0
                tag_data[2] = 'Z';
2028
0
                tag_data_size = 3;
2029
0
            }
2030
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
2031
0
            cr->aux_size += tag_data_size;
2032
0
            TN += 3;
2033
0
        } else {
2034
0
            TN += 3;
2035
0
            m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2036
0
            if (!m)
2037
0
                return -1;
2038
2039
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2040
2041
0
            if (!m->codec) return -1;
2042
0
            r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2043
0
            if (r) break;
2044
0
            cr->aux_size += out_sz + 3;
2045
2046
            // cF CRAM flags.
2047
0
            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
2048
                // Remove cF tag
2049
0
                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
2050
0
                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
2051
0
                cr->aux_size -= out_sz+3;
2052
2053
                // bit 1 => don't auto-decode MD.
2054
                // Pretend MD is present verbatim, so we don't auto-generate
2055
0
                if ((cF & 1) && has_MD && *has_MD == 0)
2056
0
                    *has_MD = 1;
2057
2058
                // bit 1 => don't auto-decode NM
2059
0
                if ((cF & 2) && has_NM && *has_NM == 0)
2060
0
                    *has_NM = 1;
2061
0
            }
2062
0
        }
2063
2064
        // We could go to 2^32 fine, but we shouldn't be hitting this anyway,
2065
        // and it's protecting against memory hogs too.
2066
0
        if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
2067
0
            hts_log_error("CRAM->BAM aux block size overflow");
2068
0
            goto block_err;
2069
0
        }
2070
0
    }
2071
2072
0
    return r;
2073
2074
0
 block_err:
2075
0
    return -1;
2076
0
}
2077
2078
/* Resolve mate pair cross-references between recs within this slice */
2079
597
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2080
597
    int rec;
2081
2082
597
    if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2083
0
        for (rec = 0; rec < s->hdr->num_records; rec++) {
2084
0
            cram_record *cr = &s->crecs[rec];
2085
2086
0
            cr->tlen = 0;
2087
0
            cr->mate_pos = 0;
2088
0
            cr->mate_ref_id = -1;
2089
0
        }
2090
2091
0
        return 0;
2092
0
    }
2093
2094
597
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2095
0
        cram_record *cr = &s->crecs[rec];
2096
2097
0
        if (cr->mate_line >= 0) {
2098
0
            if (cr->mate_line < s->hdr->num_records) {
2099
                /*
2100
                 * On the first read, loop through computing lengths.
2101
                 * It's not perfect as we have one slice per reference so we
2102
                 * cannot detect when TLEN should be zero due to seqs that
2103
                 * map to multiple references.
2104
                 *
2105
                 * We also cannot set tlen correct when it spans a slice for
2106
                 * other reasons. This may make tlen too small. Should we
2107
                 * fix this by forcing TLEN to be stored verbatim in such cases?
2108
                 *
2109
                 * Or do we just admit defeat and output 0 for tlen? It's the
2110
                 * safe option...
2111
                 */
2112
0
                if (cr->tlen == INT64_MIN) {
2113
0
                    int id1 = rec, id2 = rec;
2114
0
                    int64_t aleft = cr->apos, aright = cr->aend;
2115
0
                    int64_t tlen;
2116
0
                    int ref = cr->ref_id;
2117
2118
                    // number of segments starting at the same point.
2119
0
                    int left_cnt = 0;
2120
0
                    int right_cnt = 0;
2121
2122
0
                    do {
2123
0
                        if (aleft > s->crecs[id2].apos)
2124
0
                            aleft = s->crecs[id2].apos, left_cnt = 1;
2125
0
                        else if (aleft == s->crecs[id2].apos)
2126
0
                            left_cnt++;
2127
0
                        if (aright < s->crecs[id2].aend) {
2128
0
                            aright = s->crecs[id2].aend;
2129
0
                            right_cnt = 1;
2130
0
                        } else if (aright == s->crecs[id2].aend) {
2131
0
                            right_cnt++;
2132
0
                        }
2133
0
                        if (s->crecs[id2].mate_line == -1) {
2134
0
                            s->crecs[id2].mate_line = rec;
2135
0
                            break;
2136
0
                        }
2137
0
                        if (s->crecs[id2].mate_line <= id2 ||
2138
0
                            s->crecs[id2].mate_line >= s->hdr->num_records)
2139
0
                            return -1;
2140
0
                        id2 = s->crecs[id2].mate_line;
2141
2142
0
                        if (s->crecs[id2].ref_id != ref)
2143
0
                            ref = -1;
2144
0
                    } while (id2 != id1);
2145
2146
0
                    if (ref != -1) {
2147
0
                        tlen = aright - aleft + 1;
2148
0
                        id1 = id2 = rec;
2149
2150
                        /*
2151
                         * When we have two seqs with identical start and
2152
                         * end coordinates, set +/- tlen based on 1st/last
2153
                         * bit flags instead, as a tie breaker.
2154
                         */
2155
0
                        if (s->crecs[id2].apos == aleft &&
2156
0
                            (s->crecs[id2].aend < aright ||
2157
0
                             left_cnt <= 1)) {
2158
                            // Leftmost, and not the rightmost
2159
0
                            s->crecs[id2].tlen = tlen;
2160
0
                            tlen = -tlen;
2161
0
                        } else if (s->crecs[id2].apos == aleft &&
2162
0
                                   s->crecs[id2].aend == aright &&
2163
0
                                   left_cnt > 1 && right_cnt > 1) {
2164
                            // Both leftmost and rightmost, resolve tie via
2165
                            // the BAM flags so changing order doesn't change
2166
                            // TLEN signs.
2167
0
                            if (s->crecs[id2].flags & BAM_FREAD1) {
2168
0
                                s->crecs[id2].tlen = tlen;
2169
0
                                tlen = -tlen;
2170
0
                            } else {
2171
0
                                s->crecs[id2].tlen = -tlen;
2172
0
                            }
2173
0
                        } else {
2174
                            // Rightmost or an internal
2175
0
                            s->crecs[id2].tlen = -tlen;
2176
0
                        }
2177
2178
0
                        id2 = s->crecs[id2].mate_line;
2179
0
                        while (id2 != id1) {
2180
0
                            s->crecs[id2].tlen = tlen;
2181
0
                            id2 = s->crecs[id2].mate_line;
2182
0
                        }
2183
0
                    } else {
2184
0
                        id1 = id2 = rec;
2185
2186
0
                        s->crecs[id2].tlen = 0;
2187
0
                        id2 = s->crecs[id2].mate_line;
2188
0
                        while (id2 != id1) {
2189
0
                            s->crecs[id2].tlen = 0;
2190
0
                            id2 = s->crecs[id2].mate_line;
2191
0
                        }
2192
0
                    }
2193
0
                }
2194
2195
0
                cr->mate_pos = s->crecs[cr->mate_line].apos;
2196
0
                cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2197
2198
                // paired
2199
0
                cr->flags |= BAM_FPAIRED;
2200
2201
                // set mate unmapped if needed
2202
0
                if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2203
0
                    cr->flags |= BAM_FMUNMAP;
2204
0
                    cr->tlen = 0;
2205
0
                }
2206
0
                if (cr->flags & BAM_FUNMAP) {
2207
0
                    cr->tlen = 0;
2208
0
                }
2209
2210
                // set mate reversed if needed
2211
0
                if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2212
0
                    cr->flags |= BAM_FMREVERSE;
2213
0
            } else {
2214
0
                hts_log_error("Mate line out of bounds: %d vs [0, %d]",
2215
0
                              cr->mate_line, s->hdr->num_records-1);
2216
0
            }
2217
2218
            /* FIXME: construct read names here too if needed */
2219
0
        } else {
2220
0
            if (cr->mate_flags & CRAM_M_REVERSE) {
2221
0
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2222
0
            }
2223
0
            if (cr->mate_flags & CRAM_M_UNMAP) {
2224
0
                cr->flags |= BAM_FMUNMAP;
2225
                //cr->mate_ref_id = -1;
2226
0
            }
2227
0
            if (!(cr->flags & BAM_FPAIRED))
2228
0
                cr->mate_ref_id = -1;
2229
0
        }
2230
2231
0
        if (cr->tlen == INT64_MIN)
2232
0
            cr->tlen = 0; // Just incase
2233
0
    }
2234
2235
597
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2236
0
        cram_record *cr = &s->crecs[rec];
2237
0
        if (cr->explicit_tlen != INT64_MIN)
2238
0
            cr->tlen = cr->explicit_tlen;
2239
0
    }
2240
2241
597
    return 0;
2242
597
}
2243
2244
0
static char *md5_print(unsigned char *md5, char *out) {
2245
0
    int i;
2246
0
    for (i = 0; i < 16; i++) {
2247
0
        out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2248
0
        out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2249
0
    }
2250
0
    out[32] = 0;
2251
2252
0
    return out;
2253
0
}
2254
2255
/*
2256
 * Utility function to decode tlen (ISIZE), as it's called
2257
 * in multiple places.
2258
 *
2259
 * Returns codec return value (0 on success).
2260
 */
2261
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
2262
0
                            cram_block *blk, int64_t *tlen) {
2263
0
    int out_sz = 1, r = 0;
2264
2265
0
    if (!c->comp_hdr->codecs[DS_TS]) return -1;
2266
0
    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2267
0
        int32_t i32;
2268
0
        r |= c->comp_hdr->codecs[DS_TS]
2269
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2270
0
                     (char *)&i32, &out_sz);
2271
0
        *tlen = i32;
2272
0
    } else {
2273
0
        r |= c->comp_hdr->codecs[DS_TS]
2274
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2275
0
                     (char *)tlen, &out_sz);
2276
0
    }
2277
0
    return r;
2278
0
}
2279
2280
/*
2281
 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2282
 * Returns 0 on success
2283
 *        -1 on failure
2284
 */
2285
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2286
603
                      sam_hdr_t *sh) {
2287
603
    cram_block *blk = s->block[0];
2288
603
    int32_t bf, ref_id;
2289
603
    unsigned char cf;
2290
603
    int out_sz, r = 0;
2291
603
    int rec;
2292
603
    char *seq = NULL, *qual = NULL;
2293
603
    int unknown_rg = -1;
2294
603
    int embed_ref;
2295
603
    char **refs = NULL;
2296
603
    uint32_t ds;
2297
603
    sam_hrecs_t *bfd = sh->hrecs;
2298
2299
603
    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2300
0
        return -1;
2301
2302
603
    ds = s->data_series;
2303
2304
603
    blk->bit = 7; // MSB first
2305
2306
    // Study the blocks and estimate approx sizes to preallocate.
2307
    // This looks to speed up decoding by around 8-9%.
2308
    // We can always shrink back down at the end if we overestimated.
2309
    // However it's likely that this also saves memory as own growth
2310
    // factor (*=1.5) is never applied.
2311
603
    {
2312
603
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2313
603
        int qsize=0, nsize=0, q_id=0;
2314
#else
2315
        int qsize, nsize, q_id;
2316
        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2317
        //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2318
#endif
2319
2320
603
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2321
603
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2322
603
        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2323
2324
        // To do - consider using q_id here to usurp the quality block and
2325
        // avoid a memcpy during decode.
2326
        // Specifically when quality is an external block uniquely used by
2327
        // DS_QS only, then we can set s->qual_blk directly to this
2328
        // block and save the codec->decode() calls. (Approx 3% cpu saving)
2329
603
    }
2330
2331
    /* Look for unknown RG, added as last by Java CRAM? */
2332
603
    if (bfd->nrg > 0 &&
2333
552
        bfd->rg[bfd->nrg-1].name != NULL &&
2334
552
        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2335
0
        unknown_rg = bfd->nrg-1;
2336
2337
603
    if (blk->content_type != CORE)
2338
3
        return -1;
2339
2340
600
    if (s->crecs)
2341
0
        free(s->crecs);
2342
600
    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2343
0
        return -1;
2344
2345
600
    ref_id = s->hdr->ref_seq_id;
2346
600
    if (CRAM_MAJOR_VERS(fd->version) < 4)
2347
600
       embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2348
0
    else
2349
0
       embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
2350
2351
600
    if (ref_id >= 0) {
2352
600
        if (embed_ref) {
2353
597
            cram_block *b;
2354
597
            if (s->hdr->ref_base_id < 0) {
2355
0
                hts_log_error("No reference specified and no embedded reference is available"
2356
0
                              " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2357
0
                              s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2358
0
                return -1;
2359
0
            }
2360
597
            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2361
597
            if (!b)
2362
0
                return -1;
2363
597
            if (cram_uncompress_block(b) != 0)
2364
0
                return -1;
2365
597
            s->ref = (char *)BLOCK_DATA(b);
2366
597
            s->ref_start = s->hdr->ref_seq_start;
2367
597
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2368
597
            if (s->hdr->ref_seq_span > b->uncomp_size) {
2369
0
                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2370
0
                              ref_id, s->ref_start, s->ref_end);
2371
0
                return -1;
2372
0
            }
2373
597
        } else if (!c->comp_hdr->no_ref) {
2374
            //// Avoid Java cramtools bug by loading entire reference seq
2375
            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2376
            //s->ref_start = 1;
2377
2378
3
            if (fd->required_fields & SAM_SEQ) {
2379
3
                s->ref =
2380
3
                cram_get_ref(fd, s->hdr->ref_seq_id,
2381
3
                             s->hdr->ref_seq_start,
2382
3
                             s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2383
3
            }
2384
3
            s->ref_start = s->hdr->ref_seq_start;
2385
3
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2386
2387
            /* Sanity check */
2388
3
            if (s->ref_start < 0) {
2389
0
                hts_log_warning("Slice starts before base 1"
2390
0
                                " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2391
0
                                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2392
0
                s->ref_start = 0;
2393
0
            }
2394
3
            pthread_mutex_lock(&fd->ref_lock);
2395
3
            pthread_mutex_lock(&fd->refs->lock);
2396
3
            if ((fd->required_fields & SAM_SEQ) &&
2397
3
                ref_id < fd->refs->nref && fd->refs->ref_id &&
2398
0
                s->ref_end > fd->refs->ref_id[ref_id]->length) {
2399
0
                s->ref_end = fd->refs->ref_id[ref_id]->length;
2400
0
            }
2401
3
            pthread_mutex_unlock(&fd->refs->lock);
2402
3
            pthread_mutex_unlock(&fd->ref_lock);
2403
3
        }
2404
600
    }
2405
2406
600
    if ((fd->required_fields & SAM_SEQ) &&
2407
600
        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2408
3
        hts_log_error("Unable to fetch reference %s:%"PRId64"-%"PRId64,
2409
3
                      fd->refs->ref_id && ref_id >= 0 && ref_id < fd->refs->nref
2410
3
                      ? fd->refs->ref_id[ref_id]->name
2411
3
                      : "unknown",
2412
3
                      s->hdr->ref_seq_start,
2413
3
                      s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2414
3
        return -1;
2415
3
    }
2416
2417
597
    if (CRAM_MAJOR_VERS(fd->version) != 1
2418
0
        && (fd->required_fields & SAM_SEQ)
2419
0
        && s->hdr->ref_seq_id >= 0
2420
0
        && !fd->ignore_md5
2421
0
        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2422
0
        hts_md5_context *md5;
2423
0
        unsigned char digest[16];
2424
2425
0
        if (s->ref && s->hdr->ref_seq_id >= 0) {
2426
0
            int start, len;
2427
2428
0
            if (s->hdr->ref_seq_start >= s->ref_start) {
2429
0
                start = s->hdr->ref_seq_start - s->ref_start;
2430
0
            } else {
2431
0
                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2432
0
                                ref_id, s->ref_start, s->ref_end);
2433
0
                start = 0;
2434
0
            }
2435
2436
0
            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2437
0
                len = s->hdr->ref_seq_span;
2438
0
            } else {
2439
0
                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2440
0
                                ref_id, s->ref_start, s->ref_end);
2441
0
                len = s->ref_end - s->ref_start + 1;
2442
0
            }
2443
2444
0
            if (!(md5 = hts_md5_init()))
2445
0
                return -1;
2446
0
            if (start + len > s->ref_end - s->ref_start + 1)
2447
0
                len = s->ref_end - s->ref_start + 1 - start;
2448
0
            if (len >= 0)
2449
0
                hts_md5_update(md5, s->ref + start, len);
2450
0
            hts_md5_final(digest, md5);
2451
0
            hts_md5_destroy(md5);
2452
0
        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2453
0
            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2454
0
            if (b) {
2455
0
                if (!(md5 = hts_md5_init()))
2456
0
                    return -1;
2457
0
                hts_md5_update(md5, b->data, b->uncomp_size);
2458
0
                hts_md5_final(digest, md5);
2459
0
                hts_md5_destroy(md5);
2460
0
            }
2461
0
        }
2462
2463
0
        if (!c->comp_hdr->no_ref &&
2464
0
            ((!s->ref && s->hdr->ref_base_id < 0)
2465
0
             || memcmp(digest, s->hdr->md5, 16) != 0)) {
2466
0
            char M[33];
2467
0
            const char *rname = sam_hdr_tid2name(sh, ref_id);
2468
0
            if (!rname) rname="?"; // cannot happen normally
2469
0
            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
2470
0
                          rname, s->ref_start, s->ref_end);
2471
0
            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
2472
0
            hts_log_error("Ref   : %s", md5_print(digest, M));
2473
0
            kstring_t ks = KS_INITIALIZE;
2474
0
            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
2475
0
                hts_log_error("@SQ M5: %s", ks.s);
2476
0
            hts_log_error("Please check the reference given is correct");
2477
0
            ks_free(&ks);
2478
0
            return -1;
2479
0
        }
2480
0
    }
2481
2482
597
    if (ref_id == -2) {
2483
0
        pthread_mutex_lock(&fd->ref_lock);
2484
0
        pthread_mutex_lock(&fd->refs->lock);
2485
0
        refs = calloc(fd->refs->nref, sizeof(char *));
2486
0
        pthread_mutex_unlock(&fd->refs->lock);
2487
0
        pthread_mutex_unlock(&fd->ref_lock);
2488
0
        if (!refs)
2489
0
            return -1;
2490
0
    }
2491
2492
597
    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2493
597
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2494
0
        cram_record *cr = &s->crecs[rec];
2495
0
        int has_MD, has_NM;
2496
2497
        //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2498
2499
0
        cr->s = s;
2500
2501
0
        out_sz = 1; /* decode 1 item */
2502
0
        if (ds & CRAM_BF) {
2503
0
            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
2504
0
            r |= c->comp_hdr->codecs[DS_BF]
2505
0
                            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2506
0
                                     (char *)&bf, &out_sz);
2507
0
            if (r || bf < 0 ||
2508
0
                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2509
0
                goto block_err;
2510
0
            bf = fd->bam_flag_swap[bf];
2511
0
            cr->flags = bf;
2512
0
        } else {
2513
0
            cr->flags = bf = 0x4; // unmapped
2514
0
        }
2515
2516
0
        if (ds & CRAM_CF) {
2517
0
            if (CRAM_MAJOR_VERS(fd->version) == 1) {
2518
                /* CF is byte in 1.0, int32 in 2.0 */
2519
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2520
0
                r |= c->comp_hdr->codecs[DS_CF]
2521
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2522
0
                                         (char *)&cf, &out_sz);
2523
0
                if (r) goto block_err;
2524
0
                cr->cram_flags = cf;
2525
0
            } else {
2526
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2527
0
                r |= c->comp_hdr->codecs[DS_CF]
2528
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2529
0
                                         (char *)&cr->cram_flags, &out_sz);
2530
0
                if (r) goto block_err;
2531
0
                cf = cr->cram_flags;
2532
0
            }
2533
0
        } else {
2534
0
            cf = cr->cram_flags = 0;
2535
0
        }
2536
2537
0
        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2538
0
            if (ds & CRAM_RI) {
2539
0
                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
2540
0
                r |= c->comp_hdr->codecs[DS_RI]
2541
0
                                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2542
0
                                         (char *)&cr->ref_id, &out_sz);
2543
0
                if (r) goto block_err;
2544
0
                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2545
0
                    && cr->ref_id >= 0
2546
0
                    && cr->ref_id != last_ref_id) {
2547
0
                    if (!c->comp_hdr->no_ref) {
2548
                        // Range(fd):  seq >= 0, unmapped -1, unspecified   -2
2549
                        // Slice(s):   seq >= 0, unmapped -1, multiple refs -2
2550
                        // Record(cr): seq >= 0, unmapped -1
2551
0
                        pthread_mutex_lock(&fd->range_lock);
2552
0
                        int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
2553
0
                        pthread_mutex_unlock(&fd->range_lock);
2554
0
                        if  (need_ref) {
2555
0
                            if (!refs[cr->ref_id])
2556
0
                                refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
2557
0
                            if (!(s->ref = refs[cr->ref_id]))
2558
0
                                goto block_err;
2559
0
                        } else {
2560
                            // For multi-ref containers, we don't need to fetch all
2561
                            // refs if we're only querying one.
2562
0
                            s->ref = NULL;
2563
0
                        }
2564
2565
0
                        pthread_mutex_lock(&fd->range_lock);
2566
0
                        int discard_last_ref = (last_ref_id >= 0 &&
2567
0
                                                refs[last_ref_id] &&
2568
0
                                                (fd->range.refid == -2 ||
2569
0
                                                 last_ref_id == fd->range.refid));
2570
0
                        pthread_mutex_unlock(&fd->range_lock);
2571
0
                        if (discard_last_ref) {
2572
0
                            pthread_mutex_lock(&fd->ref_lock);
2573
0
                            discard_last_ref = !fd->unsorted;
2574
0
                            pthread_mutex_unlock(&fd->ref_lock);
2575
0
                        }
2576
0
                        if (discard_last_ref) {
2577
0
                            cram_ref_decr(fd->refs, last_ref_id);
2578
0
                            refs[last_ref_id] = NULL;
2579
0
                        }
2580
0
                    }
2581
0
                    s->ref_start = 1;
2582
0
                    pthread_mutex_lock(&fd->ref_lock);
2583
0
                    pthread_mutex_lock(&fd->refs->lock);
2584
0
                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2585
0
                    pthread_mutex_unlock(&fd->refs->lock);
2586
0
                    pthread_mutex_unlock(&fd->ref_lock);
2587
2588
0
                    last_ref_id = cr->ref_id;
2589
0
                }
2590
0
            } else {
2591
0
                cr->ref_id = -1;
2592
0
            }
2593
0
        } else {
2594
0
            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2595
0
        }
2596
0
        if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
2597
0
            hts_log_error("Requested unknown reference ID %d", cr->ref_id);
2598
0
            goto block_err;
2599
0
        }
2600
2601
0
        if (ds & CRAM_RL) {
2602
0
            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
2603
0
            r |= c->comp_hdr->codecs[DS_RL]
2604
0
                            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2605
0
                                     (char *)&cr->len, &out_sz);
2606
0
            if (r) goto block_err;
2607
0
            if (cr->len < 0) {
2608
0
                hts_log_error("Read has negative length");
2609
0
                goto block_err;
2610
0
            }
2611
0
        }
2612
2613
0
        if (ds & CRAM_AP) {
2614
0
            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
2615
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
2616
0
                r |= c->comp_hdr->codecs[DS_AP]
2617
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2618
0
                                         (char *)&cr->apos, &out_sz);
2619
0
            } else  {
2620
0
                int32_t i32;
2621
0
                r |= c->comp_hdr->codecs[DS_AP]
2622
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2623
0
                                         (char *)&i32, &out_sz);
2624
0
                cr->apos = i32;
2625
0
            }
2626
0
            if (r) goto block_err;;
2627
0
            if (c->comp_hdr->AP_delta) {
2628
0
                if (cr->apos < 0 && c->unsorted == 0) {
2629
                    // cache locally in c->unsorted so we don't have an
2630
                    // excessive number of locks
2631
0
                    pthread_mutex_lock(&fd->ref_lock);
2632
0
                    c->unsorted = fd->unsorted = 1;
2633
0
                    pthread_mutex_unlock(&fd->ref_lock);
2634
0
                }
2635
0
                cr->apos += s->last_apos;
2636
0
            }
2637
0
            s->last_apos=  cr->apos;
2638
0
        } else {
2639
0
            cr->apos = c->ref_seq_start;
2640
0
        }
2641
2642
0
        if (ds & CRAM_RG) {
2643
0
            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
2644
0
            r |= c->comp_hdr->codecs[DS_RG]
2645
0
                           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2646
0
                                    (char *)&cr->rg, &out_sz);
2647
0
            if (r) goto block_err;
2648
0
            if (cr->rg == unknown_rg)
2649
0
                cr->rg = -1;
2650
0
        } else {
2651
0
            cr->rg = -1;
2652
0
        }
2653
2654
0
        cr->name_len = 0;
2655
2656
0
        if (c->comp_hdr->read_names_included) {
2657
0
            int32_t out_sz2 = 1; // block auto grows in decode()
2658
2659
            // Read directly into name cram_block
2660
0
            cr->name = BLOCK_SIZE(s->name_blk);
2661
0
            if (ds & CRAM_RN) {
2662
0
                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2663
0
                r |= c->comp_hdr->codecs[DS_RN]
2664
0
                                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2665
0
                                         (char *)s->name_blk, &out_sz2);
2666
0
                if (r) goto block_err;
2667
0
                cr->name_len = out_sz2;
2668
0
            }
2669
0
        }
2670
2671
0
        cr->mate_pos = 0;
2672
0
        cr->mate_line = -1;
2673
0
        cr->mate_ref_id = -1;
2674
0
        cr->explicit_tlen = INT64_MIN;
2675
0
        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2676
0
            if (ds & CRAM_MF) {
2677
0
                if (CRAM_MAJOR_VERS(fd->version) == 1) {
2678
                    /* MF is byte in 1.0, int32 in 2.0 */
2679
0
                    unsigned char mf;
2680
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2681
0
                    r |= c->comp_hdr->codecs[DS_MF]
2682
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2683
0
                                             blk, (char *)&mf, &out_sz);
2684
0
                    if (r) goto block_err;
2685
0
                    cr->mate_flags = mf;
2686
0
                } else {
2687
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2688
0
                    r |= c->comp_hdr->codecs[DS_MF]
2689
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2690
0
                                             blk,
2691
0
                                             (char *)&cr->mate_flags,
2692
0
                                             &out_sz);
2693
0
                    if (r) goto block_err;
2694
0
                }
2695
0
            } else {
2696
0
                cr->mate_flags = 0;
2697
0
            }
2698
2699
0
            if (!c->comp_hdr->read_names_included) {
2700
0
                int32_t out_sz2 = 1;
2701
2702
                // Read directly into name cram_block
2703
0
                cr->name = BLOCK_SIZE(s->name_blk);
2704
0
                if (ds & CRAM_RN) {
2705
0
                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2706
0
                    r |= c->comp_hdr->codecs[DS_RN]
2707
0
                                    ->decode(s, c->comp_hdr->codecs[DS_RN],
2708
0
                                             blk, (char *)s->name_blk,
2709
0
                                             &out_sz2);
2710
0
                    if (r) goto block_err;
2711
0
                    cr->name_len = out_sz2;
2712
0
                }
2713
0
            }
2714
2715
0
            if (ds & CRAM_NS) {
2716
0
                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
2717
0
                r |= c->comp_hdr->codecs[DS_NS]
2718
0
                                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2719
0
                                         (char *)&cr->mate_ref_id, &out_sz);
2720
0
                if (r) goto block_err;
2721
0
            }
2722
2723
            // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2724
            // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2725
            //     /* Paired, but unmapped */
2726
            //     cr->flags |= BAM_FMUNMAP;
2727
            // }
2728
2729
0
            if (ds & CRAM_NP) {
2730
0
                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
2731
0
                if (CRAM_MAJOR_VERS(fd->version) < 4) {
2732
0
                    int32_t i32;
2733
0
                    r |= c->comp_hdr->codecs[DS_NP]
2734
0
                                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2735
0
                                             (char *)&i32, &out_sz);
2736
0
                    cr->mate_pos = i32;
2737
0
                } else {
2738
0
                    r |= c->comp_hdr->codecs[DS_NP]
2739
0
                        ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2740
0
                                 (char *)&cr->mate_pos, &out_sz);
2741
0
                }
2742
0
                if (r) goto block_err;
2743
0
            }
2744
2745
0
            if (ds & CRAM_TS) {
2746
0
                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
2747
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
2748
0
                if (r) goto block_err;
2749
0
            } else {
2750
0
                cr->tlen = INT64_MIN;
2751
0
            }
2752
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2753
            // else not detached
2754
0
            if (ds & CRAM_NF) {
2755
0
                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
2756
0
                r |= c->comp_hdr->codecs[DS_NF]
2757
0
                                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2758
0
                                         (char *)&cr->mate_line, &out_sz);
2759
0
                if (r) goto block_err;
2760
0
                cr->mate_line += rec + 1;
2761
2762
                //cr->name_len = sprintf(name, "%d", name_id++);
2763
                //cr->name = DSTRING_LEN(name_ds);
2764
                //dstring_nappend(name_ds, name, cr->name_len);
2765
2766
0
                cr->mate_ref_id = -1;
2767
0
                cr->tlen = INT64_MIN;
2768
0
                cr->mate_pos = 0;
2769
0
            } else  {
2770
0
                cr->mate_flags = 0;
2771
0
                cr->tlen = INT64_MIN;
2772
0
            }
2773
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2774
0
                if (ds & CRAM_TS) {
2775
0
                    r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2776
0
                    if (r) return r;
2777
0
                } else {
2778
0
                    cr->mate_flags = 0;
2779
0
                    cr->tlen = INT64_MIN;
2780
0
                }
2781
0
            }
2782
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2783
0
            if (ds & CRAM_TS) {
2784
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2785
0
                if (r) return r;
2786
0
            } else {
2787
0
                cr->mate_flags = 0;
2788
0
                cr->tlen = INT64_MIN;
2789
0
            }
2790
0
        } else {
2791
0
            cr->mate_flags = 0;
2792
0
            cr->tlen = INT64_MIN;
2793
0
        }
2794
        /*
2795
        else if (!name[0]) {
2796
            //name[0] = '?'; name[1] = 0;
2797
            //cr->name_len = 1;
2798
            //cr->name=  DSTRING_LEN(s->name_ds);
2799
            //dstring_nappend(s->name_ds, "?", 1);
2800
2801
            cr->mate_ref_id = -1;
2802
            cr->tlen = 0;
2803
            cr->mate_pos = 0;
2804
        }
2805
        */
2806
2807
        /* Auxiliary tags */
2808
0
        has_MD = has_NM = 0;
2809
0
        if (CRAM_MAJOR_VERS(fd->version) == 1)
2810
0
            r |= cram_decode_aux_1_0(c, s, blk, cr);
2811
0
        else
2812
0
            r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
2813
0
        if (r) goto block_err;
2814
2815
        /* Fake up dynamic string growth and appending */
2816
0
        if (ds & CRAM_RL) {
2817
0
            cr->seq = BLOCK_SIZE(s->seqs_blk);
2818
0
            BLOCK_RESIZE(s->seqs_blk, cr->seq + cr->len);
2819
0
            seq = (char *)BLOCK_END(s->seqs_blk);
2820
0
            BLOCK_SIZE(s->seqs_blk) += cr->len;
2821
2822
0
            if (!seq)
2823
0
                goto block_err;
2824
2825
0
            cr->qual = BLOCK_SIZE(s->qual_blk);
2826
0
            BLOCK_RESIZE(s->qual_blk, cr->qual + cr->len);
2827
0
            qual = (char *)BLOCK_END(s->qual_blk);
2828
0
            BLOCK_SIZE(s->qual_blk) += cr->len;
2829
2830
0
            if (!s->ref)
2831
0
                memset(seq, '=', cr->len);
2832
0
        }
2833
2834
0
        if (!(bf & BAM_FUNMAP)) {
2835
0
            if ((ds & CRAM_AP) && cr->apos <= 0) {
2836
0
                hts_log_error("Read has alignment position %"PRId64
2837
0
                              " but no unmapped flag",
2838
0
                              cr->apos);
2839
0
                goto block_err;
2840
0
            }
2841
            /* Decode sequence and generate CIGAR */
2842
0
            if (ds & (CRAM_SEQ | CRAM_MQ)) {
2843
0
                r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
2844
0
                                     has_MD, has_NM);
2845
0
                if (r) goto block_err;
2846
0
            } else {
2847
0
                cr->cigar = 0;
2848
0
                cr->ncigar = 0;
2849
0
                cr->aend = cr->apos;
2850
0
                cr->mqual = 0;
2851
0
            }
2852
0
        } else {
2853
0
            int out_sz2 = cr->len;
2854
2855
            //puts("Unmapped");
2856
0
            cr->cigar = 0;
2857
0
            cr->ncigar = 0;
2858
0
            cr->aend = cr->apos;
2859
0
            cr->mqual = 0;
2860
2861
0
            if (ds & CRAM_BA && cr->len) {
2862
0
                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
2863
0
                r |= c->comp_hdr->codecs[DS_BA]
2864
0
                                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2865
0
                                         (char *)seq, &out_sz2);
2866
0
                if (r) goto block_err;
2867
0
            }
2868
2869
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2870
0
                out_sz2 = cr->len;
2871
0
                if (ds & CRAM_QS && cr->len >= 0) {
2872
0
                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
2873
0
                    r |= c->comp_hdr->codecs[DS_QS]
2874
0
                                    ->decode(s, c->comp_hdr->codecs[DS_QS],
2875
0
                                             blk, qual, &out_sz2);
2876
0
                    if (r) goto block_err;
2877
0
                }
2878
0
            } else {
2879
0
                if (ds & CRAM_RL)
2880
0
                    memset(qual, 255, cr->len);
2881
0
            }
2882
0
        }
2883
2884
0
        if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
2885
0
            int i, j;
2886
0
            for (i = 0, j = cr->len-1; i < j; i++, j--) {
2887
0
                unsigned char c;
2888
0
                c = qual[i];
2889
0
                qual[i] = qual[j];
2890
0
                qual[j] = c;
2891
0
            }
2892
0
        }
2893
0
    }
2894
2895
597
    pthread_mutex_lock(&fd->ref_lock);
2896
597
    if (refs) {
2897
0
        int i;
2898
0
        for (i = 0; i < fd->refs->nref; i++) {
2899
0
            if (refs[i])
2900
0
                cram_ref_decr(fd->refs, i);
2901
0
        }
2902
0
        free(refs);
2903
0
        refs = NULL;
2904
597
    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2905
0
        cram_ref_decr(fd->refs, ref_id);
2906
0
    }
2907
597
    pthread_mutex_unlock(&fd->ref_lock);
2908
2909
    /* Resolve mate pair cross-references between recs within this slice */
2910
597
    r |= cram_decode_slice_xref(s, fd->required_fields);
2911
2912
    // Free the original blocks as we no longer need these.
2913
597
    {
2914
597
        int i;
2915
3.17k
        for (i = 0; i < s->hdr->num_blocks; i++) {
2916
2.57k
            cram_block *b = s->block[i];
2917
2.57k
            cram_free_block(b);
2918
2.57k
            s->block[i] = NULL;
2919
2.57k
        }
2920
597
    }
2921
2922
    // Also see initial BLOCK_RESIZE_EXACT at top of function.
2923
    // As we grow blocks we overallocate by up to 50%. So shrink
2924
    // back to their final sizes here.
2925
    //
2926
    //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2927
    //      (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2928
    //      (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2929
    //      (int)s->name_blk->byte, (int)s->name_blk->alloc,
2930
    //      (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2931
597
    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2932
597
    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2933
597
    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2934
597
    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2935
2936
597
    return r;
2937
2938
0
 block_err:
2939
0
    if (refs) {
2940
0
        int i;
2941
0
        pthread_mutex_lock(&fd->ref_lock);
2942
0
        for (i = 0; i < fd->refs->nref; i++) {
2943
0
            if (refs[i])
2944
0
                cram_ref_decr(fd->refs, i);
2945
0
        }
2946
0
        free(refs);
2947
0
        pthread_mutex_unlock(&fd->ref_lock);
2948
0
    }
2949
2950
0
    return -1;
2951
597
}
2952
2953
typedef struct {
2954
    cram_fd *fd;
2955
    cram_container *c;
2956
    cram_slice *s;
2957
    sam_hdr_t *h;
2958
    int exit_code;
2959
} cram_decode_job;
2960
2961
0
void *cram_decode_slice_thread(void *arg) {
2962
0
    cram_decode_job *j = (cram_decode_job *)arg;
2963
2964
0
    j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2965
2966
0
    return j;
2967
0
}
2968
2969
/*
2970
 * Spawn a multi-threaded version of cram_decode_slice().
2971
 */
2972
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2973
603
                         sam_hdr_t *bfd) {
2974
603
    cram_decode_job *j;
2975
603
    int nonblock;
2976
2977
603
    if (!fd->pool)
2978
603
        return cram_decode_slice(fd, c, s, bfd);
2979
2980
0
    if (!(j = malloc(sizeof(*j))))
2981
0
        return -1;
2982
2983
0
    j->fd = fd;
2984
0
    j->c  = c;
2985
0
    j->s  = s;
2986
0
    j->h  = bfd;
2987
2988
0
    nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2989
2990
0
    int saved_errno = errno;
2991
0
    errno = 0;
2992
0
    if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2993
0
                                  j, nonblock)) {
2994
        /* Would block */
2995
0
        if (errno != EAGAIN)
2996
0
            return -1;
2997
0
        fd->job_pending = j;
2998
0
    } else {
2999
0
        fd->job_pending = NULL;
3000
0
    }
3001
0
    errno = saved_errno;
3002
3003
    // flush too
3004
0
    return 0;
3005
0
}
3006
3007
3008
/* ----------------------------------------------------------------------
3009
 * CRAM sequence iterators.
3010
 */
3011
3012
/*
3013
 * Converts a cram in-memory record into a bam in-memory record. We
3014
 * pass a pointer to a bam_seq_t pointer along with the a pointer to
3015
 * the allocated size. These can initially be pointers to NULL and zero.
3016
 *
3017
 * This function will reallocate the bam buffer as required and update
3018
 * (*bam)->alloc accordingly, allowing it to be used within a loop
3019
 * efficiently without needing to allocate new bam objects over and
3020
 * over again.
3021
 *
3022
 * Returns the used size of the bam record on success
3023
 *         -1 on failure.
3024
 */
3025
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
3026
0
                cram_record *cr, int rec, bam_seq_t **bam) {
3027
0
    int ret, rg_len;
3028
0
    char name_a[1024], *name;
3029
0
    int name_len;
3030
0
    char *aux;
3031
0
    char *seq, *qual;
3032
0
    sam_hrecs_t *bfd = sh->hrecs;
3033
3034
    /* Assign names if not explicitly set */
3035
0
    if (fd->required_fields & SAM_QNAME) {
3036
0
        if (cr->name_len) {
3037
0
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3038
0
            name_len = cr->name_len;
3039
0
        } else {
3040
0
            name = name_a;
3041
0
            if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
3042
0
                s->crecs[cr->mate_line].name_len > 0) {
3043
                // Copy our mate if non-zero.
3044
0
                memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
3045
0
                       s->crecs[cr->mate_line].name_len);
3046
0
                name = name_a + s->crecs[cr->mate_line].name_len;
3047
0
            } else {
3048
                // Otherwise generate a name based on prefix
3049
0
                name_len = strlen(fd->prefix);
3050
0
                memcpy(name, fd->prefix, name_len);
3051
0
                name += name_len;
3052
0
                *name++ = ':';
3053
0
                if (cr->mate_line >= 0 && cr->mate_line < rec) {
3054
0
                    name = (char *)append_uint64((unsigned char *)name,
3055
0
                                                 s->hdr->record_counter +
3056
0
                                                 cr->mate_line + 1);
3057
0
                } else {
3058
0
                    name = (char *)append_uint64((unsigned char *)name,
3059
0
                                                 s->hdr->record_counter +
3060
0
                                                 rec + 1);
3061
0
                }
3062
0
            }
3063
0
            name_len = name - name_a;
3064
0
            name = name_a;
3065
0
        }
3066
0
    } else {
3067
0
        name = "?";
3068
0
        name_len = 1;
3069
0
    }
3070
3071
    /* Generate BAM record */
3072
0
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
3073
0
        return -1;
3074
0
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3075
3076
0
    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3077
0
        if (!BLOCK_DATA(s->seqs_blk))
3078
0
            return -1;
3079
0
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3080
0
    } else {
3081
0
        seq = "*";
3082
0
        cr->len = 0;
3083
0
    }
3084
3085
0
    if (fd->required_fields & SAM_QUAL) {
3086
0
        if (!BLOCK_DATA(s->qual_blk))
3087
0
            return -1;
3088
0
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3089
0
    } else {
3090
0
        qual = NULL;
3091
0
    }
3092
3093
0
    ret = bam_set1(*bam,
3094
0
                   name_len, name,
3095
0
                   cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
3096
0
                   cr->ncigar, &s->cigar[cr->cigar],
3097
0
                   cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
3098
0
                   cr->len, seq, qual,
3099
0
                   cr->aux_size + rg_len);
3100
0
    if (ret < 0) {
3101
0
        return ret;
3102
0
    }
3103
3104
0
    aux = (char *)bam_aux(*bam);
3105
3106
    /* Auxiliary strings */
3107
0
    if (cr->aux_size != 0) {
3108
0
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3109
0
        aux += cr->aux_size;
3110
0
        (*bam)->l_data += cr->aux_size;
3111
0
    }
3112
3113
    /* RG:Z: */
3114
0
    if (rg_len > 0) {
3115
0
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3116
0
        int len = bfd->rg[cr->rg].name_len;
3117
0
        memcpy(aux, bfd->rg[cr->rg].name, len);
3118
0
        aux += len;
3119
0
        *aux++ = 0;
3120
0
        (*bam)->l_data += rg_len;
3121
0
    }
3122
3123
0
    return (*bam)->l_data;
3124
0
}
3125
3126
/*
3127
 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3128
 */
3129
3130
/*
3131
 * Load first container.
3132
 * Called when fd->ctr is NULL>
3133
 *
3134
 * Returns container on success
3135
 *        NULL on failure.
3136
 */
3137
783
static cram_container *cram_first_slice(cram_fd *fd) {
3138
783
    cram_container *c;
3139
3140
7.55k
    do {
3141
7.55k
        if (fd->ctr)
3142
6.77k
            cram_free_container(fd->ctr);
3143
3144
7.55k
        if (!(c = fd->ctr = cram_read_container(fd)))
3145
20
            return NULL;
3146
7.53k
        c->curr_slice_mt = c->curr_slice;
3147
7.53k
    } while (c->length == 0);
3148
3149
    /*
3150
     * The first container may be a result of a sub-range query.
3151
     * In which case it may still not be the optimal starting point
3152
     * due to skipped containers/slices in the index.
3153
     */
3154
    // No need for locks here as we're in the main thread.
3155
763
    if (fd->range.refid != -2) {
3156
0
        while (c->ref_seq_id != -2 &&
3157
0
               (c->ref_seq_id < fd->range.refid ||
3158
0
                (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3159
0
                 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3160
0
            if (0 != cram_seek(fd, c->length, SEEK_CUR))
3161
0
                return NULL;
3162
0
            cram_free_container(fd->ctr);
3163
0
            do {
3164
0
                if (!(c = fd->ctr = cram_read_container(fd)))
3165
0
                    return NULL;
3166
0
            } while (c->length == 0);
3167
0
        }
3168
3169
0
        if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3170
0
            fd->eof = 1;
3171
0
            return NULL;
3172
0
        }
3173
0
    }
3174
3175
763
    if (!(c->comp_hdr_block = cram_read_block(fd)))
3176
24
        return NULL;
3177
739
    if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3178
26
        return NULL;
3179
3180
713
    c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3181
713
    if (!c->comp_hdr)
3182
665
        return NULL;
3183
48
    if (!c->comp_hdr->AP_delta &&
3184
0
        sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3185
0
        pthread_mutex_lock(&fd->ref_lock);
3186
0
        fd->unsorted = 1;
3187
0
        pthread_mutex_unlock(&fd->ref_lock);
3188
0
    }
3189
3190
48
    return c;
3191
713
}
3192
3193
1.38k
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3194
1.38k
    cram_container *c_curr;  // container being consumed via cram_get_seq()
3195
1.38k
    cram_slice *s_curr = NULL;
3196
3197
    // Populate the first container if unknown.
3198
1.38k
    if (!(c_curr = fd->ctr)) {
3199
783
        if (!(c_curr = cram_first_slice(fd)))
3200
735
            return NULL;
3201
783
    }
3202
3203
    // Discard previous slice
3204
645
    if ((s_curr = c_curr->slice)) {
3205
597
        c_curr->slice = NULL;
3206
597
        cram_free_slice(s_curr);
3207
597
        s_curr = NULL;
3208
597
    }
3209
3210
    // If we've consumed all slices in this container, also discard
3211
    // the container too.
3212
645
    if (c_curr->curr_slice == c_curr->max_slice) {
3213
54
        if (fd->ctr == c_curr)
3214
54
            fd->ctr = NULL;
3215
54
        if (fd->ctr_mt == c_curr)
3216
51
            fd->ctr_mt = NULL;
3217
54
        cram_free_container(c_curr);
3218
54
        c_curr = NULL;
3219
54
    }
3220
3221
645
    if (!fd->ctr_mt)
3222
99
        fd->ctr_mt = c_curr;
3223
3224
    // Fetch the next slice (and the container if necessary).
3225
    //
3226
    // If single threaded this loop bails out as soon as it finds
3227
    // a slice in range.  In this case c_next and c_curr end up being
3228
    // the same thing.
3229
    //
3230
    // If multi-threaded, we loop until we have filled out
3231
    // thread pool input queue.  Here c_next and c_curr *may* differ, as
3232
    // can fd->ctr and fd->ctr_mt.
3233
645
    for (;;) {
3234
645
        cram_container *c_next = fd->ctr_mt;
3235
645
        cram_slice *s_next = NULL;
3236
3237
        // Next slice; either from the last job we failed to push
3238
        // to the input queue or via more I/O.
3239
645
        if (fd->job_pending) {
3240
0
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3241
0
            c_next = j->c;
3242
0
            s_next = j->s;
3243
0
            free(fd->job_pending);
3244
0
            fd->job_pending = NULL;
3245
645
        } else if (!fd->ooc) {
3246
648
        empty_container:
3247
648
            if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3248
                // new container
3249
7.11k
                for(;;) {
3250
7.11k
                    if (!(c_next = cram_read_container(fd))) {
3251
9
                        if (fd->pool) {
3252
0
                            fd->ooc = 1;
3253
0
                            break;
3254
0
                        }
3255
3256
9
                        return NULL;
3257
9
                    }
3258
7.10k
                    c_next->curr_slice_mt = c_next->curr_slice;
3259
3260
7.10k
                    if (c_next->length != 0)
3261
48
                        break;
3262
3263
7.05k
                    cram_free_container(c_next);
3264
7.05k
                }
3265
48
                if (fd->ooc)
3266
0
                    break;
3267
3268
//                printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd,
3269
//                       c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1,
3270
//                       fd->range.refid, fd->range.start, fd->range.end);
3271
3272
                /* Skip containers not yet spanning our range */
3273
48
                if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3274
                    // ref_id beyond end of range; bail out
3275
0
                    if (c_next->ref_seq_id != fd->range.refid) {
3276
0
                        cram_free_container(c_next);
3277
0
                        fd->ctr_mt = NULL;
3278
0
                        fd->ooc = 1;
3279
0
                        break;
3280
0
                    }
3281
3282
                    // position beyond end of range; bail out
3283
0
                    if (fd->range.refid != -1 &&
3284
0
                        c_next->ref_seq_start > fd->range.end) {
3285
0
                        cram_free_container(c_next);
3286
0
                        fd->ctr_mt = NULL;
3287
0
                        fd->ooc = 1;
3288
0
                        break;
3289
0
                    }
3290
3291
                    // Before start of range; skip to next container.
3292
                    // Can't use cram_seek() here as it drops in-progress
3293
                    // multi-threaded decode jobs, so call hseek() directly.
3294
0
                    if (fd->range.refid != -1 &&
3295
0
                        c_next->ref_seq_start + c_next->ref_seq_span-1 <
3296
0
                        fd->range.start) {
3297
0
                        off_t skip_length = c_next->length;
3298
0
                        cram_free_container(c_next);
3299
0
                        c_next = NULL;
3300
0
                        fd->ooc = 0;
3301
0
                        if (hseek(fd->fp, skip_length, SEEK_CUR) < 0)
3302
0
                            return NULL;
3303
0
                        continue;
3304
0
                    }
3305
0
                }
3306
3307
                // Container is valid range, so remember it for restarting
3308
                // this function.
3309
48
                fd->ctr_mt = c_next;
3310
3311
48
                if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3312
6
                    return NULL;
3313
42
                if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3314
3
                    return NULL;
3315
3316
39
                c_next->comp_hdr =
3317
39
                    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3318
39
                if (!c_next->comp_hdr)
3319
0
                    return NULL;
3320
3321
39
                if (!c_next->comp_hdr->AP_delta &&
3322
0
                    sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3323
0
                    pthread_mutex_lock(&fd->ref_lock);
3324
0
                    fd->unsorted = 1;
3325
0
                    pthread_mutex_unlock(&fd->ref_lock);
3326
0
                }
3327
39
            }
3328
3329
630
            if (c_next->num_records == 0) {
3330
3
                if (fd->ctr == c_next)
3331
3
                    fd->ctr = NULL;
3332
3
                if (c_curr == c_next)
3333
3
                    c_curr = NULL;
3334
3
                if (fd->ctr_mt == c_next)
3335
3
                    fd->ctr_mt = NULL;
3336
3
                cram_free_container(c_next);
3337
3
                c_next = NULL;
3338
3
                goto empty_container;
3339
3
            }
3340
3341
627
            if (!(s_next = c_next->slice = cram_read_slice(fd)))
3342
24
                return NULL;
3343
3344
603
            s_next->slice_num = ++c_next->curr_slice_mt;
3345
603
            s_next->curr_rec = 0;
3346
603
            s_next->max_rec = s_next->hdr->num_records;
3347
3348
603
            s_next->last_apos = s_next->hdr->ref_seq_start;
3349
3350
            // We know the container overlaps our range, but with multi-slice
3351
            // containers we may have slices that do not.  Skip these also.
3352
603
            if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3353
                // ref_id beyond end of range; bail out
3354
0
                if (s_next->hdr->ref_seq_id != fd->range.refid) {
3355
0
                    fd->ooc = 1;
3356
0
                    cram_free_slice(s_next);
3357
0
                    c_next->slice = s_next = NULL;
3358
0
                    break;
3359
0
                }
3360
3361
                // position beyond end of range; bail out
3362
0
                if (fd->range.refid != -1 &&
3363
0
                    s_next->hdr->ref_seq_start > fd->range.end) {
3364
0
                    fd->ooc = 1;
3365
0
                    cram_free_slice(s_next);
3366
0
                    c_next->slice = s_next = NULL;
3367
0
                    break;
3368
0
                }
3369
3370
                // before start of range; skip to next slice
3371
0
                if (fd->range.refid != -1 &&
3372
0
                    s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3373
0
                    fd->range.start) {
3374
0
                    cram_free_slice(s_next);
3375
0
                    c_next->slice = s_next = NULL;
3376
0
                    continue;
3377
0
                }
3378
0
            }
3379
603
        } // end: if (!fd->ooc)
3380
3381
603
        if (!c_next || !s_next)
3382
0
            break;
3383
3384
        // Decode the slice, either right now (non-threaded) or by pushing
3385
        // it to the a decode queue (threaded).
3386
603
        if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3387
6
            hts_log_error("Failure to decode slice");
3388
6
            cram_free_slice(s_next);
3389
6
            c_next->slice = NULL;
3390
6
            return NULL;
3391
6
        }
3392
3393
        // No thread pool, so don't loop again
3394
597
        if (!fd->pool) {
3395
597
            c_curr = c_next;
3396
597
            s_curr = s_next;
3397
597
            break;
3398
597
        }
3399
3400
        // With thread pool, but we have a job pending so our decode queue
3401
        // is full.
3402
0
        if (fd->job_pending)
3403
0
            break;
3404
3405
        // Otherwise we're threaded with room in the decode input queue, so
3406
        // keep reading slices for decode.
3407
        // Push it a bit far, to qsize in queue rather than pending arrival,
3408
        // as cram tends to be a bit bursty in decode timings.
3409
0
        if (hts_tpool_process_len(fd->rqueue) >
3410
0
            hts_tpool_process_qsize(fd->rqueue))
3411
0
            break;
3412
0
    } // end of for(;;)
3413
3414
3415
    // When not threaded we've already have c_curr and s_curr.
3416
    // Otherwise we need get them by pulling off the decode output queue.
3417
597
    if (fd->pool) {
3418
0
        hts_tpool_result *res;
3419
0
        cram_decode_job *j;
3420
3421
0
        if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
3422
0
            fd->eof = 1;
3423
0
            return NULL;
3424
0
        }
3425
3426
0
        res = hts_tpool_next_result_wait(fd->rqueue);
3427
3428
0
        if (!res || !hts_tpool_result_data(res)) {
3429
0
            hts_log_error("Call to hts_tpool_next_result failed");
3430
0
            return NULL;
3431
0
        }
3432
3433
0
        j = (cram_decode_job *)hts_tpool_result_data(res);
3434
0
        c_curr = j->c;
3435
0
        s_curr = j->s;
3436
3437
0
        if (j->exit_code != 0) {
3438
0
            hts_log_error("Slice decode failure");
3439
0
            fd->eof = 0;
3440
0
            hts_tpool_delete_result(res, 1);
3441
0
            return NULL;
3442
0
        }
3443
3444
0
        hts_tpool_delete_result(res, 1);
3445
0
    }
3446
3447
597
    *cp = c_curr;
3448
3449
    // Update current slice being processed (as opposed to current
3450
    // slice in the multi-threaded reahead.
3451
597
    fd->ctr = c_curr;
3452
597
    if (c_curr) {
3453
597
        c_curr->slice = s_curr;
3454
597
        if (s_curr)
3455
597
            c_curr->curr_slice = s_curr->slice_num;
3456
597
    }
3457
597
    if (s_curr)
3458
597
        s_curr->curr_rec = 0;
3459
0
    else
3460
0
        fd->eof = 1;
3461
3462
597
    return s_curr;
3463
597
}
3464
3465
/*
3466
 * Read the next cram record and return it.
3467
 * Note that to decode cram_record the caller will need to look up some data
3468
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3469
 * the next call to cram_get_seq (which may invalidate it).
3470
 *
3471
 * Returns record pointer on success (do not free)
3472
 *        NULL on failure
3473
 */
3474
783
cram_record *cram_get_seq(cram_fd *fd) {
3475
783
    cram_container *c;
3476
783
    cram_slice *s;
3477
3478
1.38k
    for (;;) {
3479
1.38k
        c = fd->ctr;
3480
1.38k
        if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3481
0
            s = c->slice;
3482
1.38k
        } else {
3483
1.38k
            if (!(s = cram_next_slice(fd, &c)))
3484
783
                return NULL;
3485
597
            continue; /* In case slice contains no records */
3486
1.38k
        }
3487
3488
        // No need to lock here as get_seq is running in the main thread,
3489
        // which is also the same one that does the range modifications.
3490
0
        if (fd->range.refid != -2) {
3491
0
            if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3492
                // Special case when looking for unmapped blocks at end.
3493
                // If these are mixed in with mapped data (c->ref_id == -2)
3494
                // then we need skip until we find the unmapped data, if at all
3495
0
                s->curr_rec++;
3496
0
                continue;
3497
0
            }
3498
0
            if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3499
0
                s->crecs[s->curr_rec].ref_id != -1) {
3500
                // Looking for a mapped read, but not there yet.  Special case
3501
                // as -1 (unmapped) shouldn't be considered < refid.
3502
0
                s->curr_rec++;
3503
0
                continue;
3504
0
            }
3505
3506
0
            if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3507
0
                fd->eof = 1;
3508
0
                cram_free_slice(s);
3509
0
                c->slice = NULL;
3510
0
                return NULL;
3511
0
            }
3512
3513
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3514
0
                fd->eof = 1;
3515
0
                cram_free_slice(s);
3516
0
                c->slice = NULL;
3517
0
                return NULL;
3518
0
            }
3519
3520
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3521
0
                s->curr_rec++;
3522
0
                continue;
3523
0
            }
3524
0
        }
3525
3526
0
        break;
3527
0
    }
3528
3529
0
    fd->ctr = c;
3530
0
    c->slice = s;
3531
0
    return &s->crecs[s->curr_rec++];
3532
783
}
3533
3534
/*
3535
 * Read the next cram record and convert it to a bam_seq_t struct.
3536
 *
3537
 * Returns >= 0 success (number of bytes written to *bam)
3538
 *        -1 on EOF or failure (check fd->err)
3539
 */
3540
783
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3541
783
    cram_record *cr;
3542
783
    cram_container *c;
3543
783
    cram_slice *s;
3544
3545
783
    if (!(cr = cram_get_seq(fd)))
3546
783
        return -1;
3547
3548
0
    c = fd->ctr;
3549
0
    s = c->slice;
3550
3551
0
    return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
3552
783
}
3553
3554
/*
3555
 * Drains and frees the decode read-queue for a multi-threaded reader.
3556
 */
3557
1.08k
void cram_drain_rqueue(cram_fd *fd) {
3558
1.08k
    cram_container *lc = NULL;
3559
3560
1.08k
    if (!fd->pool || !fd->rqueue)
3561
1.08k
        return;
3562
3563
    // drain queue of any in-flight decode jobs
3564
0
    while (!hts_tpool_process_empty(fd->rqueue)) {
3565
0
        hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
3566
0
        if (!r)
3567
0
            break;
3568
0
        cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
3569
0
        if (j->c->slice == j->s)
3570
0
            j->c->slice = NULL;
3571
0
        if (j->c != lc) {
3572
0
            if (lc) {
3573
0
                if (fd->ctr == lc)
3574
0
                    fd->ctr = NULL;
3575
0
                if (fd->ctr_mt == lc)
3576
0
                    fd->ctr_mt = NULL;
3577
0
                cram_free_container(lc);
3578
0
            }
3579
0
            lc = j->c;
3580
0
        }
3581
0
        cram_free_slice(j->s);
3582
0
        hts_tpool_delete_result(r, 1);
3583
0
    }
3584
3585
    // Also tidy up any pending decode job that we didn't submit to the workers
3586
    // due to the input queue being full.
3587
0
    if (fd->job_pending) {
3588
0
        cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3589
0
        if (j->c->slice == j->s)
3590
0
            j->c->slice = NULL;
3591
0
        if (j->c != lc) {
3592
0
            if (lc) {
3593
0
                if (fd->ctr == lc)
3594
0
                    fd->ctr = NULL;
3595
0
                if (fd->ctr_mt == lc)
3596
0
                    fd->ctr_mt = NULL;
3597
0
                cram_free_container(lc);
3598
0
            }
3599
0
            lc = j->c;
3600
0
        }
3601
0
        cram_free_slice(j->s);
3602
0
        free(j);
3603
0
        fd->job_pending = NULL;
3604
0
    }
3605
3606
0
    if (lc) {
3607
0
        if (fd->ctr == lc)
3608
0
            fd->ctr = NULL;
3609
0
        if (fd->ctr_mt == lc)
3610
0
            fd->ctr_mt = NULL;
3611
0
        cram_free_container(lc);
3612
0
    }
3613
0
}