Coverage Report

Created: 2026-02-14 06:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_decode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * - In-memory decoding of CRAM data structures.
33
 * - Iterator for reading CRAM record by record.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdio.h>
40
#include <errno.h>
41
#include <assert.h>
42
#include <stdlib.h>
43
#include <string.h>
44
#include <zlib.h>
45
#include <sys/types.h>
46
#include <sys/stat.h>
47
#include <math.h>
48
#include <stdint.h>
49
#include <inttypes.h>
50
51
#include "cram.h"
52
#include "os.h"
53
#include "../htslib/hts.h"
54
#include "../htslib/hfile.h"
55
56
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
57
//#define USE_X
58
59
/* ----------------------------------------------------------------------
60
 * CRAM compression headers
61
 */
62
63
/*
64
 * Decodes the Tag Dictionary record in the preservation map
65
 * Updates the cram compression header.
66
 *
67
 * Returns number of bytes decoded on success
68
 *        -1 on failure
69
 */
70
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
71
370
                   cram_block_compression_hdr *h) {
72
370
    char *op = cp;
73
370
    unsigned char *dat;
74
370
    cram_block *b;
75
370
    int32_t blk_size = 0;
76
370
    int nTL, i, sz, err = 0;
77
78
370
    if (!(b = cram_new_block(0, 0)))
79
0
        return -1;
80
81
370
    if (h->TD_blk || h->TL) {
82
179
        hts_log_warning("More than one TD block found in compression header");
83
179
        cram_free_block(h->TD_blk);
84
179
        free(h->TL);
85
179
        h->TD_blk = NULL;
86
179
        h->TL = NULL;
87
179
    }
88
89
    /* Decode */
90
370
    blk_size = fd->vv.varint_get32(&cp, endp, &err);
91
370
    if (!blk_size) {
92
122
        h->nTL = 0;
93
122
        cram_free_block(b);
94
122
        return cp - op;
95
122
    }
96
97
248
    if (err || blk_size < 0 || endp - cp < blk_size) {
98
9
        cram_free_block(b);
99
9
        return -1;
100
9
    }
101
102
239
    BLOCK_APPEND(b, cp, blk_size);
103
239
    cp += blk_size;
104
239
    sz = cp - op;
105
    // Force nul termination if missing
106
239
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
107
186
        BLOCK_APPEND_CHAR(b, '\0');
108
109
    /* Set up TL lookup table */
110
239
    dat = BLOCK_DATA(b);
111
112
    // Count
113
3.28k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
114
3.04k
        nTL++;
115
15.4k
        while (dat[i])
116
12.3k
            i++;
117
3.04k
    }
118
119
    // Copy
120
239
    if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
121
0
        cram_free_block(b);
122
0
        return -1;
123
0
    }
124
3.28k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
125
3.04k
        h->TL[nTL++] = &dat[i];
126
15.4k
        while (dat[i])
127
12.3k
            i++;
128
3.04k
    }
129
239
    h->TD_blk = b;
130
239
    h->nTL = nTL;
131
132
239
    return sz;
133
134
0
 block_err:
135
0
    cram_free_block(b);
136
0
    return -1;
137
239
}
138
139
/*
140
 * Decodes a CRAM block compression header.
141
 * Returns header ptr on success
142
 *         NULL on failure
143
 */
144
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
145
3.31k
                                                           cram_block *b) {
146
3.31k
    char *cp, *endp, *cp_copy;
147
3.31k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
148
3.31k
    int i, err = 0;
149
3.31k
    int32_t map_size = 0, map_count = 0;
150
151
3.31k
    if (!hdr)
152
0
        return NULL;
153
154
3.31k
    if (b->method != RAW) {
155
2.19k
        if (cram_uncompress_block(b)) {
156
1.80k
            free(hdr);
157
1.80k
            return NULL;
158
1.80k
        }
159
2.19k
    }
160
161
1.51k
    cp = (char *)b->data;
162
1.51k
    endp = cp + b->uncomp_size;
163
164
1.51k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
165
1.44k
        hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
166
1.44k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
167
0
            hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
168
0
            hdr->ref_seq_span  = fd->vv.varint_get64(&cp, endp, &err);
169
1.44k
        } else {
170
1.44k
            hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
171
1.44k
            hdr->ref_seq_span  = fd->vv.varint_get32(&cp, endp, &err);
172
1.44k
        }
173
1.44k
        hdr->num_records   = fd->vv.varint_get32(&cp, endp, &err);
174
1.44k
        hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
175
1.44k
        if (hdr->num_landmarks < 0 ||
176
1.44k
            hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
177
1.44k
            endp - cp < hdr->num_landmarks) {
178
55
            free(hdr);
179
55
            return NULL;
180
55
        }
181
1.38k
        if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
182
0
            free(hdr);
183
0
            return NULL;
184
0
        }
185
37.7k
        for (i = 0; i < hdr->num_landmarks; i++)
186
36.3k
            hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
187
1.38k
    }
188
189
1.45k
    hdr->preservation_map = kh_init(map);
190
191
1.45k
    memset(hdr->rec_encoding_map, 0,
192
1.45k
           CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
193
1.45k
    memset(hdr->tag_encoding_map, 0,
194
1.45k
           CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
195
196
1.45k
    if (!hdr->preservation_map) {
197
0
        cram_free_compression_header(hdr);
198
0
        return NULL;
199
0
    }
200
201
    /* Initialise defaults for preservation map */
202
1.45k
    hdr->read_names_included = 0;
203
1.45k
    hdr->AP_delta = 1;
204
1.45k
    hdr->qs_seq_orient = 1;
205
1.45k
    memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
206
207
    /* Preservation map */
208
1.45k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
209
1.45k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
210
47.1k
    for (i = 0; i < map_count; i++) {
211
45.8k
        pmap_t hd;
212
45.8k
        khint_t k;
213
45.8k
        int r;
214
215
45.8k
        if (endp - cp < 3) {
216
125
            cram_free_compression_header(hdr);
217
125
            return NULL;
218
125
        }
219
45.7k
        cp += 2;
220
45.7k
        switch(CRAM_KEY(cp[-2],cp[-1])) {
221
30
        case CRAM_KEY('M','I'): // was mapped QS included in V1.0
222
114
        case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
223
144
        case CRAM_KEY('P','I'): // was unmapped placed in V1.0
224
144
            hd.i = *cp++;
225
144
            break;
226
227
89
        case CRAM_KEY('R','N'):
228
89
            hd.i = *cp++;
229
89
            k = kh_put(map, hdr->preservation_map, "RN", &r);
230
89
            if (-1 == r) {
231
0
                cram_free_compression_header(hdr);
232
0
                return NULL;
233
0
            }
234
235
89
            kh_val(hdr->preservation_map, k) = hd;
236
89
            hdr->read_names_included = hd.i;
237
89
            break;
238
239
119
        case CRAM_KEY('A','P'):
240
119
            hd.i = *cp++;
241
119
            k = kh_put(map, hdr->preservation_map, "AP", &r);
242
119
            if (-1 == r) {
243
0
                cram_free_compression_header(hdr);
244
0
                return NULL;
245
0
            }
246
247
119
            kh_val(hdr->preservation_map, k) = hd;
248
119
            hdr->AP_delta = hd.i;
249
119
            break;
250
251
1.17k
        case CRAM_KEY('R','R'):
252
1.17k
            hd.i = *cp++;
253
1.17k
            k = kh_put(map, hdr->preservation_map, "RR", &r);
254
1.17k
            if (-1 == r) {
255
0
                cram_free_compression_header(hdr);
256
0
                return NULL;
257
0
            }
258
259
1.17k
            kh_val(hdr->preservation_map, k) = hd;
260
1.17k
            hdr->no_ref = !hd.i;
261
1.17k
            break;
262
263
71
        case CRAM_KEY('Q','O'):
264
71
            hd.i = *cp++;
265
71
            k = kh_put(map, hdr->preservation_map, "QO", &r);
266
71
            if (-1 == r) {
267
0
                cram_free_compression_header(hdr);
268
0
                return NULL;
269
0
            }
270
271
71
            kh_val(hdr->preservation_map, k) = hd;
272
71
            hdr->qs_seq_orient = hd.i;
273
71
            break;
274
275
696
        case CRAM_KEY('S','M'):
276
696
            if (endp - cp < 5) {
277
3
                cram_free_compression_header(hdr);
278
3
                return NULL;
279
3
            }
280
693
            hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
281
693
            hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
282
693
            hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
283
693
            hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
284
285
693
            hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
286
693
            hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
287
693
            hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
288
693
            hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
289
290
693
            hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
291
693
            hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
292
693
            hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
293
693
            hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
294
295
693
            hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
296
693
            hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
297
693
            hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
298
693
            hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
299
300
693
            hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
301
693
            hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
302
693
            hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
303
693
            hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
304
305
693
            hd.p = cp;
306
693
            cp += 5;
307
308
693
            k = kh_put(map, hdr->preservation_map, "SM", &r);
309
693
            if (-1 == r) {
310
0
                cram_free_compression_header(hdr);
311
0
                return NULL;
312
0
            }
313
693
            kh_val(hdr->preservation_map, k) = hd;
314
693
            break;
315
316
370
        case CRAM_KEY('T','D'): {
317
370
            int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
318
370
            if (sz < 0) {
319
9
                cram_free_compression_header(hdr);
320
9
                return NULL;
321
9
            }
322
323
361
            hd.p = cp;
324
361
            cp += sz;
325
326
361
            k = kh_put(map, hdr->preservation_map, "TD", &r);
327
361
            if (-1 == r) {
328
0
                cram_free_compression_header(hdr);
329
0
                return NULL;
330
0
            }
331
361
            kh_val(hdr->preservation_map, k) = hd;
332
361
            break;
333
361
        }
334
335
43.0k
        default:
336
43.0k
            hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
337
            // guess byte;
338
43.0k
            cp++;
339
43.0k
            break;
340
45.7k
        }
341
45.7k
    }
342
1.32k
    if (cp - cp_copy != map_size) {
343
52
        cram_free_compression_header(hdr);
344
52
        return NULL;
345
52
    }
346
347
    /* Record encoding map */
348
1.27k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
349
1.27k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
350
1.27k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
351
40.5k
    for (i = 0; i < map_count; i++) {
352
39.9k
        char *key = cp;
353
39.9k
        int32_t encoding = E_NULL;
354
39.9k
        int32_t size = 0;
355
39.9k
        ptrdiff_t offset;
356
39.9k
        cram_map *m;
357
39.9k
        enum cram_DS_ID ds_id;
358
39.9k
        enum cram_external_type type;
359
360
39.9k
        if (endp - cp < 4) {
361
32
            cram_free_compression_header(hdr);
362
32
            return NULL;
363
32
        }
364
365
39.9k
        cp += 2;
366
39.9k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
367
39.9k
        size     = fd->vv.varint_get32(&cp, endp, &err);
368
369
39.9k
        offset = cp - (char *)b->data;
370
371
39.9k
        if (encoding == E_NULL)
372
11.9k
            continue;
373
374
27.9k
        if (size < 0 || endp - cp < size) {
375
220
            cram_free_compression_header(hdr);
376
220
            return NULL;
377
220
        }
378
379
        //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
380
381
        /*
382
         * For CRAM1.0 CF and BF are Byte and not Int.
383
         * Practically speaking it makes no difference unless we have a
384
         * 1.0 format file that stores these in EXTERNAL as only then
385
         * does Byte vs Int matter.
386
         *
387
         * Neither this C code nor Java reference implementations did this,
388
         * so we gloss over it and treat them as int.
389
         */
390
27.6k
        ds_id = DS_CORE;
391
27.6k
        if (key[0] == 'B' && key[1] == 'F') {
392
90
            ds_id = DS_BF; type = E_INT;
393
27.6k
        } else if (key[0] == 'C' && key[1] == 'F') {
394
261
            ds_id = DS_CF; type = E_INT;
395
27.3k
        } else if (key[0] == 'R' && key[1] == 'I') {
396
123
            ds_id = DS_RI; type = E_INT;
397
27.2k
        } else if (key[0] == 'R' && key[1] == 'L') {
398
26
            ds_id = DS_RL; type = E_INT;
399
27.1k
        } else if (key[0] == 'A' && key[1] == 'P') {
400
329
            ds_id = DS_AP;
401
329
            type = is_v4 ? E_SLONG : E_INT;
402
26.8k
        } else if (key[0] == 'R' && key[1] == 'G') {
403
95
            ds_id = DS_RG;
404
95
            type = E_INT;
405
26.7k
        } else if (key[0] == 'M' && key[1] == 'F') {
406
65
            ds_id = DS_MF; type = E_INT;
407
26.7k
        } else if (key[0] == 'N' && key[1] == 'S') {
408
142
            ds_id = DS_NS; type = E_INT;
409
26.5k
        } else if (key[0] == 'N' && key[1] == 'P') {
410
579
            ds_id = DS_NP;
411
579
            type = is_v4 ? E_LONG : E_INT;
412
25.9k
        } else if (key[0] == 'T' && key[1] == 'S') {
413
86
            ds_id = DS_TS;
414
86
            type = is_v4 ? E_SLONG : E_INT;
415
25.9k
        } else if (key[0] == 'N' && key[1] == 'F') {
416
209
            ds_id = DS_NF; type = E_INT;
417
25.6k
        } else if (key[0] == 'T' && key[1] == 'C') {
418
110
            ds_id = DS_TC; type = E_BYTE;
419
25.5k
        } else if (key[0] == 'T' && key[1] == 'N') {
420
5
            ds_id = DS_TN; type = E_INT;
421
25.5k
        } else if (key[0] == 'F' && key[1] == 'N') {
422
3
            ds_id = DS_FN; type = E_INT;
423
25.5k
        } else if (key[0] == 'F' && key[1] == 'C') {
424
375
            ds_id = DS_FC; type = E_BYTE;
425
25.2k
        } else if (key[0] == 'F' && key[1] == 'P') {
426
20
            ds_id = DS_FP; type = E_INT;
427
25.1k
        } else if (key[0] == 'B' && key[1] == 'S') {
428
8
            ds_id = DS_BS; type = E_BYTE;
429
25.1k
        } else if (key[0] == 'I' && key[1] == 'N') {
430
12
            ds_id = DS_IN; type = E_BYTE_ARRAY;
431
25.1k
        } else if (key[0] == 'S' && key[1] == 'C') {
432
6
            ds_id = DS_SC; type = E_BYTE_ARRAY;
433
25.1k
        } else if (key[0] == 'D' && key[1] == 'L') {
434
89
            ds_id = DS_DL; type = E_INT;
435
25.0k
        } else if (key[0] == 'B' && key[1] == 'A') {
436
207
            ds_id = DS_BA; type = E_BYTE;
437
24.8k
        } else if (key[0] == 'B' && key[1] == 'B') {
438
54
            ds_id = DS_BB; type = E_BYTE_ARRAY;
439
24.8k
        } else if (key[0] == 'R' && key[1] == 'S') {
440
204
            ds_id = DS_RS; type = E_INT;
441
24.6k
        } else if (key[0] == 'P' && key[1] == 'D') {
442
3
            ds_id = DS_PD; type = E_INT;
443
24.5k
        } else if (key[0] == 'H' && key[1] == 'C') {
444
0
            ds_id = DS_HC; type = E_INT;
445
24.5k
        } else if (key[0] == 'M' && key[1] == 'Q') {
446
1.43k
            ds_id = DS_MQ; type = E_INT;
447
23.1k
        } else if (key[0] == 'R' && key[1] == 'N') {
448
21
            ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
449
23.1k
        } else if (key[0] == 'Q' && key[1] == 'S') {
450
176
            ds_id = DS_QS; type = E_BYTE;
451
22.9k
        } else if (key[0] == 'Q' && key[1] == 'Q') {
452
2
            ds_id = DS_QQ; type = E_BYTE_ARRAY;
453
22.9k
        } else if (key[0] == 'T' && key[1] == 'L') {
454
14
            ds_id = DS_TL; type = E_INT;
455
22.9k
        } else if (key[0] == 'T' && key[1] == 'M') {
456
22.9k
        } else if (key[0] == 'T' && key[1] == 'V') {
457
22.9k
        } else {
458
22.9k
            hts_log_warning("Unrecognised key: %.2s", key);
459
22.9k
        }
460
461
27.6k
        if (ds_id != DS_CORE) {
462
4.75k
            if (hdr->codecs[ds_id] != NULL) {
463
3.78k
                hts_log_warning("Codec for key %.2s defined more than once",
464
3.78k
                                key);
465
3.78k
                hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
466
3.78k
            }
467
4.75k
            hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
468
4.75k
                                                   type, fd->version, &fd->vv);
469
4.75k
            if (!hdr->codecs[ds_id]) {
470
370
                cram_free_compression_header(hdr);
471
370
                return NULL;
472
370
            }
473
4.75k
        }
474
475
27.3k
        cp += size;
476
477
        // Fill out cram_map purely for cram_dump to dump out.
478
27.3k
        m = malloc(sizeof(*m));
479
27.3k
        if (!m) {
480
0
            cram_free_compression_header(hdr);
481
0
            return NULL;
482
0
        }
483
27.3k
        m->key = CRAM_KEY(key[0], key[1]);
484
27.3k
        m->encoding = encoding;
485
27.3k
        m->size     = size;
486
27.3k
        m->offset   = offset;
487
27.3k
        m->codec = NULL;
488
489
27.3k
        m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
490
27.3k
        hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
491
27.3k
    }
492
648
    if (cp - cp_copy != map_size) {
493
82
        cram_free_compression_header(hdr);
494
82
        return NULL;
495
82
    }
496
497
    /* Tag encoding map */
498
566
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
499
566
    map_count = fd->vv.varint_get32(&cp, endp, &err);
500
1.36k
    for (i = 0; i < map_count; i++) {
501
929
        int32_t encoding = E_NULL;
502
929
        int32_t size = 0;
503
929
        cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
504
929
        uint8_t key[3];
505
506
929
        if (!m || endp - cp < 6) {
507
6
            free(m);
508
6
            cram_free_compression_header(hdr);
509
6
            return NULL;
510
6
        }
511
512
923
        m->key = fd->vv.varint_get32(&cp, endp, &err);
513
923
        key[0] = m->key>>16;
514
923
        key[1] = m->key>>8;
515
923
        key[2] = m->key;
516
923
        encoding = fd->vv.varint_get32(&cp, endp, &err);
517
923
        size     = fd->vv.varint_get32(&cp, endp, &err);
518
519
923
        m->encoding = encoding;
520
923
        m->size     = size;
521
923
        m->offset   = cp - (char *)b->data;
522
923
        if (size < 0 || endp - cp < size ||
523
903
            !(m->codec = cram_decoder_init(hdr, encoding, cp, size,
524
903
                                           E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
525
125
            cram_free_compression_header(hdr);
526
125
            free(m);
527
125
            return NULL;
528
125
        }
529
530
798
        cp += size;
531
532
798
        m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
533
798
        hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
534
798
    }
535
435
    if (err || cp - cp_copy != map_size) {
536
35
        cram_free_compression_header(hdr);
537
35
        return NULL;
538
35
    }
539
540
400
    return hdr;
541
435
}
542
543
/*
544
 * Note we also need to scan through the record encoding map to
545
 * see which data series share the same block, either external or
546
 * CORE. For example if we need the BF data series but MQ and CF
547
 * are also encoded in the same block then we need to add those in
548
 * as a dependency in order to correctly decode BF.
549
 *
550
 * Returns 0 on success
551
 *        -1 on failure
552
 */
553
int cram_dependent_data_series(cram_fd *fd,
554
                               cram_block_compression_hdr *hdr,
555
587
                               cram_slice *s) {
556
587
    int *block_used;
557
587
    int core_used = 0;
558
587
    int i;
559
587
    static int i_to_id[] = {
560
587
        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
561
587
        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
562
587
        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
563
587
        DS_HC, DS_SC, DS_BB, DS_QQ,
564
587
    };
565
587
    uint32_t orig_ds;
566
567
    /*
568
     * Set the data_series bit field based on fd->required_fields
569
     * contents.
570
     */
571
587
    if (fd->required_fields && fd->required_fields != INT_MAX) {
572
0
        s->data_series = 0;
573
574
0
        if (fd->required_fields & SAM_QNAME)
575
0
            s->data_series |= CRAM_RN;
576
577
0
        if (fd->required_fields & SAM_FLAG)
578
0
            s->data_series |= CRAM_BF;
579
580
0
        if (fd->required_fields & SAM_RNAME)
581
0
            s->data_series |= CRAM_RI | CRAM_BF;
582
583
0
        if (fd->required_fields & SAM_POS)
584
0
            s->data_series |= CRAM_AP | CRAM_BF;
585
586
0
        if (fd->required_fields & SAM_MAPQ)
587
0
            s->data_series |= CRAM_MQ;
588
589
0
        if (fd->required_fields & SAM_CIGAR)
590
0
            s->data_series |= CRAM_CIGAR;
591
592
0
        if (fd->required_fields & SAM_RNEXT)
593
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
594
595
0
        if (fd->required_fields & SAM_PNEXT)
596
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
597
598
0
        if (fd->required_fields & SAM_TLEN)
599
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
600
0
                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
601
602
0
        if (fd->required_fields & SAM_SEQ)
603
0
            s->data_series |= CRAM_SEQ;
604
605
0
        if (!(fd->required_fields & SAM_AUX))
606
            // No easy way to get MD/NM without other tags at present
607
0
            s->decode_md = 0;
608
609
0
        if (fd->required_fields & SAM_QUAL)
610
0
            s->data_series |= CRAM_QUAL;
611
612
0
        if (fd->required_fields & SAM_AUX)
613
0
            s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
614
615
0
        if (fd->required_fields & SAM_RGAUX)
616
0
            s->data_series |= CRAM_RG | CRAM_BF;
617
618
        // Always uncompress CORE block
619
0
        if (cram_uncompress_block(s->block[0]))
620
0
            return -1;
621
587
    } else {
622
587
        s->data_series = CRAM_ALL;
623
624
2.80k
        for (i = 0; i < s->hdr->num_blocks; i++) {
625
2.22k
            if (cram_uncompress_block(s->block[i]))
626
6
                return -1;
627
2.22k
        }
628
629
581
        return 0;
630
587
    }
631
632
0
    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
633
0
    if (!block_used)
634
0
        return -1;
635
636
0
    do {
637
        /*
638
         * Also set data_series based on code prerequisites. Eg if we need
639
         * CRAM_QS then we also need to know CRAM_RL so we know how long it
640
         * is, or if we need FC/FP then we also need FN (number of features).
641
         *
642
         * It's not reciprocal though. We may be needing to decode FN
643
         * but have no need to decode FC, FP and cigar ops.
644
         */
645
0
        if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
646
0
        if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
647
0
        if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
648
0
        if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
649
0
        if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
650
0
        if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
651
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
652
0
        if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
653
0
        if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
654
0
        if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
655
0
        if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
656
657
        // cram_decode_seq() needs seq[] array
658
0
        if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
659
660
0
        if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
661
0
        if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
662
0
        if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
663
0
        if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
664
0
        if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
665
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
666
0
        if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
667
0
            s->data_series |= CRAM_CF;
668
0
        if (!hdr->read_names_included && s->data_series & CRAM_RN)
669
0
            s->data_series |= CRAM_CF | CRAM_NF;
670
0
        if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
671
0
            s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
672
0
        if (s->data_series & CRAM_FN) {
673
            // The CRAM_FN loop checks for reference length boundaries,
674
            // which needs a working seq_pos.  Some fields are fixed size
675
            // irrespective of if we decode (BS), but others need to know
676
            // the size of the string fetched back (SC, IN, BB).
677
0
            s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
678
0
        }
679
680
0
        orig_ds = s->data_series;
681
682
        // Find which blocks are in use.
683
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
684
0
            int bnum1, bnum2, j;
685
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
686
687
0
            if (!(s->data_series & (1<<i)))
688
0
                continue;
689
690
0
            if (!c)
691
0
                continue;
692
693
0
            bnum1 = cram_codec_to_id(c, &bnum2);
694
695
0
            for (;;) {
696
0
                switch (bnum1) {
697
0
                case -2:
698
0
                    break;
699
700
0
                case -1:
701
0
                    core_used = 1;
702
0
                    break;
703
704
0
                default:
705
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
706
0
                        if (s->block[j]->content_type == EXTERNAL &&
707
0
                            s->block[j]->content_id == bnum1) {
708
0
                            block_used[j] = 1;
709
0
                            if (cram_uncompress_block(s->block[j])) {
710
0
                                free(block_used);
711
0
                                return -1;
712
0
                            }
713
0
                        }
714
0
                    }
715
0
                    break;
716
0
                }
717
718
0
                if (bnum2 == -2 || bnum1 == bnum2)
719
0
                    break;
720
721
0
                bnum1 = bnum2; // 2nd pass
722
0
            }
723
0
        }
724
725
        // Tags too
726
0
        if ((fd->required_fields & SAM_AUX) ||
727
0
            (s->data_series & CRAM_aux)) {
728
0
            for (i = 0; i < CRAM_MAP_HASH; i++) {
729
0
                int bnum1, bnum2, j;
730
0
                cram_map *m = hdr->tag_encoding_map[i];
731
732
0
                while (m) {
733
0
                    cram_codec *c = m->codec;
734
0
                    if (!c) {
735
0
                        m = m->next;
736
0
                        continue;
737
0
                    }
738
739
0
                    bnum1 = cram_codec_to_id(c, &bnum2);
740
741
0
                    for (;;) {
742
0
                        switch (bnum1) {
743
0
                        case -2:
744
0
                            break;
745
746
0
                        case -1:
747
0
                            core_used = 1;
748
0
                            break;
749
750
0
                        default:
751
0
                            for (j = 0; j < s->hdr->num_blocks; j++) {
752
0
                                if (s->block[j]->content_type == EXTERNAL &&
753
0
                                    s->block[j]->content_id == bnum1) {
754
0
                                    block_used[j] = 1;
755
0
                                    if (cram_uncompress_block(s->block[j])) {
756
0
                                        free(block_used);
757
0
                                        return -1;
758
0
                                    }
759
0
                                }
760
0
                            }
761
0
                            break;
762
0
                        }
763
764
0
                        if (bnum2 == -2 || bnum1 == bnum2)
765
0
                            break;
766
767
0
                        bnum1 = bnum2; // 2nd pass
768
0
                    }
769
770
0
                    m = m->next;
771
0
                }
772
0
            }
773
0
        }
774
775
        // We now know which blocks are in used, so repeat and find
776
        // which other data series need to be added.
777
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
778
0
            int bnum1, bnum2, j;
779
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
780
781
0
            if (!c)
782
0
                continue;
783
784
0
            bnum1 = cram_codec_to_id(c, &bnum2);
785
786
0
            for (;;) {
787
0
                switch (bnum1) {
788
0
                case -2:
789
0
                    break;
790
791
0
                case -1:
792
0
                    if (core_used) {
793
                        //printf(" + data series %08x:\n", 1<<i);
794
0
                        s->data_series |= 1<<i;
795
0
                    }
796
0
                    break;
797
798
0
                default:
799
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
800
0
                        if (s->block[j]->content_type == EXTERNAL &&
801
0
                            s->block[j]->content_id == bnum1) {
802
0
                            if (block_used[j]) {
803
                                //printf(" + data series %08x:\n", 1<<i);
804
0
                                s->data_series |= 1<<i;
805
0
                            }
806
0
                        }
807
0
                    }
808
0
                    break;
809
0
                }
810
811
0
                if (bnum2 == -2 || bnum1 == bnum2)
812
0
                    break;
813
814
0
                bnum1 = bnum2; // 2nd pass
815
0
            }
816
0
        }
817
818
        // Tags too
819
0
        for (i = 0; i < CRAM_MAP_HASH; i++) {
820
0
            int bnum1, bnum2, j;
821
0
            cram_map *m = hdr->tag_encoding_map[i];
822
823
0
            while (m) {
824
0
                cram_codec *c = m->codec;
825
0
                if (!c) {
826
0
                    m = m->next;
827
0
                    continue;
828
0
                }
829
830
0
                bnum1 = cram_codec_to_id(c, &bnum2);
831
832
0
                for (;;) {
833
0
                    switch (bnum1) {
834
0
                    case -2:
835
0
                        break;
836
837
0
                    case -1:
838
                        //printf(" + data series %08x:\n", CRAM_aux);
839
0
                        s->data_series |= CRAM_aux;
840
0
                        break;
841
842
0
                    default:
843
0
                        for (j = 0; j < s->hdr->num_blocks; j++) {
844
0
                            if (s->block[j]->content_type == EXTERNAL &&
845
0
                                s->block[j]->content_id == bnum1) {
846
0
                                if (block_used[j]) {
847
                                    //printf(" + data series %08x:\n",
848
                                    //       CRAM_aux);
849
0
                                    s->data_series |= CRAM_aux;
850
0
                                }
851
0
                            }
852
0
                        }
853
0
                        break;
854
0
                    }
855
856
0
                    if (bnum2 == -2 || bnum1 == bnum2)
857
0
                        break;
858
859
0
                    bnum1 = bnum2; // 2nd pass
860
0
                }
861
862
0
                m = m->next;
863
0
            }
864
0
        }
865
0
    } while (orig_ds != s->data_series);
866
867
0
    free(block_used);
868
0
    return 0;
869
0
}
870
871
/*
872
 * Checks whether an external block is used solely by a single data series.
873
 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
874
 *         or 0 if not (E_NULL).
875
 */
876
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
877
0
                          int id) {
878
0
    int i, n_id = 0;
879
0
    enum cram_encoding e_type = 0;
880
881
0
    for (i = 0; i < DS_END; i++) {
882
0
        cram_codec *c;
883
0
        int bnum1, bnum2, old_n_id;
884
885
0
        if (!(c = hdr->codecs[i]))
886
0
            continue;
887
888
0
        bnum1 = cram_codec_to_id(c, &bnum2);
889
890
0
        old_n_id = n_id;
891
0
        if (bnum1 == id) {
892
0
            n_id++;
893
0
            e_type = c->codec;
894
0
        }
895
0
        if (bnum2 == id) {
896
0
            n_id++;
897
0
            e_type = c->codec;
898
0
        }
899
900
0
        if (n_id == old_n_id+2)
901
0
            n_id--; // len/val in same place counts once only.
902
0
    }
903
904
0
    return n_id == 1 ? e_type : 0;
905
0
}
906
907
/*
908
 * Attempts to estimate the size of some blocks so we can preallocate them
909
 * before decoding.  Although decoding will automatically grow the blocks,
910
 * it is typically more efficient to preallocate.
911
 */
912
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
913
                                int *qual_size, int *name_size,
914
0
                                int *q_id) {
915
0
    int bnum1, bnum2;
916
0
    cram_codec *cd;
917
918
0
    *qual_size = 0;
919
0
    *name_size = 0;
920
921
    /* Qual */
922
0
    cd = hdr->codecs[DS_QS];
923
0
    if (cd == NULL) return;
924
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
925
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
926
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
927
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
928
0
        if (b) *qual_size = b->uncomp_size;
929
0
        if (q_id && cd->codec == E_EXTERNAL)
930
0
            *q_id = bnum1;
931
0
    }
932
933
    /* Name */
934
0
    cd = hdr->codecs[DS_RN];
935
0
    if (cd == NULL) return;
936
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
937
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
938
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
939
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
940
0
        if (b) *name_size = b->uncomp_size;
941
0
    }
942
0
}
943
944
945
/* ----------------------------------------------------------------------
946
 * CRAM slices
947
 */
948
949
/*
950
 * Decodes a CRAM (un)mapped slice header block.
951
 * Returns slice header ptr on success
952
 *         NULL on failure
953
 */
954
683
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
955
683
    cram_block_slice_hdr *hdr;
956
683
    unsigned char *cp;
957
683
    unsigned char *cp_end;
958
683
    int i, err = 0;
959
960
683
    if (b->method != RAW) {
961
        /* Spec. says slice header should be RAW, but we can future-proof
962
           by trying to decode it if it isn't. */
963
12
        if (cram_uncompress_block(b) < 0)
964
7
            return NULL;
965
12
    }
966
676
    cp =  (unsigned char *)BLOCK_DATA(b);
967
676
    cp_end = cp + b->uncomp_size;
968
969
676
    if (b->content_type != MAPPED_SLICE &&
970
3
        b->content_type != UNMAPPED_SLICE)
971
0
        return NULL;
972
973
676
    if (!(hdr  = calloc(1, sizeof(*hdr))))
974
0
        return NULL;
975
976
676
    hdr->content_type = b->content_type;
977
978
676
    if (b->content_type == MAPPED_SLICE) {
979
673
        hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
980
673
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
981
0
            hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
982
0
            hdr->ref_seq_span  = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
983
673
        } else {
984
673
            hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
985
673
            hdr->ref_seq_span  = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
986
673
        }
987
673
        if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
988
20
            free(hdr);
989
20
            hts_log_error("Negative values not permitted for header "
990
20
                          "sequence start or span fields");
991
20
            return NULL;
992
20
        }
993
673
    }
994
656
    hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
995
656
    hdr->record_counter = 0;
996
656
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
997
0
        hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
998
656
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
999
0
        hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
1000
0
    }
1001
656
    hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1002
656
    hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1003
656
    if (hdr->num_content_ids < 1 ||
1004
645
        hdr->num_content_ids >= 10000) {
1005
        // Slice must have at least one data block, and there is no need
1006
        // for more than 2 per possible aux-tag plus ancillary.
1007
23
        free(hdr);
1008
23
        return NULL;
1009
23
    }
1010
633
    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1011
633
    if (!hdr->block_content_ids) {
1012
0
        free(hdr);
1013
0
        return NULL;
1014
0
    }
1015
1016
25.3k
    for (i = 0; i < hdr->num_content_ids; i++)
1017
24.6k
        hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
1018
24.6k
                                                         (char *)cp_end,
1019
24.6k
                                                         &err);
1020
633
    if (err) {
1021
8
        free(hdr->block_content_ids);
1022
8
        free(hdr);
1023
8
        return NULL;
1024
8
    }
1025
1026
625
    if (b->content_type == MAPPED_SLICE)
1027
622
        hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
1028
1029
625
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
1030
0
        if (cp_end - cp < 16) {
1031
0
            free(hdr->block_content_ids);
1032
0
            free(hdr);
1033
0
            return NULL;
1034
0
        }
1035
0
        memcpy(hdr->md5, cp, 16);
1036
625
    } else {
1037
625
        memset(hdr->md5, 0, 16);
1038
625
    }
1039
1040
625
    if (!err)
1041
619
        return hdr;
1042
1043
6
    free(hdr->block_content_ids);
1044
6
    free(hdr);
1045
6
    return NULL;
1046
625
}
1047
1048
1049
#if 0
1050
/* Returns the number of bits set in val; it the highest bit used */
1051
static int nbits(int v) {
1052
    static const int MultiplyDeBruijnBitPosition[32] = {
1053
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1054
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1055
    };
1056
1057
    v |= v >> 1; // first up to set all bits 1 after the first 1 */
1058
    v |= v >> 2;
1059
    v |= v >> 4;
1060
    v |= v >> 8;
1061
    v |= v >> 16;
1062
1063
    // DeBruijn magic to find top bit
1064
    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1065
}
1066
#endif
1067
1068
#if 0
1069
static int sort_freqs(const void *vp1, const void *vp2) {
1070
    const int i1 = *(const int *)vp1;
1071
    const int i2 = *(const int *)vp2;
1072
    return i1-i2;
1073
}
1074
#endif
1075
1076
/* ----------------------------------------------------------------------
1077
 * Primary CRAM sequence decoder
1078
 */
1079
1080
0
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1081
0
    if (decode_md) {
1082
0
        BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1083
0
        BLOCK_APPEND_CHAR(s->aux_blk, c);
1084
0
        *md_dist = 0;
1085
0
    }
1086
0
    return 0;
1087
1088
0
 block_err:
1089
0
    return -1;
1090
0
}
1091
1092
/*
1093
 * Internal part of cram_decode_slice().
1094
 * Generates the sequence, quality and cigar components.
1095
 */
1096
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1097
                           cram_block *blk, cram_record *cr, sam_hdr_t *sh,
1098
                           int cf, char *seq, char *qual,
1099
0
                           int has_MD, int has_NM) {
1100
0
    int prev_pos = 0, f, r = 0, out_sz = 1;
1101
0
    int seq_pos = 1;
1102
0
    int cig_len = 0;
1103
0
    int64_t ref_pos = cr->apos;
1104
0
    int32_t fn, i32;
1105
0
    enum cigar_op cig_op = BAM_CMATCH;
1106
0
    uint32_t *cigar = s->cigar;
1107
0
    uint32_t ncigar = s->ncigar;
1108
0
    uint32_t cigar_alloc = s->cigar_alloc;
1109
0
    uint32_t nm = 0;
1110
0
    int32_t md_dist = 0;
1111
0
    int orig_aux = 0;
1112
    // CRAM <  4.0 decode_md is off/on
1113
    // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
1114
0
    int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
1115
0
        ? (s->decode_md > 0)
1116
0
        : (s->decode_md != 0);
1117
0
    int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
1118
0
    int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
1119
0
    uint32_t ds = s->data_series;
1120
0
    sam_hrecs_t *bfd = sh->hrecs;
1121
1122
0
    cram_codec **codecs = c->comp_hdr->codecs;
1123
1124
0
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1125
0
        memset(qual, 255, cr->len);
1126
0
    }
1127
1128
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1129
0
        decode_md = decode_nm = 0;
1130
1131
0
    if (decode_md) {
1132
0
        orig_aux = BLOCK_SIZE(s->aux_blk);
1133
0
        if (has_MD == 0)
1134
0
            BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1135
0
    }
1136
1137
0
    if (ds & CRAM_FN) {
1138
0
        if (!codecs[DS_FN]) return -1;
1139
0
        r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
1140
0
                                   blk, (char *)&fn, &out_sz);
1141
0
        if (r) return r;
1142
0
    } else {
1143
0
        fn = 0;
1144
0
    }
1145
1146
0
    ref_pos--; // count from 0
1147
0
    cr->cigar = ncigar;
1148
1149
0
    if (!(ds & (CRAM_FC | CRAM_FP)))
1150
0
        goto skip_cigar;
1151
1152
0
    if (fn) {
1153
0
        if ((ds & CRAM_FC) && !codecs[DS_FC])
1154
0
            return -1;
1155
0
        if ((ds & CRAM_FP) && !codecs[DS_FP])
1156
0
            return -1;
1157
0
    }
1158
1159
0
    for (f = 0; f < fn; f++) {
1160
0
        int32_t pos = 0;
1161
0
        char op;
1162
1163
0
        if (ncigar+2 >= cigar_alloc) {
1164
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1165
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1166
0
                return -1;
1167
0
            s->cigar = cigar;
1168
0
        }
1169
1170
0
        if (ds & CRAM_FC) {
1171
0
            r |= codecs[DS_FC]->decode(s,
1172
0
                                       codecs[DS_FC],
1173
0
                                       blk,
1174
0
                                       &op,  &out_sz);
1175
0
            if (r) return r;
1176
0
        }
1177
1178
0
        if (!(ds & CRAM_FP))
1179
0
            continue;
1180
1181
0
        r |= codecs[DS_FP]->decode(s,
1182
0
                                   codecs[DS_FP],
1183
0
                                   blk,
1184
0
                                   (char *)&pos, &out_sz);
1185
0
        if (r) return r;
1186
0
        pos += prev_pos;
1187
1188
0
        if (pos <= 0) {
1189
0
            hts_log_error("Feature position %d before start of read", pos);
1190
0
            return -1;
1191
0
        }
1192
1193
0
        if (pos > seq_pos) {
1194
0
            if (pos > cr->len+1)
1195
0
                return -1;
1196
1197
0
            if (s->ref && cr->ref_id >= 0) {
1198
0
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1199
0
                    static int whinged = 0;
1200
0
                    int rlen;
1201
0
                    if (!whinged)
1202
0
                        hts_log_warning("Ref pos outside of ref sequence boundary");
1203
0
                    whinged = 1;
1204
0
                    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1205
                    // May miss MD/NM cases where both seq/ref are N, but this is a
1206
                    // malformed cram file anyway.
1207
0
                    if (rlen > 0) {
1208
0
                        if (ref_pos + rlen > s->ref_end)
1209
0
                            goto beyond_slice;
1210
1211
0
                        memcpy(&seq[seq_pos-1],
1212
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1213
0
                        if ((pos - seq_pos) - rlen > 0)
1214
0
                            memset(&seq[seq_pos-1+rlen], 'N',
1215
0
                                   (pos - seq_pos) - rlen);
1216
0
                    } else {
1217
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1218
0
                    }
1219
0
                    if (md_dist >= 0)
1220
0
                        md_dist += pos - seq_pos;
1221
0
                } else {
1222
                    // 'N' in both ref and seq is also mismatch for NM/MD
1223
0
                    if (ref_pos + pos-seq_pos > s->ref_end)
1224
0
                        goto beyond_slice;
1225
1226
0
                    const char *refp = s->ref + ref_pos - s->ref_start + 1;
1227
0
                    const int frag_len = pos - seq_pos;
1228
0
                    int do_cpy = 1;
1229
0
                    if (decode_md || decode_nm) {
1230
0
                        char *N = memchr(refp, 'N', frag_len);
1231
0
                        if (N) {
1232
0
                            int i;
1233
0
                            for (i = 0; i < frag_len; i++) {
1234
0
                                char base = refp[i];
1235
0
                                if (base == 'N') {
1236
0
                                    if (add_md_char(s, decode_md,
1237
0
                                                    'N', &md_dist) < 0)
1238
0
                                        return -1;
1239
0
                                    nm++;
1240
0
                                } else {
1241
0
                                    md_dist++;
1242
0
                                }
1243
0
                                seq[seq_pos-1+i] = base;
1244
0
                            }
1245
0
                            do_cpy = 0;
1246
0
                        } else {
1247
0
                            md_dist += frag_len;
1248
0
                        }
1249
0
                    }
1250
0
                    if (do_cpy)
1251
0
                        memcpy(&seq[seq_pos-1], refp, frag_len);
1252
0
                }
1253
0
            }
1254
#ifdef USE_X
1255
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
1256
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1257
                cig_len = 0;
1258
            }
1259
            cig_op = BAM_CBASE_MATCH;
1260
#else
1261
0
            if (cig_len && cig_op != BAM_CMATCH) {
1262
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1263
0
                cig_len = 0;
1264
0
            }
1265
0
            cig_op = BAM_CMATCH;
1266
0
#endif
1267
0
            cig_len += pos - seq_pos;
1268
0
            ref_pos += pos - seq_pos;
1269
0
            seq_pos = pos;
1270
0
        }
1271
1272
0
        prev_pos = pos;
1273
1274
0
        if (!(ds & CRAM_FC))
1275
0
            goto skip_cigar;
1276
1277
0
        switch(op) {
1278
0
        case 'S': { // soft clip: IN
1279
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1280
0
            int have_sc = 0;
1281
1282
0
            if (cig_len) {
1283
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1284
0
                cig_len = 0;
1285
0
            }
1286
0
            switch (CRAM_MAJOR_VERS(fd->version)) {
1287
0
            case 1:
1288
0
                if (ds & CRAM_IN) {
1289
0
                    r |= codecs[DS_IN]
1290
0
                        ? codecs[DS_IN]->decode(s, codecs[DS_IN],
1291
0
                                                blk,
1292
0
                                                cr->len ? &seq[pos-1] : NULL,
1293
0
                                                &out_sz2)
1294
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1295
0
                    have_sc = 1;
1296
0
                }
1297
0
                break;
1298
0
            case 2:
1299
0
            default:
1300
0
                if (ds & CRAM_SC) {
1301
0
                    r |= codecs[DS_SC]
1302
0
                        ? codecs[DS_SC]->decode(s, codecs[DS_SC],
1303
0
                                                blk,
1304
0
                                                cr->len ? &seq[pos-1] : NULL,
1305
0
                                                &out_sz2)
1306
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1307
0
                    have_sc = 1;
1308
0
                }
1309
0
                break;
1310
1311
                //default:
1312
                //    r |= codecs[DS_BB]
1313
                //        ? codecs[DS_BB]->decode(s, codecs[DS_BB],
1314
                //                                blk, &seq[pos-1], &out_sz2)
1315
                //        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1316
0
            }
1317
0
            if (have_sc) {
1318
0
                if (r) return r;
1319
0
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1320
0
                cig_op = BAM_CSOFT_CLIP;
1321
0
                seq_pos += out_sz2;
1322
0
            }
1323
0
            break;
1324
0
        }
1325
1326
0
        case 'X': { // Substitution; BS
1327
0
            unsigned char base;
1328
#ifdef USE_X
1329
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1330
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1331
                cig_len = 0;
1332
            }
1333
            if (ds & CRAM_BS) {
1334
                if (!codecs[DS_BS]) return -1;
1335
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1336
                                           (char *)&base, &out_sz);
1337
                if (pos-1 < cr->len)
1338
                    seq[pos-1] = 'N'; // FIXME look up BS=base value
1339
            }
1340
            cig_op = BAM_CBASE_MISMATCH;
1341
#else
1342
0
            int ref_base;
1343
0
            if (cig_len && cig_op != BAM_CMATCH) {
1344
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1345
0
                cig_len = 0;
1346
0
            }
1347
0
            if (ds & CRAM_BS) {
1348
0
                if (!codecs[DS_BS]) return -1;
1349
0
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1350
0
                                           (char *)&base, &out_sz);
1351
0
                if (r) return -1;
1352
0
                if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1353
0
                    if (pos-1 < cr->len)
1354
0
                        seq[pos-1] = c->comp_hdr->
1355
0
                            substitution_matrix[fd->L1['N']][base];
1356
0
                    if (decode_md || decode_nm) {
1357
0
                        if (md_dist >= 0 && decode_md)
1358
0
                            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1359
0
                        md_dist = -1;
1360
0
                        nm--;
1361
0
                    }
1362
0
                } else {
1363
0
                    unsigned char ref_call = ref_pos < s->ref_end
1364
0
                        ? (uc)s->ref[ref_pos - s->ref_start +1]
1365
0
                        : 'N';
1366
0
                    ref_base = fd->L1[ref_call];
1367
0
                    if (pos-1 < cr->len)
1368
0
                        seq[pos-1] = c->comp_hdr->
1369
0
                            substitution_matrix[ref_base][base];
1370
0
                    if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
1371
0
                        return -1;
1372
0
                }
1373
0
            }
1374
0
            cig_op = BAM_CMATCH;
1375
0
#endif
1376
0
            nm++;
1377
0
            cig_len++;
1378
0
            seq_pos++;
1379
0
            ref_pos++;
1380
0
            break;
1381
0
        }
1382
1383
0
        case 'D': { // Deletion; DL
1384
0
            if (cig_len && cig_op != BAM_CDEL) {
1385
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1386
0
                cig_len = 0;
1387
0
            }
1388
0
            if (ds & CRAM_DL) {
1389
0
                if (!codecs[DS_DL]) return -1;
1390
0
                r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
1391
0
                                           (char *)&i32, &out_sz);
1392
0
                if (r) return r;
1393
0
                if (decode_md || decode_nm) {
1394
0
                    if (ref_pos + i32 > s->ref_end)
1395
0
                        goto beyond_slice;
1396
0
                    if (md_dist >= 0 && decode_md)
1397
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1398
0
                    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1399
0
                        if (decode_md) {
1400
0
                            BLOCK_APPEND_CHAR(s->aux_blk, '^');
1401
0
                            BLOCK_APPEND(s->aux_blk,
1402
0
                                         &s->ref[ref_pos - s->ref_start +1],
1403
0
                                         i32);
1404
0
                            md_dist = 0;
1405
0
                        }
1406
0
                        nm += i32;
1407
0
                    } else {
1408
0
                        uint32_t dlen;
1409
0
                        if (bfd->ref[cr->ref_id].len >= ref_pos) {
1410
0
                            if (decode_md) {
1411
0
                                BLOCK_APPEND_CHAR(s->aux_blk, '^');
1412
0
                                BLOCK_APPEND(s->aux_blk,
1413
0
                                             &s->ref[ref_pos - s->ref_start+1],
1414
0
                                             bfd->ref[cr->ref_id].len-ref_pos);
1415
0
                                BLOCK_APPEND_UINT(s->aux_blk, 0);
1416
0
                            }
1417
0
                            dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1418
0
                            nm += i32 - dlen;
1419
0
                        } else {
1420
0
                            dlen = i32;
1421
0
                        }
1422
1423
0
                        md_dist = -1;
1424
0
                    }
1425
0
                }
1426
0
                cig_op = BAM_CDEL;
1427
0
                cig_len += i32;
1428
0
                ref_pos += i32;
1429
                //printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1430
0
            }
1431
0
            break;
1432
0
        }
1433
1434
0
        case 'I': { // Insertion (several bases); IN
1435
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1436
1437
0
            if (cig_len && cig_op != BAM_CINS) {
1438
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1439
0
                cig_len = 0;
1440
0
            }
1441
1442
0
            if (ds & CRAM_IN) {
1443
0
                if (!codecs[DS_IN]) return -1;
1444
0
                r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
1445
0
                                           cr->len ? &seq[pos-1] : NULL,
1446
0
                                           &out_sz2);
1447
0
                if (r) return r;
1448
0
                cig_op = BAM_CINS;
1449
0
                cig_len += out_sz2;
1450
0
                seq_pos += out_sz2;
1451
0
                nm      += out_sz2;
1452
                //printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1453
0
            }
1454
0
            break;
1455
0
        }
1456
1457
0
        case 'i': { // Insertion (single base); BA
1458
0
            if (cig_len && cig_op != BAM_CINS) {
1459
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1460
0
                cig_len = 0;
1461
0
            }
1462
0
            if (ds & CRAM_BA) {
1463
0
                if (!codecs[DS_BA]) return -1;
1464
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1465
0
                                           cr->len ? &seq[pos-1] : NULL,
1466
0
                                           &out_sz);
1467
0
                if (r) return r;
1468
0
            }
1469
0
            cig_op = BAM_CINS;
1470
0
            cig_len++;
1471
0
            seq_pos++;
1472
0
            nm++;
1473
0
            break;
1474
0
        }
1475
1476
0
        case 'b': { // Several bases
1477
0
            int32_t len = cr->len ? cr->len-(pos-1) : 1;
1478
1479
0
            if (cig_len && cig_op != BAM_CMATCH) {
1480
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1481
0
                cig_len = 0;
1482
0
            }
1483
1484
0
            if (ds & CRAM_BB) {
1485
0
                if (!codecs[DS_BB]) return -1;
1486
0
                r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
1487
0
                                           cr->len ? &seq[pos-1] : NULL,
1488
0
                                           &len);
1489
0
                if (r) return r;
1490
1491
0
                if (decode_md || decode_nm) {
1492
0
                    int x;
1493
0
                    if (md_dist >= 0 && decode_md)
1494
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1495
1496
0
                    for (x = 0; x < len; x++) {
1497
0
                        if (x && decode_md)
1498
0
                            BLOCK_APPEND_UINT(s->aux_blk, 0);
1499
0
                        if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1500
0
                            md_dist = -1;
1501
0
                            break;
1502
0
                        } else {
1503
0
                            if (decode_md) {
1504
0
                                if (ref_pos + x > s->ref_end)
1505
0
                                    goto beyond_slice;
1506
0
                                char r = s->ref[ref_pos+x-s->ref_start +1];
1507
0
                                BLOCK_APPEND_CHAR(s->aux_blk, r);
1508
0
                            }
1509
0
                        }
1510
0
                    }
1511
1512
0
                    nm += x;
1513
0
                    md_dist = 0;
1514
0
                }
1515
0
            }
1516
1517
0
            cig_op = BAM_CMATCH;
1518
1519
0
            cig_len+=len;
1520
0
            seq_pos+=len;
1521
0
            ref_pos+=len;
1522
            //prev_pos+=len;
1523
0
            break;
1524
0
        }
1525
1526
0
        case 'q': { // Several quality values
1527
0
            int32_t len = cr->len ? cr->len - (pos-1) : 1;
1528
1529
0
            if (cig_len && cig_op != BAM_CMATCH) {
1530
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1531
0
                cig_len = 0;
1532
0
            }
1533
1534
0
            if (ds & CRAM_QQ) {
1535
0
                if (!codecs[DS_QQ]) return -1;
1536
0
                if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1537
0
                    && (unsigned char)*qual == 255)
1538
0
                    memset(qual, 30, cr->len); // ?
1539
0
                r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
1540
0
                                           (char *)&qual[pos-1], &len);
1541
0
                if (r) return r;
1542
0
            }
1543
1544
0
            cig_op = BAM_CMATCH;
1545
1546
            //prev_pos+=len;
1547
0
            break;
1548
0
        }
1549
1550
0
        case 'B': { // Read base; BA, QS
1551
#ifdef USE_X
1552
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1553
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1554
                cig_len = 0;
1555
            }
1556
#else
1557
0
            if (cig_len && cig_op != BAM_CMATCH) {
1558
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1559
0
                cig_len = 0;
1560
0
            }
1561
0
#endif
1562
0
            if (ds & CRAM_BA) {
1563
0
                if (!codecs[DS_BA]) return -1;
1564
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1565
0
                                           cr->len ? &seq[pos-1] : NULL,
1566
0
                                           &out_sz);
1567
1568
0
                if (decode_md || decode_nm) {
1569
0
                    if (md_dist >= 0 && decode_md)
1570
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1571
0
                    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1572
0
                        md_dist = -1;
1573
0
                    } else {
1574
0
                        if (decode_md) {
1575
0
                            if (ref_pos > s->ref_end)
1576
0
                                goto beyond_slice;
1577
0
                            BLOCK_APPEND_CHAR(s->aux_blk,
1578
0
                                              s->ref[ref_pos-s->ref_start +1]);
1579
0
                        }
1580
0
                        nm++;
1581
0
                        md_dist = 0;
1582
0
                    }
1583
0
                }
1584
0
            }
1585
0
            if (ds & CRAM_QS) {
1586
0
                if (!codecs[DS_QS]) return -1;
1587
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1588
0
                    && (unsigned char)*qual == 255)
1589
0
                    memset(qual, 30, cr->len); // ASCII ?.  Same as htsjdk
1590
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1591
0
                                           (char *)&qual[pos-1], &out_sz);
1592
0
            }
1593
#ifdef USE_X
1594
            cig_op = BAM_CBASE_MISMATCH;
1595
#else
1596
0
            cig_op = BAM_CMATCH;
1597
0
#endif
1598
0
            cig_len++;
1599
0
            seq_pos++;
1600
0
            ref_pos++;
1601
            //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1602
0
            break;
1603
0
        }
1604
1605
0
        case 'Q': { // Quality score; QS
1606
0
            if (ds & CRAM_QS) {
1607
0
                if (!codecs[DS_QS]) return -1;
1608
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
1609
0
                    (unsigned char)*qual == 255)
1610
0
                    memset(qual, 30, cr->len); // ?
1611
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1612
0
                                           (char *)&qual[pos-1], &out_sz);
1613
                //printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1614
0
            }
1615
0
            break;
1616
0
        }
1617
1618
0
        case 'H': { // hard clip; HC
1619
0
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
1620
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1621
0
                cig_len = 0;
1622
0
            }
1623
0
            if (ds & CRAM_HC) {
1624
0
                if (!codecs[DS_HC]) return -1;
1625
0
                r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
1626
0
                                           (char *)&i32, &out_sz);
1627
0
                if (r) return r;
1628
0
                cig_op = BAM_CHARD_CLIP;
1629
0
                cig_len += i32;
1630
0
            }
1631
0
            break;
1632
0
        }
1633
1634
0
        case 'P': { // padding; PD
1635
0
            if (cig_len && cig_op != BAM_CPAD) {
1636
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1637
0
                cig_len = 0;
1638
0
            }
1639
0
            if (ds & CRAM_PD) {
1640
0
                if (!codecs[DS_PD]) return -1;
1641
0
                r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
1642
0
                                           (char *)&i32, &out_sz);
1643
0
                if (r) return r;
1644
0
                cig_op = BAM_CPAD;
1645
0
                cig_len += i32;
1646
0
            }
1647
0
            break;
1648
0
        }
1649
1650
0
        case 'N': { // Ref skip; RS
1651
0
            if (cig_len && cig_op != BAM_CREF_SKIP) {
1652
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1653
0
                cig_len = 0;
1654
0
            }
1655
0
            if (ds & CRAM_RS) {
1656
0
                if (!codecs[DS_RS]) return -1;
1657
0
                r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
1658
0
                                           (char *)&i32, &out_sz);
1659
0
                if (r) return r;
1660
0
                cig_op = BAM_CREF_SKIP;
1661
0
                cig_len += i32;
1662
0
                ref_pos += i32;
1663
0
            }
1664
0
            break;
1665
0
        }
1666
1667
0
        default:
1668
0
            hts_log_error("Unknown feature code '%c'", op);
1669
0
            return -1;
1670
0
        }
1671
0
    }
1672
1673
0
    if (!(ds & CRAM_FC))
1674
0
        goto skip_cigar;
1675
1676
    /* An implicit match op for any unaccounted for bases */
1677
0
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1678
0
        if (s->ref && cr->ref_id >= 0) {
1679
0
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1680
0
                static int whinged = 0;
1681
0
                int rlen;
1682
0
                if (!whinged)
1683
0
                    hts_log_warning("Ref pos outside of ref sequence boundary");
1684
0
                whinged = 1;
1685
0
                rlen = bfd->ref[cr->ref_id].len - ref_pos;
1686
                // May miss MD/NM cases where both seq/ref are N, but this is a
1687
                // malformed cram file anyway.
1688
0
                if (rlen > 0) {
1689
0
                    if (seq_pos-1 + rlen < cr->len)
1690
0
                        memcpy(&seq[seq_pos-1],
1691
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1692
0
                    if ((cr->len - seq_pos + 1) - rlen > 0)
1693
0
                        memset(&seq[seq_pos-1+rlen], 'N',
1694
0
                               (cr->len - seq_pos + 1) - rlen);
1695
0
                } else {
1696
0
                    if (cr->len - seq_pos + 1 > 0)
1697
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1698
0
                }
1699
0
                if (md_dist >= 0)
1700
0
                    md_dist += cr->len - seq_pos + 1;
1701
0
            } else {
1702
0
                if (cr->len - seq_pos + 1 > 0) {
1703
0
                    if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
1704
0
                        goto beyond_slice;
1705
0
                    int remainder = cr->len - (seq_pos-1);
1706
0
                    int j = ref_pos - s->ref_start + 1;
1707
0
                    if (decode_md || decode_nm) {
1708
0
                        int i;
1709
0
                        char *N = memchr(&s->ref[j], 'N', remainder);
1710
0
                        if (!N) {
1711
                            // short cut the common case
1712
0
                            md_dist += cr->len - (seq_pos-1);
1713
0
                        } else {
1714
0
                            char *refp = &s->ref[j-(seq_pos-1)];
1715
0
                            md_dist += N-&s->ref[j];
1716
0
                            int i_start = seq_pos-1 + (N - &s->ref[j]);
1717
0
                            for (i = i_start; i < cr->len; i++) {
1718
0
                                char base = refp[i];
1719
0
                                if (base == 'N') {
1720
0
                                    if (add_md_char(s, decode_md, 'N',
1721
0
                                                    &md_dist) < 0)
1722
0
                                        return -1;
1723
0
                                    nm++;
1724
0
                                } else {
1725
0
                                    md_dist++;
1726
0
                                }
1727
0
                            }
1728
0
                        }
1729
0
                    }
1730
0
                    memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
1731
0
                }
1732
0
                ref_pos += cr->len - seq_pos + 1;
1733
0
            }
1734
0
        } else if (cr->ref_id >= 0) {
1735
            // So alignment end can be computed even when not decoding sequence
1736
0
            ref_pos += cr->len - seq_pos + 1;
1737
0
        }
1738
1739
0
        if (ncigar+1 >= cigar_alloc) {
1740
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1741
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1742
0
                return -1;
1743
0
            s->cigar = cigar;
1744
0
        }
1745
#ifdef USE_X
1746
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
1747
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1748
            cig_len = 0;
1749
        }
1750
        cig_op = BAM_CBASE_MATCH;
1751
#else
1752
0
        if (cig_len && cig_op != BAM_CMATCH) {
1753
0
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1754
0
            cig_len = 0;
1755
0
        }
1756
0
        cig_op = BAM_CMATCH;
1757
0
#endif
1758
0
        cig_len += cr->len - seq_pos+1;
1759
0
    }
1760
1761
0
 skip_cigar:
1762
1763
0
    if ((ds & CRAM_FN) && decode_md) {
1764
0
        if (md_dist >= 0)
1765
0
            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1766
0
    }
1767
1768
0
    if (cig_len) {
1769
0
        if (ncigar >= cigar_alloc) {
1770
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1771
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1772
0
                return -1;
1773
0
            s->cigar = cigar;
1774
0
        }
1775
1776
0
        cigar[ncigar++] = (cig_len<<4) + cig_op;
1777
0
    }
1778
1779
0
    cr->ncigar = ncigar - cr->cigar;
1780
0
    cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
1781
1782
    //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1783
1784
0
    if (ds & CRAM_MQ) {
1785
0
        if (!codecs[DS_MQ]) return -1;
1786
0
        r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
1787
0
                                   (char *)&cr->mqual, &out_sz);
1788
0
    } else {
1789
0
        cr->mqual = 40;
1790
0
    }
1791
1792
0
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1793
0
        int32_t out_sz2 = cr->len;
1794
1795
0
        if (!codecs[DS_QS]) return -1;
1796
0
        r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1797
0
                                   qual, &out_sz2);
1798
0
    }
1799
1800
0
    s->cigar = cigar;
1801
0
    s->cigar_alloc = cigar_alloc;
1802
0
    s->ncigar = ncigar;
1803
1804
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1805
0
        cr->len = 0;
1806
1807
0
    if (decode_md) {
1808
0
        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1809
0
        size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
1810
0
        if (has_MD < 0) {
1811
            // has_MD < 0; already have MDZ allocated in aux at -has_MD,
1812
            // but wrote MD to end of aux (at orig_aux).
1813
            // We need some memmoves to shuffle it around.
1814
0
            char tmp_MD_[1024], *tmp_MD = tmp_MD_;
1815
0
            unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
1816
0
            if (sz > 1024) {
1817
0
                tmp_MD = malloc(sz);
1818
0
                if (!tmp_MD)
1819
0
                    return -1;
1820
0
            }
1821
0
            memcpy(tmp_MD, orig_aux_p, sz);
1822
0
            memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
1823
0
                    &BLOCK_DATA(s->aux_blk)[-has_MD],
1824
0
                    orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
1825
0
            memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
1826
0
            if (tmp_MD != tmp_MD_)
1827
0
                free(tmp_MD);
1828
1829
0
            if (-has_NM > -has_MD)
1830
                // we inserted before NM, so move it up a bit
1831
0
                has_NM -= sz;
1832
0
        }
1833
        // else has_MD == 0 and we've already appended MD to the end.
1834
1835
0
        cr->aux_size += sz;
1836
0
    }
1837
1838
0
    if (decode_nm) {
1839
0
        if (has_NM == 0) {
1840
0
            char buf[7];
1841
0
            size_t buf_size;
1842
0
            buf[0] = 'N'; buf[1] = 'M';
1843
0
            if (nm <= UINT8_MAX) {
1844
0
                buf_size = 4;
1845
0
                buf[2] = 'C';
1846
0
                buf[3] = (nm>> 0) & 0xff;
1847
0
            } else if (nm <= UINT16_MAX) {
1848
0
                buf_size = 5;
1849
0
                buf[2] = 'S';
1850
0
                buf[3] = (nm>> 0) & 0xff;
1851
0
                buf[4] = (nm>> 8) & 0xff;
1852
0
            } else {
1853
0
                buf_size = 7;
1854
0
                buf[2] = 'I';
1855
0
                buf[3] = (nm>> 0) & 0xff;
1856
0
                buf[4] = (nm>> 8) & 0xff;
1857
0
                buf[5] = (nm>>16) & 0xff;
1858
0
                buf[6] = (nm>>24) & 0xff;
1859
0
            }
1860
0
            BLOCK_APPEND(s->aux_blk, buf, buf_size);
1861
0
            cr->aux_size += buf_size;
1862
0
        } else {
1863
            // Preallocated space for NM at -has_NM into aux block
1864
0
            unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
1865
0
            buf[0] = (nm>> 0) & 0xff;
1866
0
            buf[1] = (nm>> 8) & 0xff;
1867
0
            buf[2] = (nm>>16) & 0xff;
1868
0
            buf[3] = (nm>>24) & 0xff;
1869
0
        }
1870
0
    }
1871
1872
0
    return r;
1873
1874
0
 beyond_slice:
1875
    // Cramtools can create CRAMs that have sequence features outside the
1876
    // stated range of the container & slice reference extents (start + span).
1877
    // We have to check for these in many places, but for brevity have the
1878
    // error reporting in only one.
1879
0
    hts_log_error("CRAM CIGAR extends beyond slice reference extents");
1880
0
    return -1;
1881
1882
0
 block_err:
1883
0
    return -1;
1884
0
}
1885
1886
/*
1887
 * Quick and simple hash lookup for cram_map arrays
1888
 */
1889
0
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1890
0
    cram_map *m;
1891
1892
0
    m = map[CRAM_MAP(key[0],key[1])];
1893
0
    while (m && m->key != id)
1894
0
        m= m->next;
1895
1896
0
    return m;
1897
0
}
1898
1899
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1900
1901
1902
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1903
0
                               cram_block *blk, cram_record *cr) {
1904
0
    int i, r = 0, out_sz = 1;
1905
0
    unsigned char ntags;
1906
1907
0
    if (!c->comp_hdr->codecs[DS_TC]) return -1;
1908
0
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1909
0
                                            (char *)&ntags, &out_sz);
1910
0
    cr->ntags = ntags;
1911
1912
    //printf("TC=%d\n", cr->ntags);
1913
0
    cr->aux_size = 0;
1914
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1915
1916
0
    for (i = 0; i < cr->ntags; i++) {
1917
0
        int32_t id, out_sz = 1;
1918
0
        unsigned char tag_data[3];
1919
0
        cram_map *m;
1920
1921
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1922
0
        if (!c->comp_hdr->codecs[DS_TN]) return -1;
1923
0
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1924
0
                                                blk, (char *)&id, &out_sz);
1925
0
        if (out_sz == 3) {
1926
            // Tag name stored as 3 chars instead of an int?
1927
0
            memcpy(tag_data, &id, 3);
1928
0
        } else {
1929
0
            tag_data[0] = (id>>16) & 0xff;
1930
0
            tag_data[1] = (id>>8)  & 0xff;
1931
0
            tag_data[2] = id       & 0xff;
1932
0
        }
1933
1934
0
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1935
0
        if (!m)
1936
0
            return -1;
1937
0
        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1938
1939
0
        if (!m->codec) return -1;
1940
0
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1941
1942
0
        cr->aux_size += out_sz + 3;
1943
0
    }
1944
1945
0
    return r;
1946
1947
0
 block_err:
1948
0
    return -1;
1949
0
}
1950
1951
// Derived from sam.c aux_type2size
1952
static inline int aux_ele_size(uint8_t type)
1953
0
{
1954
0
    switch (type) {
1955
0
    case 'A': case 'c': case 'C':
1956
0
        return 1;
1957
0
    case 's': case 'S':
1958
0
        return 2;
1959
0
    case 'i': case 'I': case 'f':
1960
0
        return 4;
1961
0
    case 'd':
1962
0
        return 8;
1963
0
    default:
1964
0
        return 1;
1965
0
    }
1966
0
}
1967
1968
// has_MD and has_NM are filled out with 0 for none present,
1969
// 1 for present and verbatim, and -pos for present as placeholder
1970
// (MD*, NM*) to be generated and filled out at offset +pos.
1971
static int cram_decode_aux(cram_fd *fd,
1972
                           cram_container *c, cram_slice *s,
1973
                           cram_block *blk, cram_record *cr,
1974
0
                           int *has_MD, int *has_NM) {
1975
0
    int i, r = 0, out_sz = 1;
1976
0
    int32_t TL = 0;
1977
0
    unsigned char *TN;
1978
0
    uint32_t ds = s->data_series;
1979
1980
0
    if (!(ds & (CRAM_TL|CRAM_aux))) {
1981
0
        cr->aux = 0;
1982
0
        cr->aux_size = 0;
1983
0
        return 0;
1984
0
    }
1985
1986
0
    if (!c->comp_hdr->codecs[DS_TL]) return -1;
1987
0
    r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1988
0
                                            (char *)&TL, &out_sz);
1989
0
    if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1990
0
        return -1;
1991
1992
0
    TN = c->comp_hdr->TL[TL];
1993
0
    cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1994
1995
    //printf("TC=%d\n", cr->ntags);
1996
0
    cr->aux_size = 0;
1997
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1998
1999
0
    if (!(ds & CRAM_aux))
2000
0
        return 0;
2001
2002
0
    for (i = 0; i < cr->ntags; i++) {
2003
0
        int32_t id, out_sz = 1;
2004
0
        unsigned char tag_data[7];
2005
0
        cram_map *m;
2006
2007
0
        if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
2008
0
            *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
2009
0
        if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
2010
0
            *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
2011
2012
        //printf("Tag %d/%d\n", i+1, cr->ntags);
2013
0
        tag_data[0] = TN[0];
2014
0
        tag_data[1] = TN[1];
2015
0
        tag_data[2] = TN[2];
2016
0
        id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
2017
2018
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
2019
            // Place holder, fill out contents later.
2020
0
            int tag_data_size;
2021
0
            if (TN[0] == 'N' && TN[1] == 'M') {
2022
                // Use a fixed size, so we can allocate room for it now.
2023
0
                memcpy(&tag_data[2], "I\0\0\0\0", 5);
2024
0
                tag_data_size = 7;
2025
0
            } else if (TN[0] == 'R' && TN[1] == 'G') {
2026
                // RG is variable size, but known already.  Insert now
2027
0
                TN += 3;
2028
                // Equiv to fd->header->hrecs->rg[cr->rg], but this is the
2029
                // new header API equivalent.
2030
0
                const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
2031
0
                if (!rg)
2032
0
                    continue;
2033
2034
0
                size_t rg_len = strlen(rg);
2035
0
                tag_data[2] = 'Z';
2036
0
                BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2037
0
                BLOCK_APPEND(s->aux_blk, rg, rg_len);
2038
0
                BLOCK_APPEND_CHAR(s->aux_blk, '\0');
2039
0
                cr->aux_size += 3 + rg_len + 1;
2040
0
                cr->rg = -1; // prevents auto-add later
2041
0
                continue;
2042
0
            } else {
2043
                // Unknown size.  We'll insert MD into stream later.
2044
0
                tag_data[2] = 'Z';
2045
0
                tag_data_size = 3;
2046
0
            }
2047
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
2048
0
            cr->aux_size += tag_data_size;
2049
0
            TN += 3;
2050
0
        } else {
2051
0
            TN += 3;
2052
0
            m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2053
0
            if (!m)
2054
0
                return -1;
2055
2056
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2057
2058
0
            if (!m->codec) return -1;
2059
0
            if (m->codec->codec == E_BYTE_ARRAY_LEN ||
2060
0
                m->codec->codec == E_BYTE_ARRAY_STOP)
2061
                // NB we don't know the maximum length for B arrays yet,
2062
                // but we're using BYTE_ARRAY_BLOCK encodings so they're auto-
2063
                // resizing arrays that cannot overflow.  The codec handles this
2064
                // check for us.
2065
0
                out_sz *= aux_ele_size(TN[-1]);
2066
0
            r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2067
0
            if (r) break;
2068
0
            cr->aux_size += out_sz + 3;
2069
2070
            // cF CRAM flags.
2071
0
            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
2072
                // Remove cF tag
2073
0
                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
2074
0
                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
2075
0
                cr->aux_size -= out_sz+3;
2076
2077
                // bit 1 => don't auto-decode MD.
2078
                // Pretend MD is present verbatim, so we don't auto-generate
2079
0
                if ((cF & 1) && has_MD && *has_MD == 0)
2080
0
                    *has_MD = 1;
2081
2082
                // bit 1 => don't auto-decode NM
2083
0
                if ((cF & 2) && has_NM && *has_NM == 0)
2084
0
                    *has_NM = 1;
2085
0
            }
2086
0
        }
2087
2088
        // We could go to 2^32 fine, but we shouldn't be hitting this anyway,
2089
        // and it's protecting against memory hogs too.
2090
0
        if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
2091
0
            hts_log_error("CRAM->BAM aux block size overflow");
2092
0
            goto block_err;
2093
0
        }
2094
0
    }
2095
2096
0
    return r;
2097
2098
0
 block_err:
2099
0
    return -1;
2100
0
}
2101
2102
/* Resolve mate pair cross-references between recs within this slice */
2103
500
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2104
500
    int rec;
2105
2106
500
    if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2107
0
        for (rec = 0; rec < s->hdr->num_records; rec++) {
2108
0
            cram_record *cr = &s->crecs[rec];
2109
2110
0
            cr->tlen = 0;
2111
0
            cr->mate_pos = 0;
2112
0
            cr->mate_ref_id = -1;
2113
0
        }
2114
2115
0
        return 0;
2116
0
    }
2117
2118
500
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2119
0
        cram_record *cr = &s->crecs[rec];
2120
2121
0
        if (cr->mate_line >= 0) {
2122
0
            if (cr->mate_line < s->hdr->num_records) {
2123
                /*
2124
                 * On the first read, loop through computing lengths.
2125
                 * It's not perfect as we have one slice per reference so we
2126
                 * cannot detect when TLEN should be zero due to seqs that
2127
                 * map to multiple references.
2128
                 *
2129
                 * We also cannot set tlen correct when it spans a slice for
2130
                 * other reasons. This may make tlen too small. Should we
2131
                 * fix this by forcing TLEN to be stored verbatim in such cases?
2132
                 *
2133
                 * Or do we just admit defeat and output 0 for tlen? It's the
2134
                 * safe option...
2135
                 */
2136
0
                if (cr->tlen == INT64_MIN) {
2137
0
                    int id1 = rec, id2 = rec;
2138
0
                    int64_t aleft = cr->apos, aright = cr->aend;
2139
0
                    int64_t tlen;
2140
0
                    int ref = cr->ref_id;
2141
2142
                    // number of segments starting at the same point.
2143
0
                    int left_cnt = 0;
2144
0
                    int right_cnt = 0;
2145
2146
0
                    do {
2147
0
                        if (aleft > s->crecs[id2].apos)
2148
0
                            aleft = s->crecs[id2].apos, left_cnt = 1;
2149
0
                        else if (aleft == s->crecs[id2].apos)
2150
0
                            left_cnt++;
2151
0
                        if (aright < s->crecs[id2].aend) {
2152
0
                            aright = s->crecs[id2].aend;
2153
0
                            right_cnt = 1;
2154
0
                        } else if (aright == s->crecs[id2].aend) {
2155
0
                            right_cnt++;
2156
0
                        }
2157
0
                        if (s->crecs[id2].mate_line == -1) {
2158
0
                            s->crecs[id2].mate_line = rec;
2159
0
                            break;
2160
0
                        }
2161
0
                        if (s->crecs[id2].mate_line <= id2 ||
2162
0
                            s->crecs[id2].mate_line >= s->hdr->num_records)
2163
0
                            return -1;
2164
0
                        id2 = s->crecs[id2].mate_line;
2165
2166
0
                        if (s->crecs[id2].ref_id != ref)
2167
0
                            ref = -1;
2168
0
                    } while (id2 != id1);
2169
2170
0
                    if (ref != -1) {
2171
0
                        tlen = aright - aleft + 1;
2172
0
                        id1 = id2 = rec;
2173
2174
                        /*
2175
                         * When we have two seqs with identical start and
2176
                         * end coordinates, set +/- tlen based on 1st/last
2177
                         * bit flags instead, as a tie breaker.
2178
                         */
2179
0
                        if (s->crecs[id2].apos == aleft &&
2180
0
                            (s->crecs[id2].aend < aright ||
2181
0
                             left_cnt <= 1)) {
2182
                            // Leftmost, and not the rightmost
2183
0
                            s->crecs[id2].tlen = tlen;
2184
0
                            tlen = -tlen;
2185
0
                        } else if (s->crecs[id2].apos == aleft &&
2186
0
                                   s->crecs[id2].aend == aright &&
2187
0
                                   left_cnt > 1 && right_cnt > 1) {
2188
                            // Both leftmost and rightmost, resolve tie via
2189
                            // the BAM flags so changing order doesn't change
2190
                            // TLEN signs.
2191
0
                            if (s->crecs[id2].flags & BAM_FREAD1) {
2192
0
                                s->crecs[id2].tlen = tlen;
2193
0
                                tlen = -tlen;
2194
0
                            } else {
2195
0
                                s->crecs[id2].tlen = -tlen;
2196
0
                            }
2197
0
                        } else {
2198
                            // Rightmost or an internal
2199
0
                            s->crecs[id2].tlen = -tlen;
2200
0
                        }
2201
2202
0
                        id2 = s->crecs[id2].mate_line;
2203
0
                        while (id2 != id1) {
2204
0
                            s->crecs[id2].tlen = tlen;
2205
0
                            id2 = s->crecs[id2].mate_line;
2206
0
                        }
2207
0
                    } else {
2208
0
                        id1 = id2 = rec;
2209
2210
0
                        s->crecs[id2].tlen = 0;
2211
0
                        id2 = s->crecs[id2].mate_line;
2212
0
                        while (id2 != id1) {
2213
0
                            s->crecs[id2].tlen = 0;
2214
0
                            id2 = s->crecs[id2].mate_line;
2215
0
                        }
2216
0
                    }
2217
0
                }
2218
2219
0
                cr->mate_pos = s->crecs[cr->mate_line].apos;
2220
0
                cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2221
2222
                // paired
2223
0
                cr->flags |= BAM_FPAIRED;
2224
2225
                // set mate unmapped if needed
2226
0
                if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2227
0
                    cr->flags |= BAM_FMUNMAP;
2228
0
                    cr->tlen = 0;
2229
0
                }
2230
0
                if (cr->flags & BAM_FUNMAP) {
2231
0
                    cr->tlen = 0;
2232
0
                }
2233
2234
                // set mate reversed if needed
2235
0
                if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2236
0
                    cr->flags |= BAM_FMREVERSE;
2237
0
            } else {
2238
0
                hts_log_error("Mate line out of bounds: %d vs [0, %d]",
2239
0
                              cr->mate_line, s->hdr->num_records-1);
2240
0
            }
2241
2242
            /* FIXME: construct read names here too if needed */
2243
0
        } else {
2244
0
            if (cr->mate_flags & CRAM_M_REVERSE) {
2245
0
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2246
0
            }
2247
0
            if (cr->mate_flags & CRAM_M_UNMAP) {
2248
0
                cr->flags |= BAM_FMUNMAP;
2249
                //cr->mate_ref_id = -1;
2250
0
            }
2251
0
            if (!(cr->flags & BAM_FPAIRED))
2252
0
                cr->mate_ref_id = -1;
2253
0
        }
2254
2255
0
        if (cr->tlen == INT64_MIN)
2256
0
            cr->tlen = 0; // Just incase
2257
0
    }
2258
2259
500
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2260
0
        cram_record *cr = &s->crecs[rec];
2261
0
        if (cr->explicit_tlen != INT64_MIN)
2262
0
            cr->tlen = cr->explicit_tlen;
2263
0
    }
2264
2265
500
    return 0;
2266
500
}
2267
2268
0
static char *md5_print(unsigned char *md5, char *out) {
2269
0
    int i;
2270
0
    for (i = 0; i < 16; i++) {
2271
0
        out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2272
0
        out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2273
0
    }
2274
0
    out[32] = 0;
2275
2276
0
    return out;
2277
0
}
2278
2279
/*
2280
 * Utility function to decode tlen (ISIZE), as it's called
2281
 * in multiple places.
2282
 *
2283
 * Returns codec return value (0 on success).
2284
 */
2285
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
2286
0
                            cram_block *blk, int64_t *tlen) {
2287
0
    int out_sz = 1, r = 0;
2288
2289
0
    if (!c->comp_hdr->codecs[DS_TS]) return -1;
2290
0
    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2291
0
        int32_t i32;
2292
0
        r |= c->comp_hdr->codecs[DS_TS]
2293
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2294
0
                     (char *)&i32, &out_sz);
2295
0
        *tlen = i32;
2296
0
    } else {
2297
0
        r |= c->comp_hdr->codecs[DS_TS]
2298
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2299
0
                     (char *)tlen, &out_sz);
2300
0
    }
2301
0
    return r;
2302
0
}
2303
2304
/*
2305
 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2306
 * Returns 0 on success
2307
 *        -1 on failure
2308
 */
2309
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2310
587
                      sam_hdr_t *sh) {
2311
587
    cram_block *blk = s->block[0];
2312
587
    int32_t bf, ref_id;
2313
587
    unsigned char cf;
2314
587
    int out_sz, r = 0;
2315
587
    int rec;
2316
587
    char *seq = NULL, *qual = NULL;
2317
587
    int unknown_rg = -1;
2318
587
    int embed_ref;
2319
587
    char **refs = NULL;
2320
587
    uint32_t ds;
2321
587
    sam_hrecs_t *bfd = sh->hrecs;
2322
2323
587
    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2324
6
        return -1;
2325
2326
581
    ds = s->data_series;
2327
2328
581
    blk->bit = 7; // MSB first
2329
2330
    // Study the blocks and estimate approx sizes to preallocate.
2331
    // This looks to speed up decoding by around 8-9%.
2332
    // We can always shrink back down at the end if we overestimated.
2333
    // However it's likely that this also saves memory as own growth
2334
    // factor (*=1.5) is never applied.
2335
581
    {
2336
581
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2337
581
        int qsize=0, nsize=0, q_id=0;
2338
#else
2339
        int qsize, nsize, q_id;
2340
        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2341
        //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2342
#endif
2343
2344
581
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2345
581
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2346
581
        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2347
2348
        // To do - consider using q_id here to usurp the quality block and
2349
        // avoid a memcpy during decode.
2350
        // Specifically when quality is an external block uniquely used by
2351
        // DS_QS only, then we can set s->qual_blk directly to this
2352
        // block and save the codec->decode() calls. (Approx 3% cpu saving)
2353
581
    }
2354
2355
    /* Look for unknown RG, added as last by Java CRAM? */
2356
581
    if (bfd->nrg > 0 &&
2357
303
        bfd->rg[bfd->nrg-1].name != NULL &&
2358
303
        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2359
0
        unknown_rg = bfd->nrg-1;
2360
2361
581
    if (blk->content_type != CORE)
2362
3
        return -1;
2363
2364
578
    if (s->crecs)
2365
0
        free(s->crecs);
2366
578
    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2367
3
        return -1;
2368
2369
575
    ref_id = s->hdr->ref_seq_id;
2370
575
    if (CRAM_MAJOR_VERS(fd->version) < 4)
2371
575
       embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2372
0
    else
2373
0
       embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
2374
2375
575
    if (ref_id >= 0) {
2376
518
        if (embed_ref) {
2377
488
            cram_block *b;
2378
488
            if (s->hdr->ref_base_id < 0) {
2379
0
                hts_log_error("No reference specified and no embedded reference is available"
2380
0
                              " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2381
0
                              s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2382
0
                return -1;
2383
0
            }
2384
488
            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2385
488
            if (!b)
2386
24
                return -1;
2387
464
            if (cram_uncompress_block(b) != 0)
2388
0
                return -1;
2389
464
            s->ref = (char *)BLOCK_DATA(b);
2390
464
            s->ref_start = s->hdr->ref_seq_start;
2391
464
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2392
464
            if (s->hdr->ref_seq_span > b->uncomp_size) {
2393
3
                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2394
3
                              ref_id, s->ref_start, s->ref_end);
2395
3
                return -1;
2396
3
            }
2397
464
        } else if (!c->comp_hdr->no_ref) {
2398
            //// Avoid Java cramtools bug by loading entire reference seq
2399
            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2400
            //s->ref_start = 1;
2401
2402
30
            if (fd->required_fields & SAM_SEQ) {
2403
30
                s->ref =
2404
30
                cram_get_ref(fd, s->hdr->ref_seq_id,
2405
30
                             s->hdr->ref_seq_start,
2406
30
                             s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2407
30
            }
2408
30
            s->ref_start = s->hdr->ref_seq_start;
2409
30
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2410
2411
            /* Sanity check */
2412
30
            if (s->ref_start < 0) {
2413
0
                hts_log_warning("Slice starts before base 1"
2414
0
                                " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2415
0
                                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2416
0
                s->ref_start = 0;
2417
0
            }
2418
30
            pthread_mutex_lock(&fd->ref_lock);
2419
30
            pthread_mutex_lock(&fd->refs->lock);
2420
30
            if ((fd->required_fields & SAM_SEQ) &&
2421
30
                ref_id < fd->refs->nref && fd->refs->ref_id &&
2422
0
                s->ref_end > fd->refs->ref_id[ref_id]->length) {
2423
0
                s->ref_end = fd->refs->ref_id[ref_id]->length;
2424
0
            }
2425
30
            pthread_mutex_unlock(&fd->refs->lock);
2426
30
            pthread_mutex_unlock(&fd->ref_lock);
2427
30
        }
2428
518
    }
2429
2430
548
    if ((fd->required_fields & SAM_SEQ) &&
2431
548
        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2432
30
        hts_log_error("Unable to fetch reference %s:%"PRId64"-%"PRId64,
2433
30
                      fd->refs->ref_id && ref_id >= 0 && ref_id < fd->refs->nref
2434
30
                      ? fd->refs->ref_id[ref_id]->name
2435
30
                      : "unknown",
2436
30
                      s->hdr->ref_seq_start,
2437
30
                      s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2438
30
        return -1;
2439
30
    }
2440
2441
518
    if (CRAM_MAJOR_VERS(fd->version) != 1
2442
0
        && (fd->required_fields & SAM_SEQ)
2443
0
        && s->hdr->ref_seq_id >= 0
2444
0
        && !fd->ignore_md5
2445
0
        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2446
0
        hts_md5_context *md5;
2447
0
        unsigned char digest[16];
2448
2449
0
        if (s->ref && s->hdr->ref_seq_id >= 0) {
2450
0
            int start, len;
2451
2452
0
            if (s->hdr->ref_seq_start >= s->ref_start) {
2453
0
                start = s->hdr->ref_seq_start - s->ref_start;
2454
0
            } else {
2455
0
                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2456
0
                                ref_id, s->ref_start, s->ref_end);
2457
0
                start = 0;
2458
0
            }
2459
2460
0
            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2461
0
                len = s->hdr->ref_seq_span;
2462
0
            } else {
2463
0
                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2464
0
                                ref_id, s->ref_start, s->ref_end);
2465
0
                len = s->ref_end - s->ref_start + 1;
2466
0
            }
2467
2468
0
            if (!(md5 = hts_md5_init()))
2469
0
                return -1;
2470
0
            if (start + len > s->ref_end - s->ref_start + 1)
2471
0
                len = s->ref_end - s->ref_start + 1 - start;
2472
0
            if (len >= 0)
2473
0
                hts_md5_update(md5, s->ref + start, len);
2474
0
            hts_md5_final(digest, md5);
2475
0
            hts_md5_destroy(md5);
2476
0
        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2477
0
            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2478
0
            if (b) {
2479
0
                if (!(md5 = hts_md5_init()))
2480
0
                    return -1;
2481
0
                hts_md5_update(md5, b->data, b->uncomp_size);
2482
0
                hts_md5_final(digest, md5);
2483
0
                hts_md5_destroy(md5);
2484
0
            }
2485
0
        }
2486
2487
0
        if (!c->comp_hdr->no_ref &&
2488
0
            ((!s->ref && s->hdr->ref_base_id < 0)
2489
0
             || memcmp(digest, s->hdr->md5, 16) != 0)) {
2490
0
            char M[33];
2491
0
            const char *rname = sam_hdr_tid2name(sh, ref_id);
2492
0
            if (!rname) rname="?"; // cannot happen normally
2493
0
            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
2494
0
                          rname, s->ref_start, s->ref_end);
2495
0
            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
2496
0
            hts_log_error("Ref   : %s", md5_print(digest, M));
2497
0
            kstring_t ks = KS_INITIALIZE;
2498
0
            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
2499
0
                hts_log_error("@SQ M5: %s", ks.s);
2500
0
            hts_log_error("Please check the reference given is correct");
2501
0
            ks_free(&ks);
2502
0
            return -1;
2503
0
        }
2504
0
    }
2505
2506
518
    if (ref_id == -2) {
2507
30
        pthread_mutex_lock(&fd->ref_lock);
2508
30
        pthread_mutex_lock(&fd->refs->lock);
2509
30
        refs = calloc(fd->refs->nref, sizeof(char *));
2510
30
        pthread_mutex_unlock(&fd->refs->lock);
2511
30
        pthread_mutex_unlock(&fd->ref_lock);
2512
30
        if (!refs)
2513
0
            return -1;
2514
30
    }
2515
2516
518
    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2517
518
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2518
18
        cram_record *cr = &s->crecs[rec];
2519
18
        int has_MD, has_NM;
2520
2521
        //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2522
2523
18
        cr->s = s;
2524
2525
18
        out_sz = 1; /* decode 1 item */
2526
18
        if (ds & CRAM_BF) {
2527
18
            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
2528
0
            r |= c->comp_hdr->codecs[DS_BF]
2529
0
                            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2530
0
                                     (char *)&bf, &out_sz);
2531
0
            if (r || bf < 0 ||
2532
0
                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2533
0
                goto block_err;
2534
0
            bf = fd->bam_flag_swap[bf];
2535
0
            cr->flags = bf;
2536
0
        } else {
2537
0
            cr->flags = bf = 0x4; // unmapped
2538
0
        }
2539
2540
0
        if (ds & CRAM_CF) {
2541
0
            if (CRAM_MAJOR_VERS(fd->version) == 1) {
2542
                /* CF is byte in 1.0, int32 in 2.0 */
2543
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2544
0
                r |= c->comp_hdr->codecs[DS_CF]
2545
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2546
0
                                         (char *)&cf, &out_sz);
2547
0
                if (r) goto block_err;
2548
0
                cr->cram_flags = cf;
2549
0
            } else {
2550
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2551
0
                r |= c->comp_hdr->codecs[DS_CF]
2552
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2553
0
                                         (char *)&cr->cram_flags, &out_sz);
2554
0
                if (r) goto block_err;
2555
0
                cf = cr->cram_flags;
2556
0
            }
2557
0
        } else {
2558
0
            cf = cr->cram_flags = 0;
2559
0
        }
2560
2561
0
        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2562
0
            if (ds & CRAM_RI) {
2563
0
                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
2564
0
                r |= c->comp_hdr->codecs[DS_RI]
2565
0
                                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2566
0
                                         (char *)&cr->ref_id, &out_sz);
2567
0
                if (r) goto block_err;
2568
0
                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2569
0
                    && cr->ref_id >= 0
2570
0
                    && cr->ref_id != last_ref_id) {
2571
0
                    if (!c->comp_hdr->no_ref) {
2572
                        // Range(fd):  seq >= 0, unmapped -1, unspecified   -2
2573
                        // Slice(s):   seq >= 0, unmapped -1, multiple refs -2
2574
                        // Record(cr): seq >= 0, unmapped -1
2575
0
                        pthread_mutex_lock(&fd->range_lock);
2576
0
                        int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
2577
0
                        pthread_mutex_unlock(&fd->range_lock);
2578
0
                        if  (need_ref) {
2579
0
                            if (!refs[cr->ref_id])
2580
0
                                refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
2581
0
                            if (!(s->ref = refs[cr->ref_id]))
2582
0
                                goto block_err;
2583
0
                        } else {
2584
                            // For multi-ref containers, we don't need to fetch all
2585
                            // refs if we're only querying one.
2586
0
                            s->ref = NULL;
2587
0
                        }
2588
2589
0
                        pthread_mutex_lock(&fd->range_lock);
2590
0
                        int discard_last_ref = (last_ref_id >= 0 &&
2591
0
                                                refs[last_ref_id] &&
2592
0
                                                (fd->range.refid == -2 ||
2593
0
                                                 last_ref_id == fd->range.refid));
2594
0
                        pthread_mutex_unlock(&fd->range_lock);
2595
0
                        if (discard_last_ref) {
2596
0
                            pthread_mutex_lock(&fd->ref_lock);
2597
0
                            discard_last_ref = !fd->unsorted;
2598
0
                            pthread_mutex_unlock(&fd->ref_lock);
2599
0
                        }
2600
0
                        if (discard_last_ref) {
2601
0
                            cram_ref_decr(fd->refs, last_ref_id);
2602
0
                            refs[last_ref_id] = NULL;
2603
0
                        }
2604
0
                    }
2605
0
                    s->ref_start = 1;
2606
0
                    pthread_mutex_lock(&fd->ref_lock);
2607
0
                    pthread_mutex_lock(&fd->refs->lock);
2608
0
                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2609
0
                    pthread_mutex_unlock(&fd->refs->lock);
2610
0
                    pthread_mutex_unlock(&fd->ref_lock);
2611
2612
0
                    last_ref_id = cr->ref_id;
2613
0
                }
2614
0
            } else {
2615
0
                cr->ref_id = -1;
2616
0
            }
2617
0
        } else {
2618
0
            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2619
0
        }
2620
0
        if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
2621
0
            hts_log_error("Requested unknown reference ID %d", cr->ref_id);
2622
0
            goto block_err;
2623
0
        }
2624
2625
0
        if (ds & CRAM_RL) {
2626
0
            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
2627
0
            r |= c->comp_hdr->codecs[DS_RL]
2628
0
                            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2629
0
                                     (char *)&cr->len, &out_sz);
2630
0
            if (r) goto block_err;
2631
0
            if (cr->len < 0) {
2632
0
                hts_log_error("Read has negative length");
2633
0
                goto block_err;
2634
0
            }
2635
0
        }
2636
2637
0
        if (ds & CRAM_AP) {
2638
0
            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
2639
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
2640
0
                r |= c->comp_hdr->codecs[DS_AP]
2641
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2642
0
                                         (char *)&cr->apos, &out_sz);
2643
0
            } else  {
2644
0
                int32_t i32;
2645
0
                r |= c->comp_hdr->codecs[DS_AP]
2646
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2647
0
                                         (char *)&i32, &out_sz);
2648
0
                cr->apos = i32;
2649
0
            }
2650
0
            if (r) goto block_err;;
2651
0
            if (c->comp_hdr->AP_delta) {
2652
0
                if (cr->apos < 0 && c->unsorted == 0) {
2653
                    // cache locally in c->unsorted so we don't have an
2654
                    // excessive number of locks
2655
0
                    pthread_mutex_lock(&fd->ref_lock);
2656
0
                    c->unsorted = fd->unsorted = 1;
2657
0
                    pthread_mutex_unlock(&fd->ref_lock);
2658
0
                }
2659
0
                cr->apos += s->last_apos;
2660
0
            }
2661
0
            s->last_apos=  cr->apos;
2662
0
        } else {
2663
0
            cr->apos = c->ref_seq_start;
2664
0
        }
2665
2666
0
        if (ds & CRAM_RG) {
2667
0
            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
2668
0
            r |= c->comp_hdr->codecs[DS_RG]
2669
0
                           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2670
0
                                    (char *)&cr->rg, &out_sz);
2671
0
            if (r) goto block_err;
2672
0
            if (cr->rg == unknown_rg)
2673
0
                cr->rg = -1;
2674
0
        } else {
2675
0
            cr->rg = -1;
2676
0
        }
2677
2678
0
        cr->name_len = 0;
2679
2680
0
        if (c->comp_hdr->read_names_included) {
2681
0
            int32_t out_sz2 = 1; // block auto grows in decode()
2682
2683
            // Read directly into name cram_block
2684
0
            cr->name = BLOCK_SIZE(s->name_blk);
2685
0
            if (ds & CRAM_RN) {
2686
0
                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2687
0
                r |= c->comp_hdr->codecs[DS_RN]
2688
0
                                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2689
0
                                         (char *)s->name_blk, &out_sz2);
2690
0
                if (r) goto block_err;
2691
0
                cr->name_len = out_sz2;
2692
0
            }
2693
0
        }
2694
2695
0
        cr->mate_pos = 0;
2696
0
        cr->mate_line = -1;
2697
0
        cr->mate_ref_id = -1;
2698
0
        cr->explicit_tlen = INT64_MIN;
2699
0
        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2700
0
            if (ds & CRAM_MF) {
2701
0
                if (CRAM_MAJOR_VERS(fd->version) == 1) {
2702
                    /* MF is byte in 1.0, int32 in 2.0 */
2703
0
                    unsigned char mf;
2704
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2705
0
                    r |= c->comp_hdr->codecs[DS_MF]
2706
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2707
0
                                             blk, (char *)&mf, &out_sz);
2708
0
                    if (r) goto block_err;
2709
0
                    cr->mate_flags = mf;
2710
0
                } else {
2711
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2712
0
                    r |= c->comp_hdr->codecs[DS_MF]
2713
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2714
0
                                             blk,
2715
0
                                             (char *)&cr->mate_flags,
2716
0
                                             &out_sz);
2717
0
                    if (r) goto block_err;
2718
0
                }
2719
0
            } else {
2720
0
                cr->mate_flags = 0;
2721
0
            }
2722
2723
0
            if (!c->comp_hdr->read_names_included) {
2724
0
                int32_t out_sz2 = 1;
2725
2726
                // Read directly into name cram_block
2727
0
                cr->name = BLOCK_SIZE(s->name_blk);
2728
0
                if (ds & CRAM_RN) {
2729
0
                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2730
0
                    r |= c->comp_hdr->codecs[DS_RN]
2731
0
                                    ->decode(s, c->comp_hdr->codecs[DS_RN],
2732
0
                                             blk, (char *)s->name_blk,
2733
0
                                             &out_sz2);
2734
0
                    if (r) goto block_err;
2735
0
                    cr->name_len = out_sz2;
2736
0
                }
2737
0
            }
2738
2739
0
            if (ds & CRAM_NS) {
2740
0
                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
2741
0
                r |= c->comp_hdr->codecs[DS_NS]
2742
0
                                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2743
0
                                         (char *)&cr->mate_ref_id, &out_sz);
2744
0
                if (r) goto block_err;
2745
0
            }
2746
2747
            // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2748
            // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2749
            //     /* Paired, but unmapped */
2750
            //     cr->flags |= BAM_FMUNMAP;
2751
            // }
2752
2753
0
            if (ds & CRAM_NP) {
2754
0
                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
2755
0
                if (CRAM_MAJOR_VERS(fd->version) < 4) {
2756
0
                    int32_t i32;
2757
0
                    r |= c->comp_hdr->codecs[DS_NP]
2758
0
                                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2759
0
                                             (char *)&i32, &out_sz);
2760
0
                    cr->mate_pos = i32;
2761
0
                } else {
2762
0
                    r |= c->comp_hdr->codecs[DS_NP]
2763
0
                        ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2764
0
                                 (char *)&cr->mate_pos, &out_sz);
2765
0
                }
2766
0
                if (r) goto block_err;
2767
0
            }
2768
2769
0
            if (ds & CRAM_TS) {
2770
0
                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
2771
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
2772
0
                if (r) goto block_err;
2773
0
            } else {
2774
0
                cr->tlen = INT64_MIN;
2775
0
            }
2776
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2777
            // else not detached
2778
0
            if (ds & CRAM_NF) {
2779
0
                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
2780
0
                r |= c->comp_hdr->codecs[DS_NF]
2781
0
                                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2782
0
                                         (char *)&cr->mate_line, &out_sz);
2783
0
                if (r) goto block_err;
2784
0
                cr->mate_line += rec + 1;
2785
2786
                //cr->name_len = sprintf(name, "%d", name_id++);
2787
                //cr->name = DSTRING_LEN(name_ds);
2788
                //dstring_nappend(name_ds, name, cr->name_len);
2789
2790
0
                cr->mate_ref_id = -1;
2791
0
                cr->tlen = INT64_MIN;
2792
0
                cr->mate_pos = 0;
2793
0
            } else  {
2794
0
                cr->mate_flags = 0;
2795
0
                cr->tlen = INT64_MIN;
2796
0
            }
2797
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2798
0
                if (ds & CRAM_TS) {
2799
0
                    r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2800
0
                    if (r) return r;
2801
0
                } else {
2802
0
                    cr->mate_flags = 0;
2803
0
                    cr->tlen = INT64_MIN;
2804
0
                }
2805
0
            }
2806
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2807
0
            if (ds & CRAM_TS) {
2808
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2809
0
                if (r) return r;
2810
0
            } else {
2811
0
                cr->mate_flags = 0;
2812
0
                cr->tlen = INT64_MIN;
2813
0
            }
2814
0
        } else {
2815
0
            cr->mate_flags = 0;
2816
0
            cr->tlen = INT64_MIN;
2817
0
        }
2818
        /*
2819
        else if (!name[0]) {
2820
            //name[0] = '?'; name[1] = 0;
2821
            //cr->name_len = 1;
2822
            //cr->name=  DSTRING_LEN(s->name_ds);
2823
            //dstring_nappend(s->name_ds, "?", 1);
2824
2825
            cr->mate_ref_id = -1;
2826
            cr->tlen = 0;
2827
            cr->mate_pos = 0;
2828
        }
2829
        */
2830
2831
        /* Auxiliary tags */
2832
0
        has_MD = has_NM = 0;
2833
0
        if (CRAM_MAJOR_VERS(fd->version) == 1)
2834
0
            r |= cram_decode_aux_1_0(c, s, blk, cr);
2835
0
        else
2836
0
            r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
2837
0
        if (r) goto block_err;
2838
2839
        /* Fake up dynamic string growth and appending */
2840
0
        if (ds & CRAM_RL) {
2841
0
            cr->seq = BLOCK_SIZE(s->seqs_blk);
2842
0
            BLOCK_RESIZE(s->seqs_blk, cr->seq + cr->len);
2843
0
            seq = (char *)BLOCK_END(s->seqs_blk);
2844
0
            BLOCK_SIZE(s->seqs_blk) += cr->len;
2845
2846
0
            if (!seq)
2847
0
                goto block_err;
2848
2849
0
            cr->qual = BLOCK_SIZE(s->qual_blk);
2850
0
            BLOCK_RESIZE(s->qual_blk, cr->qual + cr->len);
2851
0
            qual = (char *)BLOCK_END(s->qual_blk);
2852
0
            BLOCK_SIZE(s->qual_blk) += cr->len;
2853
2854
0
            if (!s->ref)
2855
0
                memset(seq, '=', cr->len);
2856
0
        }
2857
2858
0
        if (!(bf & BAM_FUNMAP)) {
2859
0
            if ((ds & CRAM_AP) && cr->apos <= 0) {
2860
0
                hts_log_error("Read has alignment position %"PRId64
2861
0
                              " but no unmapped flag",
2862
0
                              cr->apos);
2863
0
                goto block_err;
2864
0
            }
2865
            /* Decode sequence and generate CIGAR */
2866
0
            if (ds & (CRAM_SEQ | CRAM_MQ)) {
2867
0
                r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
2868
0
                                     has_MD, has_NM);
2869
0
                if (r) goto block_err;
2870
0
            } else {
2871
0
                cr->cigar = 0;
2872
0
                cr->ncigar = 0;
2873
0
                cr->aend = cr->apos;
2874
0
                cr->mqual = 0;
2875
0
            }
2876
0
        } else {
2877
0
            int out_sz2 = cr->len;
2878
2879
            //puts("Unmapped");
2880
0
            cr->cigar = 0;
2881
0
            cr->ncigar = 0;
2882
0
            cr->aend = cr->apos;
2883
0
            cr->mqual = 0;
2884
2885
0
            if (ds & CRAM_BA && cr->len) {
2886
0
                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
2887
0
                r |= c->comp_hdr->codecs[DS_BA]
2888
0
                                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2889
0
                                         (char *)seq, &out_sz2);
2890
0
                if (r) goto block_err;
2891
0
            }
2892
2893
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2894
0
                out_sz2 = cr->len;
2895
0
                if (ds & CRAM_QS && cr->len >= 0) {
2896
0
                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
2897
0
                    r |= c->comp_hdr->codecs[DS_QS]
2898
0
                                    ->decode(s, c->comp_hdr->codecs[DS_QS],
2899
0
                                             blk, qual, &out_sz2);
2900
0
                    if (r) goto block_err;
2901
0
                }
2902
0
            } else {
2903
0
                if (ds & CRAM_RL)
2904
0
                    memset(qual, 255, cr->len);
2905
0
            }
2906
0
        }
2907
2908
0
        if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
2909
0
            int i, j;
2910
0
            for (i = 0, j = cr->len-1; i < j; i++, j--) {
2911
0
                unsigned char c;
2912
0
                c = qual[i];
2913
0
                qual[i] = qual[j];
2914
0
                qual[j] = c;
2915
0
            }
2916
0
        }
2917
0
    }
2918
2919
500
    pthread_mutex_lock(&fd->ref_lock);
2920
500
    if (refs) {
2921
18
        int i;
2922
33
        for (i = 0; i < fd->refs->nref; i++) {
2923
15
            if (refs[i])
2924
0
                cram_ref_decr(fd->refs, i);
2925
15
        }
2926
18
        free(refs);
2927
18
        refs = NULL;
2928
482
    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2929
0
        cram_ref_decr(fd->refs, ref_id);
2930
0
    }
2931
500
    pthread_mutex_unlock(&fd->ref_lock);
2932
2933
    /* Resolve mate pair cross-references between recs within this slice */
2934
500
    r |= cram_decode_slice_xref(s, fd->required_fields);
2935
2936
    // Free the original blocks as we no longer need these.
2937
500
    {
2938
500
        int i;
2939
2.48k
        for (i = 0; i < s->hdr->num_blocks; i++) {
2940
1.98k
            cram_block *b = s->block[i];
2941
1.98k
            cram_free_block(b);
2942
1.98k
            s->block[i] = NULL;
2943
1.98k
        }
2944
500
    }
2945
2946
    // Also see initial BLOCK_RESIZE_EXACT at top of function.
2947
    // As we grow blocks we overallocate by up to 50%. So shrink
2948
    // back to their final sizes here.
2949
    //
2950
    //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2951
    //      (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2952
    //      (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2953
    //      (int)s->name_blk->byte, (int)s->name_blk->alloc,
2954
    //      (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2955
500
    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2956
500
    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2957
500
    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2958
500
    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2959
2960
500
    return r;
2961
2962
18
 block_err:
2963
18
    if (refs) {
2964
12
        int i;
2965
12
        pthread_mutex_lock(&fd->ref_lock);
2966
30
        for (i = 0; i < fd->refs->nref; i++) {
2967
18
            if (refs[i])
2968
0
                cram_ref_decr(fd->refs, i);
2969
18
        }
2970
12
        free(refs);
2971
12
        pthread_mutex_unlock(&fd->ref_lock);
2972
12
    }
2973
2974
18
    return -1;
2975
500
}
2976
2977
typedef struct {
2978
    cram_fd *fd;
2979
    cram_container *c;
2980
    cram_slice *s;
2981
    sam_hdr_t *h;
2982
    int exit_code;
2983
} cram_decode_job;
2984
2985
0
void *cram_decode_slice_thread(void *arg) {
2986
0
    cram_decode_job *j = (cram_decode_job *)arg;
2987
2988
0
    j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2989
2990
0
    return j;
2991
0
}
2992
2993
/*
2994
 * Spawn a multi-threaded version of cram_decode_slice().
2995
 */
2996
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2997
587
                         sam_hdr_t *bfd) {
2998
587
    cram_decode_job *j;
2999
587
    int nonblock;
3000
3001
587
    if (!fd->pool)
3002
587
        return cram_decode_slice(fd, c, s, bfd);
3003
3004
0
    if (!(j = malloc(sizeof(*j))))
3005
0
        return -1;
3006
3007
0
    j->fd = fd;
3008
0
    j->c  = c;
3009
0
    j->s  = s;
3010
0
    j->h  = bfd;
3011
3012
0
    nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
3013
3014
0
    int saved_errno = errno;
3015
0
    errno = 0;
3016
0
    if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
3017
0
                                  j, nonblock)) {
3018
        /* Would block */
3019
0
        if (errno != EAGAIN)
3020
0
            return -1;
3021
0
        fd->job_pending = j;
3022
0
    } else {
3023
0
        fd->job_pending = NULL;
3024
0
    }
3025
0
    errno = saved_errno;
3026
3027
    // flush too
3028
0
    return 0;
3029
0
}
3030
3031
3032
/* ----------------------------------------------------------------------
3033
 * CRAM sequence iterators.
3034
 */
3035
3036
/*
3037
 * Converts a cram in-memory record into a bam in-memory record. We
3038
 * pass a pointer to a bam_seq_t pointer along with the a pointer to
3039
 * the allocated size. These can initially be pointers to NULL and zero.
3040
 *
3041
 * This function will reallocate the bam buffer as required and update
3042
 * (*bam)->alloc accordingly, allowing it to be used within a loop
3043
 * efficiently without needing to allocate new bam objects over and
3044
 * over again.
3045
 *
3046
 * Returns the used size of the bam record on success
3047
 *         -1 on failure.
3048
 */
3049
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
3050
0
                cram_record *cr, int rec, bam_seq_t **bam) {
3051
0
    int ret, rg_len;
3052
0
    char name_a[1024], *name;
3053
0
    int name_len;
3054
0
    char *aux;
3055
0
    char *seq, *qual;
3056
0
    sam_hrecs_t *bfd = sh->hrecs;
3057
3058
    /* Assign names if not explicitly set */
3059
0
    if (fd->required_fields & SAM_QNAME) {
3060
0
        if (cr->name_len) {
3061
0
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3062
0
            name_len = cr->name_len;
3063
0
        } else {
3064
0
            name = name_a;
3065
0
            if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
3066
0
                s->crecs[cr->mate_line].name_len > 0) {
3067
                // Copy our mate if non-zero.
3068
0
                memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
3069
0
                       s->crecs[cr->mate_line].name_len);
3070
0
                name = name_a + s->crecs[cr->mate_line].name_len;
3071
0
            } else {
3072
                // Otherwise generate a name based on prefix
3073
0
                name_len = strlen(fd->prefix);
3074
0
                memcpy(name, fd->prefix, name_len);
3075
0
                name += name_len;
3076
0
                *name++ = ':';
3077
0
                if (cr->mate_line >= 0 && cr->mate_line < rec) {
3078
0
                    name = (char *)append_uint64((unsigned char *)name,
3079
0
                                                 s->hdr->record_counter +
3080
0
                                                 cr->mate_line + 1);
3081
0
                } else {
3082
0
                    name = (char *)append_uint64((unsigned char *)name,
3083
0
                                                 s->hdr->record_counter +
3084
0
                                                 rec + 1);
3085
0
                }
3086
0
            }
3087
0
            name_len = name - name_a;
3088
0
            name = name_a;
3089
0
        }
3090
0
    } else {
3091
0
        name = "?";
3092
0
        name_len = 1;
3093
0
    }
3094
3095
    /* Generate BAM record */
3096
0
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
3097
0
        return -1;
3098
0
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3099
3100
0
    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3101
0
        if (!BLOCK_DATA(s->seqs_blk))
3102
0
            return -1;
3103
0
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3104
0
    } else {
3105
0
        seq = "*";
3106
0
        cr->len = 0;
3107
0
    }
3108
3109
0
    if (fd->required_fields & SAM_QUAL) {
3110
0
        if (!BLOCK_DATA(s->qual_blk))
3111
0
            return -1;
3112
0
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3113
0
    } else {
3114
0
        qual = NULL;
3115
0
    }
3116
3117
0
    ret = bam_set1(*bam,
3118
0
                   name_len, name,
3119
0
                   cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
3120
0
                   cr->ncigar, &s->cigar[cr->cigar],
3121
0
                   cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
3122
0
                   cr->len, seq, qual,
3123
0
                   cr->aux_size + rg_len);
3124
0
    if (ret < 0) {
3125
0
        return ret;
3126
0
    }
3127
3128
0
    aux = (char *)bam_aux(*bam);
3129
3130
    /* Auxiliary strings */
3131
0
    if (cr->aux_size != 0) {
3132
0
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3133
0
        aux += cr->aux_size;
3134
0
        (*bam)->l_data += cr->aux_size;
3135
0
    }
3136
3137
    /* RG:Z: */
3138
0
    if (rg_len > 0) {
3139
0
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3140
0
        int len = bfd->rg[cr->rg].name_len;
3141
0
        memcpy(aux, bfd->rg[cr->rg].name, len);
3142
0
        aux += len;
3143
0
        *aux++ = 0;
3144
0
        (*bam)->l_data += rg_len;
3145
0
    }
3146
3147
0
    return (*bam)->l_data;
3148
0
}
3149
3150
/*
3151
 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3152
 */
3153
3154
/*
3155
 * Load first container.
3156
 * Called when fd->ctr is NULL>
3157
 *
3158
 * Returns container on success
3159
 *        NULL on failure.
3160
 */
3161
3.50k
static cram_container *cram_first_slice(cram_fd *fd) {
3162
3.50k
    cram_container *c;
3163
3164
12.1k
    do {
3165
12.1k
        if (fd->ctr)
3166
8.67k
            cram_free_container(fd->ctr);
3167
3168
12.1k
        if (!(c = fd->ctr = cram_read_container(fd)))
3169
218
            return NULL;
3170
11.9k
        c->curr_slice_mt = c->curr_slice;
3171
11.9k
    } while (c->length == 0);
3172
3173
    /*
3174
     * The first container may be a result of a sub-range query.
3175
     * In which case it may still not be the optimal starting point
3176
     * due to skipped containers/slices in the index.
3177
     */
3178
    // No need for locks here as we're in the main thread.
3179
3.28k
    if (fd->range.refid != -2) {
3180
0
        while (c->ref_seq_id != -2 &&
3181
0
               (c->ref_seq_id < fd->range.refid ||
3182
0
                (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3183
0
                 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3184
0
            if (0 != cram_seek(fd, c->length, SEEK_CUR))
3185
0
                return NULL;
3186
0
            cram_free_container(fd->ctr);
3187
0
            do {
3188
0
                if (!(c = fd->ctr = cram_read_container(fd)))
3189
0
                    return NULL;
3190
0
            } while (c->length == 0);
3191
0
        }
3192
3193
0
        if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3194
0
            fd->eof = 1;
3195
0
            return NULL;
3196
0
        }
3197
0
    }
3198
3199
3.28k
    if (!(c->comp_hdr_block = cram_read_block(fd)))
3200
38
        return NULL;
3201
3.24k
    if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3202
51
        return NULL;
3203
3204
3.19k
    c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3205
3.19k
    if (!c->comp_hdr)
3206
2.91k
        return NULL;
3207
286
    if (!c->comp_hdr->AP_delta &&
3208
0
        sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3209
0
        pthread_mutex_lock(&fd->ref_lock);
3210
0
        fd->unsorted = 1;
3211
0
        pthread_mutex_unlock(&fd->ref_lock);
3212
0
    }
3213
3214
286
    return c;
3215
3.19k
}
3216
3217
4.00k
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3218
4.00k
    cram_container *c_curr;  // container being consumed via cram_get_seq()
3219
4.00k
    cram_slice *s_curr = NULL;
3220
3221
    // Populate the first container if unknown.
3222
4.00k
    if (!(c_curr = fd->ctr)) {
3223
3.50k
        if (!(c_curr = cram_first_slice(fd)))
3224
3.21k
            return NULL;
3225
3.50k
    }
3226
3227
    // Discard previous slice
3228
786
    if ((s_curr = c_curr->slice)) {
3229
500
        c_curr->slice = NULL;
3230
500
        cram_free_slice(s_curr);
3231
500
        s_curr = NULL;
3232
500
    }
3233
3234
    // If we've consumed all slices in this container, also discard
3235
    // the container too.
3236
786
    if (c_curr->curr_slice == c_curr->max_slice) {
3237
73
        if (fd->ctr == c_curr)
3238
73
            fd->ctr = NULL;
3239
73
        if (fd->ctr_mt == c_curr)
3240
31
            fd->ctr_mt = NULL;
3241
73
        cram_free_container(c_curr);
3242
73
        c_curr = NULL;
3243
73
    }
3244
3245
786
    if (!fd->ctr_mt)
3246
317
        fd->ctr_mt = c_curr;
3247
3248
    // Fetch the next slice (and the container if necessary).
3249
    //
3250
    // If single threaded this loop bails out as soon as it finds
3251
    // a slice in range.  In this case c_next and c_curr end up being
3252
    // the same thing.
3253
    //
3254
    // If multi-threaded, we loop until we have filled out
3255
    // thread pool input queue.  Here c_next and c_curr *may* differ, as
3256
    // can fd->ctr and fd->ctr_mt.
3257
786
    for (;;) {
3258
786
        cram_container *c_next = fd->ctr_mt;
3259
786
        cram_slice *s_next = NULL;
3260
3261
        // Next slice; either from the last job we failed to push
3262
        // to the input queue or via more I/O.
3263
786
        if (fd->job_pending) {
3264
0
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3265
0
            c_next = j->c;
3266
0
            s_next = j->s;
3267
0
            free(fd->job_pending);
3268
0
            fd->job_pending = NULL;
3269
786
        } else if (!fd->ooc) {
3270
888
        empty_container:
3271
888
            if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3272
                // new container
3273
4.28k
                for(;;) {
3274
4.28k
                    if (!(c_next = cram_read_container(fd))) {
3275
25
                        if (fd->pool) {
3276
0
                            fd->ooc = 1;
3277
0
                            break;
3278
0
                        }
3279
3280
25
                        return NULL;
3281
25
                    }
3282
4.26k
                    c_next->curr_slice_mt = c_next->curr_slice;
3283
3284
4.26k
                    if (c_next->length != 0)
3285
150
                        break;
3286
3287
4.11k
                    cram_free_container(c_next);
3288
4.11k
                }
3289
150
                if (fd->ooc)
3290
0
                    break;
3291
3292
//                printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd,
3293
//                       c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1,
3294
//                       fd->range.refid, fd->range.start, fd->range.end);
3295
3296
                /* Skip containers not yet spanning our range */
3297
150
                if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3298
                    // ref_id beyond end of range; bail out
3299
0
                    if (c_next->ref_seq_id != fd->range.refid) {
3300
0
                        cram_free_container(c_next);
3301
0
                        fd->ctr_mt = NULL;
3302
0
                        fd->ooc = 1;
3303
0
                        break;
3304
0
                    }
3305
3306
                    // position beyond end of range; bail out
3307
0
                    if (fd->range.refid != -1 &&
3308
0
                        c_next->ref_seq_start > fd->range.end) {
3309
0
                        cram_free_container(c_next);
3310
0
                        fd->ctr_mt = NULL;
3311
0
                        fd->ooc = 1;
3312
0
                        break;
3313
0
                    }
3314
3315
                    // Before start of range; skip to next container.
3316
                    // Can't use cram_seek() here as it drops in-progress
3317
                    // multi-threaded decode jobs, so call hseek() directly.
3318
0
                    if (fd->range.refid != -1 &&
3319
0
                        c_next->ref_seq_start + c_next->ref_seq_span-1 <
3320
0
                        fd->range.start) {
3321
0
                        off_t skip_length = c_next->length;
3322
0
                        cram_free_container(c_next);
3323
0
                        c_next = NULL;
3324
0
                        fd->ooc = 0;
3325
0
                        if (hseek(fd->fp, skip_length, SEEK_CUR) < 0)
3326
0
                            return NULL;
3327
0
                        continue;
3328
0
                    }
3329
0
                }
3330
3331
                // Container is valid range, so remember it for restarting
3332
                // this function.
3333
150
                fd->ctr_mt = c_next;
3334
3335
150
                if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3336
11
                    return NULL;
3337
139
                if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3338
16
                    return NULL;
3339
3340
123
                c_next->comp_hdr =
3341
123
                    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3342
123
                if (!c_next->comp_hdr)
3343
9
                    return NULL;
3344
3345
114
                if (!c_next->comp_hdr->AP_delta &&
3346
0
                    sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3347
0
                    pthread_mutex_lock(&fd->ref_lock);
3348
0
                    fd->unsorted = 1;
3349
0
                    pthread_mutex_unlock(&fd->ref_lock);
3350
0
                }
3351
114
            }
3352
3353
827
            if (c_next->num_records == 0) {
3354
102
                if (fd->ctr == c_next)
3355
6
                    fd->ctr = NULL;
3356
102
                if (c_curr == c_next)
3357
6
                    c_curr = NULL;
3358
102
                if (fd->ctr_mt == c_next)
3359
102
                    fd->ctr_mt = NULL;
3360
102
                cram_free_container(c_next);
3361
102
                c_next = NULL;
3362
102
                goto empty_container;
3363
102
            }
3364
3365
725
            if (!(s_next = c_next->slice = cram_read_slice(fd)))
3366
138
                return NULL;
3367
3368
587
            s_next->slice_num = ++c_next->curr_slice_mt;
3369
587
            s_next->curr_rec = 0;
3370
587
            s_next->max_rec = s_next->hdr->num_records;
3371
3372
587
            s_next->last_apos = s_next->hdr->ref_seq_start;
3373
3374
            // We know the container overlaps our range, but with multi-slice
3375
            // containers we may have slices that do not.  Skip these also.
3376
587
            if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3377
                // ref_id beyond end of range; bail out
3378
0
                if (s_next->hdr->ref_seq_id != fd->range.refid) {
3379
0
                    fd->ooc = 1;
3380
0
                    cram_free_slice(s_next);
3381
0
                    c_next->slice = s_next = NULL;
3382
0
                    break;
3383
0
                }
3384
3385
                // position beyond end of range; bail out
3386
0
                if (fd->range.refid != -1 &&
3387
0
                    s_next->hdr->ref_seq_start > fd->range.end) {
3388
0
                    fd->ooc = 1;
3389
0
                    cram_free_slice(s_next);
3390
0
                    c_next->slice = s_next = NULL;
3391
0
                    break;
3392
0
                }
3393
3394
                // before start of range; skip to next slice
3395
0
                if (fd->range.refid != -1 &&
3396
0
                    s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3397
0
                    fd->range.start) {
3398
0
                    cram_free_slice(s_next);
3399
0
                    c_next->slice = s_next = NULL;
3400
0
                    continue;
3401
0
                }
3402
0
            }
3403
587
        } // end: if (!fd->ooc)
3404
3405
587
        if (!c_next || !s_next)
3406
0
            break;
3407
3408
        // Decode the slice, either right now (non-threaded) or by pushing
3409
        // it to the a decode queue (threaded).
3410
587
        if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3411
87
            hts_log_error("Failure to decode slice");
3412
87
            cram_free_slice(s_next);
3413
87
            c_next->slice = NULL;
3414
87
            return NULL;
3415
87
        }
3416
3417
        // No thread pool, so don't loop again
3418
500
        if (!fd->pool) {
3419
500
            c_curr = c_next;
3420
500
            s_curr = s_next;
3421
500
            break;
3422
500
        }
3423
3424
        // With thread pool, but we have a job pending so our decode queue
3425
        // is full.
3426
0
        if (fd->job_pending)
3427
0
            break;
3428
3429
        // Otherwise we're threaded with room in the decode input queue, so
3430
        // keep reading slices for decode.
3431
        // Push it a bit far, to qsize in queue rather than pending arrival,
3432
        // as cram tends to be a bit bursty in decode timings.
3433
0
        if (hts_tpool_process_len(fd->rqueue) >
3434
0
            hts_tpool_process_qsize(fd->rqueue))
3435
0
            break;
3436
0
    } // end of for(;;)
3437
3438
3439
    // When not threaded we've already have c_curr and s_curr.
3440
    // Otherwise we need get them by pulling off the decode output queue.
3441
500
    if (fd->pool) {
3442
0
        hts_tpool_result *res;
3443
0
        cram_decode_job *j;
3444
3445
0
        if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
3446
0
            fd->eof = 1;
3447
0
            return NULL;
3448
0
        }
3449
3450
0
        res = hts_tpool_next_result_wait(fd->rqueue);
3451
3452
0
        if (!res || !hts_tpool_result_data(res)) {
3453
0
            hts_log_error("Call to hts_tpool_next_result failed");
3454
0
            return NULL;
3455
0
        }
3456
3457
0
        j = (cram_decode_job *)hts_tpool_result_data(res);
3458
0
        c_curr = j->c;
3459
0
        s_curr = j->s;
3460
3461
0
        if (j->exit_code != 0) {
3462
0
            hts_log_error("Slice decode failure");
3463
0
            fd->eof = 0;
3464
0
            hts_tpool_delete_result(res, 1);
3465
0
            return NULL;
3466
0
        }
3467
3468
0
        hts_tpool_delete_result(res, 1);
3469
0
    }
3470
3471
500
    *cp = c_curr;
3472
3473
    // Update current slice being processed (as opposed to current
3474
    // slice in the multi-threaded reahead.
3475
500
    fd->ctr = c_curr;
3476
500
    if (c_curr) {
3477
500
        c_curr->slice = s_curr;
3478
500
        if (s_curr)
3479
500
            c_curr->curr_slice = s_curr->slice_num;
3480
500
    }
3481
500
    if (s_curr)
3482
500
        s_curr->curr_rec = 0;
3483
0
    else
3484
0
        fd->eof = 1;
3485
3486
500
    return s_curr;
3487
500
}
3488
3489
/*
3490
 * Read the next cram record and return it.
3491
 * Note that to decode cram_record the caller will need to look up some data
3492
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3493
 * the next call to cram_get_seq (which may invalidate it).
3494
 *
3495
 * Returns record pointer on success (do not free)
3496
 *        NULL on failure
3497
 */
3498
3.50k
cram_record *cram_get_seq(cram_fd *fd) {
3499
3.50k
    cram_container *c;
3500
3.50k
    cram_slice *s;
3501
3502
4.00k
    for (;;) {
3503
4.00k
        c = fd->ctr;
3504
4.00k
        if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3505
0
            s = c->slice;
3506
4.00k
        } else {
3507
4.00k
            if (!(s = cram_next_slice(fd, &c)))
3508
3.50k
                return NULL;
3509
500
            continue; /* In case slice contains no records */
3510
4.00k
        }
3511
3512
        // No need to lock here as get_seq is running in the main thread,
3513
        // which is also the same one that does the range modifications.
3514
0
        if (fd->range.refid != -2) {
3515
0
            if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3516
                // Special case when looking for unmapped blocks at end.
3517
                // If these are mixed in with mapped data (c->ref_id == -2)
3518
                // then we need skip until we find the unmapped data, if at all
3519
0
                s->curr_rec++;
3520
0
                continue;
3521
0
            }
3522
0
            if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3523
0
                s->crecs[s->curr_rec].ref_id != -1) {
3524
                // Looking for a mapped read, but not there yet.  Special case
3525
                // as -1 (unmapped) shouldn't be considered < refid.
3526
0
                s->curr_rec++;
3527
0
                continue;
3528
0
            }
3529
3530
0
            if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3531
0
                fd->eof = 1;
3532
0
                cram_free_slice(s);
3533
0
                c->slice = NULL;
3534
0
                return NULL;
3535
0
            }
3536
3537
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3538
0
                fd->eof = 1;
3539
0
                cram_free_slice(s);
3540
0
                c->slice = NULL;
3541
0
                return NULL;
3542
0
            }
3543
3544
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3545
0
                s->curr_rec++;
3546
0
                continue;
3547
0
            }
3548
0
        }
3549
3550
0
        break;
3551
0
    }
3552
3553
0
    fd->ctr = c;
3554
0
    c->slice = s;
3555
0
    return &s->crecs[s->curr_rec++];
3556
3.50k
}
3557
3558
/*
3559
 * Read the next cram record and convert it to a bam_seq_t struct.
3560
 *
3561
 * Returns >= 0 success (number of bytes written to *bam)
3562
 *        -1 on EOF or failure (check fd->err)
3563
 */
3564
3.50k
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3565
3.50k
    cram_record *cr;
3566
3.50k
    cram_container *c;
3567
3.50k
    cram_slice *s;
3568
3569
3.50k
    if (!(cr = cram_get_seq(fd)))
3570
3.50k
        return -1;
3571
3572
0
    c = fd->ctr;
3573
0
    s = c->slice;
3574
3575
0
    return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
3576
3.50k
}
3577
3578
/*
3579
 * Drains and frees the decode read-queue for a multi-threaded reader.
3580
 */
3581
4.80k
void cram_drain_rqueue(cram_fd *fd) {
3582
4.80k
    cram_container *lc = NULL;
3583
3584
4.80k
    if (!fd->pool || !fd->rqueue)
3585
4.80k
        return;
3586
3587
    // drain queue of any in-flight decode jobs
3588
0
    while (!hts_tpool_process_empty(fd->rqueue)) {
3589
0
        hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
3590
0
        if (!r)
3591
0
            break;
3592
0
        cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
3593
0
        if (j->c->slice == j->s)
3594
0
            j->c->slice = NULL;
3595
0
        if (j->c != lc) {
3596
0
            if (lc) {
3597
0
                if (fd->ctr == lc)
3598
0
                    fd->ctr = NULL;
3599
0
                if (fd->ctr_mt == lc)
3600
0
                    fd->ctr_mt = NULL;
3601
0
                cram_free_container(lc);
3602
0
            }
3603
0
            lc = j->c;
3604
0
        }
3605
0
        cram_free_slice(j->s);
3606
0
        hts_tpool_delete_result(r, 1);
3607
0
    }
3608
3609
    // Also tidy up any pending decode job that we didn't submit to the workers
3610
    // due to the input queue being full.
3611
0
    if (fd->job_pending) {
3612
0
        cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3613
0
        if (j->c->slice == j->s)
3614
0
            j->c->slice = NULL;
3615
0
        if (j->c != lc) {
3616
0
            if (lc) {
3617
0
                if (fd->ctr == lc)
3618
0
                    fd->ctr = NULL;
3619
0
                if (fd->ctr_mt == lc)
3620
0
                    fd->ctr_mt = NULL;
3621
0
                cram_free_container(lc);
3622
0
            }
3623
0
            lc = j->c;
3624
0
        }
3625
0
        cram_free_slice(j->s);
3626
0
        free(j);
3627
0
        fd->job_pending = NULL;
3628
0
    }
3629
3630
0
    if (lc) {
3631
0
        if (fd->ctr == lc)
3632
0
            fd->ctr = NULL;
3633
0
        if (fd->ctr_mt == lc)
3634
0
            fd->ctr_mt = NULL;
3635
0
        cram_free_container(lc);
3636
0
    }
3637
0
}