Coverage Report

Created: 2025-03-17 07:02

/src/htslib/cram/cram_decode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * - In-memory decoding of CRAM data structures.
33
 * - Iterator for reading CRAM record by record.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdio.h>
40
#include <errno.h>
41
#include <assert.h>
42
#include <stdlib.h>
43
#include <string.h>
44
#include <zlib.h>
45
#include <sys/types.h>
46
#include <sys/stat.h>
47
#include <math.h>
48
#include <stdint.h>
49
#include <inttypes.h>
50
51
#include "cram.h"
52
#include "os.h"
53
#include "../htslib/hts.h"
54
55
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
56
//#define USE_X
57
58
/* ----------------------------------------------------------------------
59
 * CRAM compression headers
60
 */
61
62
/*
63
 * Decodes the Tag Dictionary record in the preservation map
64
 * Updates the cram compression header.
65
 *
66
 * Returns number of bytes decoded on success
67
 *        -1 on failure
68
 */
69
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
70
729
                   cram_block_compression_hdr *h) {
71
729
    char *op = cp;
72
729
    unsigned char *dat;
73
729
    cram_block *b;
74
729
    int32_t blk_size = 0;
75
729
    int nTL, i, sz, err = 0;
76
77
729
    if (!(b = cram_new_block(0, 0)))
78
0
        return -1;
79
80
729
    if (h->TD_blk || h->TL) {
81
408
        hts_log_warning("More than one TD block found in compression header");
82
408
        cram_free_block(h->TD_blk);
83
408
        free(h->TL);
84
408
        h->TD_blk = NULL;
85
408
        h->TL = NULL;
86
408
    }
87
88
    /* Decode */
89
729
    blk_size = fd->vv.varint_get32(&cp, endp, &err);
90
729
    if (!blk_size) {
91
132
        h->nTL = 0;
92
132
        cram_free_block(b);
93
132
        return cp - op;
94
132
    }
95
96
597
    if (err || blk_size < 0 || endp - cp < blk_size) {
97
57
        cram_free_block(b);
98
57
        return -1;
99
57
    }
100
101
540
    BLOCK_APPEND(b, cp, blk_size);
102
540
    cp += blk_size;
103
540
    sz = cp - op;
104
    // Force nul termination if missing
105
540
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
106
357
        BLOCK_APPEND_CHAR(b, '\0');
107
108
    /* Set up TL lookup table */
109
540
    dat = BLOCK_DATA(b);
110
111
    // Count
112
6.29k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
113
5.75k
        nTL++;
114
32.1k
        while (dat[i])
115
26.3k
            i++;
116
5.75k
    }
117
118
    // Copy
119
540
    if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
120
0
        cram_free_block(b);
121
0
        return -1;
122
0
    }
123
6.29k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
124
5.75k
        h->TL[nTL++] = &dat[i];
125
32.1k
        while (dat[i])
126
26.3k
            i++;
127
5.75k
    }
128
540
    h->TD_blk = b;
129
540
    h->nTL = nTL;
130
131
540
    return sz;
132
133
0
 block_err:
134
0
    cram_free_block(b);
135
0
    return -1;
136
540
}
137
138
/*
139
 * Decodes a CRAM block compression header.
140
 * Returns header ptr on success
141
 *         NULL on failure
142
 */
143
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
144
7.80k
                                                           cram_block *b) {
145
7.80k
    char *cp, *endp, *cp_copy;
146
7.80k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
147
7.80k
    int i, err = 0;
148
7.80k
    int32_t map_size = 0, map_count = 0;
149
150
7.80k
    if (!hdr)
151
0
        return NULL;
152
153
7.80k
    if (b->method != RAW) {
154
5.36k
        if (cram_uncompress_block(b)) {
155
4.63k
            free(hdr);
156
4.63k
            return NULL;
157
4.63k
        }
158
5.36k
    }
159
160
3.16k
    cp = (char *)b->data;
161
3.16k
    endp = cp + b->uncomp_size;
162
163
3.16k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
164
2.93k
        hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
165
2.93k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
166
0
            hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
167
0
            hdr->ref_seq_span  = fd->vv.varint_get64(&cp, endp, &err);
168
2.93k
        } else {
169
2.93k
            hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
170
2.93k
            hdr->ref_seq_span  = fd->vv.varint_get32(&cp, endp, &err);
171
2.93k
        }
172
2.93k
        hdr->num_records   = fd->vv.varint_get32(&cp, endp, &err);
173
2.93k
        hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
174
2.93k
        if (hdr->num_landmarks < 0 ||
175
2.93k
            hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
176
2.93k
            endp - cp < hdr->num_landmarks) {
177
171
            free(hdr);
178
171
            return NULL;
179
171
        }
180
2.76k
        if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
181
0
            free(hdr);
182
0
            return NULL;
183
0
        }
184
112k
        for (i = 0; i < hdr->num_landmarks; i++)
185
110k
            hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
186
2.76k
    }
187
188
2.99k
    hdr->preservation_map = kh_init(map);
189
190
2.99k
    memset(hdr->rec_encoding_map, 0,
191
2.99k
           CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
192
2.99k
    memset(hdr->tag_encoding_map, 0,
193
2.99k
           CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
194
195
2.99k
    if (!hdr->preservation_map) {
196
0
        cram_free_compression_header(hdr);
197
0
        return NULL;
198
0
    }
199
200
    /* Initialise defaults for preservation map */
201
2.99k
    hdr->read_names_included = 0;
202
2.99k
    hdr->AP_delta = 1;
203
2.99k
    hdr->qs_seq_orient = 1;
204
2.99k
    memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
205
206
    /* Preservation map */
207
2.99k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
208
2.99k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
209
89.0k
    for (i = 0; i < map_count; i++) {
210
86.3k
        pmap_t hd;
211
86.3k
        khint_t k;
212
86.3k
        int r;
213
214
86.3k
        if (endp - cp < 3) {
215
255
            cram_free_compression_header(hdr);
216
255
            return NULL;
217
255
        }
218
86.0k
        cp += 2;
219
86.0k
        switch(CRAM_KEY(cp[-2],cp[-1])) {
220
27
        case CRAM_KEY('M','I'): // was mapped QS included in V1.0
221
111
        case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
222
168
        case CRAM_KEY('P','I'): // was unmapped placed in V1.0
223
168
            hd.i = *cp++;
224
168
            break;
225
226
216
        case CRAM_KEY('R','N'):
227
216
            hd.i = *cp++;
228
216
            k = kh_put(map, hdr->preservation_map, "RN", &r);
229
216
            if (-1 == r) {
230
0
                cram_free_compression_header(hdr);
231
0
                return NULL;
232
0
            }
233
234
216
            kh_val(hdr->preservation_map, k) = hd;
235
216
            hdr->read_names_included = hd.i;
236
216
            break;
237
238
189
        case CRAM_KEY('A','P'):
239
189
            hd.i = *cp++;
240
189
            k = kh_put(map, hdr->preservation_map, "AP", &r);
241
189
            if (-1 == r) {
242
0
                cram_free_compression_header(hdr);
243
0
                return NULL;
244
0
            }
245
246
189
            kh_val(hdr->preservation_map, k) = hd;
247
189
            hdr->AP_delta = hd.i;
248
189
            break;
249
250
3.19k
        case CRAM_KEY('R','R'):
251
3.19k
            hd.i = *cp++;
252
3.19k
            k = kh_put(map, hdr->preservation_map, "RR", &r);
253
3.19k
            if (-1 == r) {
254
0
                cram_free_compression_header(hdr);
255
0
                return NULL;
256
0
            }
257
258
3.19k
            kh_val(hdr->preservation_map, k) = hd;
259
3.19k
            hdr->no_ref = !hd.i;
260
3.19k
            break;
261
262
357
        case CRAM_KEY('Q','O'):
263
357
            hd.i = *cp++;
264
357
            k = kh_put(map, hdr->preservation_map, "QO", &r);
265
357
            if (-1 == r) {
266
0
                cram_free_compression_header(hdr);
267
0
                return NULL;
268
0
            }
269
270
357
            kh_val(hdr->preservation_map, k) = hd;
271
357
            hdr->qs_seq_orient = hd.i;
272
357
            break;
273
274
942
        case CRAM_KEY('S','M'):
275
942
            if (endp - cp < 5) {
276
6
                cram_free_compression_header(hdr);
277
6
                return NULL;
278
6
            }
279
936
            hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
280
936
            hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
281
936
            hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
282
936
            hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
283
284
936
            hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
285
936
            hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
286
936
            hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
287
936
            hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
288
289
936
            hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
290
936
            hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
291
936
            hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
292
936
            hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
293
294
936
            hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
295
936
            hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
296
936
            hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
297
936
            hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
298
299
936
            hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
300
936
            hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
301
936
            hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
302
936
            hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
303
304
936
            hd.p = cp;
305
936
            cp += 5;
306
307
936
            k = kh_put(map, hdr->preservation_map, "SM", &r);
308
936
            if (-1 == r) {
309
0
                cram_free_compression_header(hdr);
310
0
                return NULL;
311
0
            }
312
936
            kh_val(hdr->preservation_map, k) = hd;
313
936
            break;
314
315
729
        case CRAM_KEY('T','D'): {
316
729
            int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
317
729
            if (sz < 0) {
318
57
                cram_free_compression_header(hdr);
319
57
                return NULL;
320
57
            }
321
322
672
            hd.p = cp;
323
672
            cp += sz;
324
325
672
            k = kh_put(map, hdr->preservation_map, "TD", &r);
326
672
            if (-1 == r) {
327
0
                cram_free_compression_header(hdr);
328
0
                return NULL;
329
0
            }
330
672
            kh_val(hdr->preservation_map, k) = hd;
331
672
            break;
332
672
        }
333
334
80.2k
        default:
335
80.2k
            hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
336
            // guess byte;
337
80.2k
            cp++;
338
80.2k
            break;
339
86.0k
        }
340
86.0k
    }
341
2.67k
    if (cp - cp_copy != map_size) {
342
189
        cram_free_compression_header(hdr);
343
189
        return NULL;
344
189
    }
345
346
    /* Record encoding map */
347
2.48k
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
348
2.48k
    map_count = fd->vv.varint_get32(&cp, endp, &err);
349
2.48k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
350
77.7k
    for (i = 0; i < map_count; i++) {
351
76.6k
        char *key = cp;
352
76.6k
        int32_t encoding = E_NULL;
353
76.6k
        int32_t size = 0;
354
76.6k
        ptrdiff_t offset;
355
76.6k
        cram_map *m;
356
76.6k
        enum cram_DS_ID ds_id;
357
76.6k
        enum cram_external_type type;
358
359
76.6k
        if (endp - cp < 4) {
360
129
            cram_free_compression_header(hdr);
361
129
            return NULL;
362
129
        }
363
364
76.4k
        cp += 2;
365
76.4k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
366
76.4k
        size     = fd->vv.varint_get32(&cp, endp, &err);
367
368
76.4k
        offset = cp - (char *)b->data;
369
370
76.4k
        if (encoding == E_NULL)
371
22.3k
            continue;
372
373
54.1k
        if (size < 0 || endp - cp < size) {
374
447
            cram_free_compression_header(hdr);
375
447
            return NULL;
376
447
        }
377
378
        //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
379
380
        /*
381
         * For CRAM1.0 CF and BF are Byte and not Int.
382
         * Practically speaking it makes no difference unless we have a
383
         * 1.0 format file that stores these in EXTERNAL as only then
384
         * does Byte vs Int matter.
385
         *
386
         * Neither this C code nor Java reference implementations did this,
387
         * so we gloss over it and treat them as int.
388
         */
389
53.6k
        ds_id = DS_CORE;
390
53.6k
        if (key[0] == 'B' && key[1] == 'F') {
391
165
            ds_id = DS_BF; type = E_INT;
392
53.5k
        } else if (key[0] == 'C' && key[1] == 'F') {
393
363
            ds_id = DS_CF; type = E_INT;
394
53.1k
        } else if (key[0] == 'R' && key[1] == 'I') {
395
177
            ds_id = DS_RI; type = E_INT;
396
52.9k
        } else if (key[0] == 'R' && key[1] == 'L') {
397
30
            ds_id = DS_RL; type = E_INT;
398
52.9k
        } else if (key[0] == 'A' && key[1] == 'P') {
399
1.13k
            ds_id = DS_AP;
400
1.13k
            type = is_v4 ? E_SLONG : E_INT;
401
51.8k
        } else if (key[0] == 'R' && key[1] == 'G') {
402
99
            ds_id = DS_RG;
403
99
            type = E_INT;
404
51.7k
        } else if (key[0] == 'M' && key[1] == 'F') {
405
60
            ds_id = DS_MF; type = E_INT;
406
51.6k
        } else if (key[0] == 'N' && key[1] == 'S') {
407
495
            ds_id = DS_NS; type = E_INT;
408
51.1k
        } else if (key[0] == 'N' && key[1] == 'P') {
409
2.41k
            ds_id = DS_NP;
410
2.41k
            type = is_v4 ? E_LONG : E_INT;
411
48.7k
        } else if (key[0] == 'T' && key[1] == 'S') {
412
69
            ds_id = DS_TS;
413
69
            type = is_v4 ? E_SLONG : E_INT;
414
48.6k
        } else if (key[0] == 'N' && key[1] == 'F') {
415
144
            ds_id = DS_NF; type = E_INT;
416
48.5k
        } else if (key[0] == 'T' && key[1] == 'C') {
417
141
            ds_id = DS_TC; type = E_BYTE;
418
48.3k
        } else if (key[0] == 'T' && key[1] == 'N') {
419
3
            ds_id = DS_TN; type = E_INT;
420
48.3k
        } else if (key[0] == 'F' && key[1] == 'N') {
421
3
            ds_id = DS_FN; type = E_INT;
422
48.3k
        } else if (key[0] == 'F' && key[1] == 'C') {
423
396
            ds_id = DS_FC; type = E_BYTE;
424
47.9k
        } else if (key[0] == 'F' && key[1] == 'P') {
425
72
            ds_id = DS_FP; type = E_INT;
426
47.9k
        } else if (key[0] == 'B' && key[1] == 'S') {
427
36
            ds_id = DS_BS; type = E_BYTE;
428
47.8k
        } else if (key[0] == 'I' && key[1] == 'N') {
429
69
            ds_id = DS_IN; type = E_BYTE_ARRAY;
430
47.8k
        } else if (key[0] == 'S' && key[1] == 'C') {
431
3
            ds_id = DS_SC; type = E_BYTE_ARRAY;
432
47.8k
        } else if (key[0] == 'D' && key[1] == 'L') {
433
90
            ds_id = DS_DL; type = E_INT;
434
47.7k
        } else if (key[0] == 'B' && key[1] == 'A') {
435
177
            ds_id = DS_BA; type = E_BYTE;
436
47.5k
        } else if (key[0] == 'B' && key[1] == 'B') {
437
141
            ds_id = DS_BB; type = E_BYTE_ARRAY;
438
47.4k
        } else if (key[0] == 'R' && key[1] == 'S') {
439
285
            ds_id = DS_RS; type = E_INT;
440
47.1k
        } else if (key[0] == 'P' && key[1] == 'D') {
441
3
            ds_id = DS_PD; type = E_INT;
442
47.1k
        } else if (key[0] == 'H' && key[1] == 'C') {
443
12
            ds_id = DS_HC; type = E_INT;
444
47.1k
        } else if (key[0] == 'M' && key[1] == 'Q') {
445
6.03k
            ds_id = DS_MQ; type = E_INT;
446
41.0k
        } else if (key[0] == 'R' && key[1] == 'N') {
447
30
            ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
448
41.0k
        } else if (key[0] == 'Q' && key[1] == 'S') {
449
240
            ds_id = DS_QS; type = E_BYTE;
450
40.7k
        } else if (key[0] == 'Q' && key[1] == 'Q') {
451
9
            ds_id = DS_QQ; type = E_BYTE_ARRAY;
452
40.7k
        } else if (key[0] == 'T' && key[1] == 'L') {
453
3
            ds_id = DS_TL; type = E_INT;
454
40.7k
        } else if (key[0] == 'T' && key[1] == 'M') {
455
40.7k
        } else if (key[0] == 'T' && key[1] == 'V') {
456
40.7k
        } else {
457
40.7k
            hts_log_warning("Unrecognised key: %.2s", key);
458
40.7k
        }
459
460
53.6k
        if (ds_id != DS_CORE) {
461
12.9k
            if (hdr->codecs[ds_id] != NULL) {
462
11.2k
                hts_log_warning("Codec for key %.2s defined more than once",
463
11.2k
                                key);
464
11.2k
                hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
465
11.2k
            }
466
12.9k
            hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
467
12.9k
                                                   type, fd->version, &fd->vv);
468
12.9k
            if (!hdr->codecs[ds_id]) {
469
822
                cram_free_compression_header(hdr);
470
822
                return NULL;
471
822
            }
472
12.9k
        }
473
474
52.8k
        cp += size;
475
476
        // Fill out cram_map purely for cram_dump to dump out.
477
52.8k
        m = malloc(sizeof(*m));
478
52.8k
        if (!m) {
479
0
            cram_free_compression_header(hdr);
480
0
            return NULL;
481
0
        }
482
52.8k
        m->key = CRAM_KEY(key[0], key[1]);
483
52.8k
        m->encoding = encoding;
484
52.8k
        m->size     = size;
485
52.8k
        m->offset   = offset;
486
52.8k
        m->codec = NULL;
487
488
52.8k
        m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
489
52.8k
        hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
490
52.8k
    }
491
1.09k
    if (cp - cp_copy != map_size) {
492
126
        cram_free_compression_header(hdr);
493
126
        return NULL;
494
126
    }
495
496
    /* Tag encoding map */
497
965
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
498
965
    map_count = fd->vv.varint_get32(&cp, endp, &err);
499
2.93k
    for (i = 0; i < map_count; i++) {
500
2.13k
        int32_t encoding = E_NULL;
501
2.13k
        int32_t size = 0;
502
2.13k
        cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
503
2.13k
        uint8_t key[3];
504
505
2.13k
        if (!m || endp - cp < 6) {
506
9
            free(m);
507
9
            cram_free_compression_header(hdr);
508
9
            return NULL;
509
9
        }
510
511
2.12k
        m->key = fd->vv.varint_get32(&cp, endp, &err);
512
2.12k
        key[0] = m->key>>16;
513
2.12k
        key[1] = m->key>>8;
514
2.12k
        key[2] = m->key;
515
2.12k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
516
2.12k
        size     = fd->vv.varint_get32(&cp, endp, &err);
517
518
2.12k
        m->encoding = encoding;
519
2.12k
        m->size     = size;
520
2.12k
        m->offset   = cp - (char *)b->data;
521
2.12k
        if (size < 0 || endp - cp < size ||
522
2.12k
            !(m->codec = cram_decoder_init(hdr, encoding, cp, size,
523
2.07k
                                           E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
524
159
            cram_free_compression_header(hdr);
525
159
            free(m);
526
159
            return NULL;
527
159
        }
528
529
1.96k
        cp += size;
530
531
1.96k
        m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
532
1.96k
        hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
533
1.96k
    }
534
797
    if (err || cp - cp_copy != map_size) {
535
170
        cram_free_compression_header(hdr);
536
170
        return NULL;
537
170
    }
538
539
627
    return hdr;
540
797
}
541
542
/*
543
 * Note we also need to scan through the record encoding map to
544
 * see which data series share the same block, either external or
545
 * CORE. For example if we need the BF data series but MQ and CF
546
 * are also encoded in the same block then we need to add those in
547
 * as a dependency in order to correctly decode BF.
548
 *
549
 * Returns 0 on success
550
 *        -1 on failure
551
 */
552
int cram_dependent_data_series(cram_fd *fd,
553
                               cram_block_compression_hdr *hdr,
554
480
                               cram_slice *s) {
555
480
    int *block_used;
556
480
    int core_used = 0;
557
480
    int i;
558
480
    static int i_to_id[] = {
559
480
        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
560
480
        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
561
480
        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
562
480
        DS_HC, DS_SC, DS_BB, DS_QQ,
563
480
    };
564
480
    uint32_t orig_ds;
565
566
    /*
567
     * Set the data_series bit field based on fd->required_fields
568
     * contents.
569
     */
570
480
    if (fd->required_fields && fd->required_fields != INT_MAX) {
571
0
        s->data_series = 0;
572
573
0
        if (fd->required_fields & SAM_QNAME)
574
0
            s->data_series |= CRAM_RN;
575
576
0
        if (fd->required_fields & SAM_FLAG)
577
0
            s->data_series |= CRAM_BF;
578
579
0
        if (fd->required_fields & SAM_RNAME)
580
0
            s->data_series |= CRAM_RI | CRAM_BF;
581
582
0
        if (fd->required_fields & SAM_POS)
583
0
            s->data_series |= CRAM_AP | CRAM_BF;
584
585
0
        if (fd->required_fields & SAM_MAPQ)
586
0
            s->data_series |= CRAM_MQ;
587
588
0
        if (fd->required_fields & SAM_CIGAR)
589
0
            s->data_series |= CRAM_CIGAR;
590
591
0
        if (fd->required_fields & SAM_RNEXT)
592
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
593
594
0
        if (fd->required_fields & SAM_PNEXT)
595
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
596
597
0
        if (fd->required_fields & SAM_TLEN)
598
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
599
0
                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
600
601
0
        if (fd->required_fields & SAM_SEQ)
602
0
            s->data_series |= CRAM_SEQ;
603
604
0
        if (!(fd->required_fields & SAM_AUX))
605
            // No easy way to get MD/NM without other tags at present
606
0
            s->decode_md = 0;
607
608
0
        if (fd->required_fields & SAM_QUAL)
609
0
            s->data_series |= CRAM_QUAL;
610
611
0
        if (fd->required_fields & SAM_AUX)
612
0
            s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
613
614
0
        if (fd->required_fields & SAM_RGAUX)
615
0
            s->data_series |= CRAM_RG | CRAM_BF;
616
617
        // Always uncompress CORE block
618
0
        if (cram_uncompress_block(s->block[0]))
619
0
            return -1;
620
480
    } else {
621
480
        s->data_series = CRAM_ALL;
622
623
2.35k
        for (i = 0; i < s->hdr->num_blocks; i++) {
624
1.88k
            if (cram_uncompress_block(s->block[i]))
625
9
                return -1;
626
1.88k
        }
627
628
471
        return 0;
629
480
    }
630
631
0
    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
632
0
    if (!block_used)
633
0
        return -1;
634
635
0
    do {
636
        /*
637
         * Also set data_series based on code prerequisites. Eg if we need
638
         * CRAM_QS then we also need to know CRAM_RL so we know how long it
639
         * is, or if we need FC/FP then we also need FN (number of features).
640
         *
641
         * It's not reciprocal though. We may be needing to decode FN
642
         * but have no need to decode FC, FP and cigar ops.
643
         */
644
0
        if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
645
0
        if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
646
0
        if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
647
0
        if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
648
0
        if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
649
0
        if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
650
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
651
0
        if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
652
0
        if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
653
0
        if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
654
0
        if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
655
656
        // cram_decode_seq() needs seq[] array
657
0
        if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
658
659
0
        if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
660
0
        if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
661
0
        if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
662
0
        if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
663
0
        if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
664
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
665
0
        if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
666
0
            s->data_series |= CRAM_CF;
667
0
        if (!hdr->read_names_included && s->data_series & CRAM_RN)
668
0
            s->data_series |= CRAM_CF | CRAM_NF;
669
0
        if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
670
0
            s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
671
0
        if (s->data_series & CRAM_FN) {
672
            // The CRAM_FN loop checks for reference length boundaries,
673
            // which needs a working seq_pos.  Some fields are fixed size
674
            // irrespective of if we decode (BS), but others need to know
675
            // the size of the string fetched back (SC, IN, BB).
676
0
            s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
677
0
        }
678
679
0
        orig_ds = s->data_series;
680
681
        // Find which blocks are in use.
682
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
683
0
            int bnum1, bnum2, j;
684
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
685
686
0
            if (!(s->data_series & (1<<i)))
687
0
                continue;
688
689
0
            if (!c)
690
0
                continue;
691
692
0
            bnum1 = cram_codec_to_id(c, &bnum2);
693
694
0
            for (;;) {
695
0
                switch (bnum1) {
696
0
                case -2:
697
0
                    break;
698
699
0
                case -1:
700
0
                    core_used = 1;
701
0
                    break;
702
703
0
                default:
704
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
705
0
                        if (s->block[j]->content_type == EXTERNAL &&
706
0
                            s->block[j]->content_id == bnum1) {
707
0
                            block_used[j] = 1;
708
0
                            if (cram_uncompress_block(s->block[j])) {
709
0
                                free(block_used);
710
0
                                return -1;
711
0
                            }
712
0
                        }
713
0
                    }
714
0
                    break;
715
0
                }
716
717
0
                if (bnum2 == -2 || bnum1 == bnum2)
718
0
                    break;
719
720
0
                bnum1 = bnum2; // 2nd pass
721
0
            }
722
0
        }
723
724
        // Tags too
725
0
        if ((fd->required_fields & SAM_AUX) ||
726
0
            (s->data_series & CRAM_aux)) {
727
0
            for (i = 0; i < CRAM_MAP_HASH; i++) {
728
0
                int bnum1, bnum2, j;
729
0
                cram_map *m = hdr->tag_encoding_map[i];
730
731
0
                while (m) {
732
0
                    cram_codec *c = m->codec;
733
0
                    if (!c) {
734
0
                        m = m->next;
735
0
                        continue;
736
0
                    }
737
738
0
                    bnum1 = cram_codec_to_id(c, &bnum2);
739
740
0
                    for (;;) {
741
0
                        switch (bnum1) {
742
0
                        case -2:
743
0
                            break;
744
745
0
                        case -1:
746
0
                            core_used = 1;
747
0
                            break;
748
749
0
                        default:
750
0
                            for (j = 0; j < s->hdr->num_blocks; j++) {
751
0
                                if (s->block[j]->content_type == EXTERNAL &&
752
0
                                    s->block[j]->content_id == bnum1) {
753
0
                                    block_used[j] = 1;
754
0
                                    if (cram_uncompress_block(s->block[j])) {
755
0
                                        free(block_used);
756
0
                                        return -1;
757
0
                                    }
758
0
                                }
759
0
                            }
760
0
                            break;
761
0
                        }
762
763
0
                        if (bnum2 == -2 || bnum1 == bnum2)
764
0
                            break;
765
766
0
                        bnum1 = bnum2; // 2nd pass
767
0
                    }
768
769
0
                    m = m->next;
770
0
                }
771
0
            }
772
0
        }
773
774
        // We now know which blocks are in used, so repeat and find
775
        // which other data series need to be added.
776
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
777
0
            int bnum1, bnum2, j;
778
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
779
780
0
            if (!c)
781
0
                continue;
782
783
0
            bnum1 = cram_codec_to_id(c, &bnum2);
784
785
0
            for (;;) {
786
0
                switch (bnum1) {
787
0
                case -2:
788
0
                    break;
789
790
0
                case -1:
791
0
                    if (core_used) {
792
                        //printf(" + data series %08x:\n", 1<<i);
793
0
                        s->data_series |= 1<<i;
794
0
                    }
795
0
                    break;
796
797
0
                default:
798
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
799
0
                        if (s->block[j]->content_type == EXTERNAL &&
800
0
                            s->block[j]->content_id == bnum1) {
801
0
                            if (block_used[j]) {
802
                                //printf(" + data series %08x:\n", 1<<i);
803
0
                                s->data_series |= 1<<i;
804
0
                            }
805
0
                        }
806
0
                    }
807
0
                    break;
808
0
                }
809
810
0
                if (bnum2 == -2 || bnum1 == bnum2)
811
0
                    break;
812
813
0
                bnum1 = bnum2; // 2nd pass
814
0
            }
815
0
        }
816
817
        // Tags too
818
0
        for (i = 0; i < CRAM_MAP_HASH; i++) {
819
0
            int bnum1, bnum2, j;
820
0
            cram_map *m = hdr->tag_encoding_map[i];
821
822
0
            while (m) {
823
0
                cram_codec *c = m->codec;
824
0
                if (!c) {
825
0
                    m = m->next;
826
0
                    continue;
827
0
                }
828
829
0
                bnum1 = cram_codec_to_id(c, &bnum2);
830
831
0
                for (;;) {
832
0
                    switch (bnum1) {
833
0
                    case -2:
834
0
                        break;
835
836
0
                    case -1:
837
                        //printf(" + data series %08x:\n", CRAM_aux);
838
0
                        s->data_series |= CRAM_aux;
839
0
                        break;
840
841
0
                    default:
842
0
                        for (j = 0; j < s->hdr->num_blocks; j++) {
843
0
                            if (s->block[j]->content_type == EXTERNAL &&
844
0
                                s->block[j]->content_id == bnum1) {
845
0
                                if (block_used[j]) {
846
                                    //printf(" + data series %08x:\n",
847
                                    //       CRAM_aux);
848
0
                                    s->data_series |= CRAM_aux;
849
0
                                }
850
0
                            }
851
0
                        }
852
0
                        break;
853
0
                    }
854
855
0
                    if (bnum2 == -2 || bnum1 == bnum2)
856
0
                        break;
857
858
0
                    bnum1 = bnum2; // 2nd pass
859
0
                }
860
861
0
                m = m->next;
862
0
            }
863
0
        }
864
0
    } while (orig_ds != s->data_series);
865
866
0
    free(block_used);
867
0
    return 0;
868
0
}
869
870
/*
871
 * Checks whether an external block is used solely by a single data series.
872
 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
873
 *         or 0 if not (E_NULL).
874
 */
875
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
876
0
                          int id) {
877
0
    int i, n_id = 0;
878
0
    enum cram_encoding e_type = 0;
879
880
0
    for (i = 0; i < DS_END; i++) {
881
0
        cram_codec *c;
882
0
        int bnum1, bnum2, old_n_id;
883
884
0
        if (!(c = hdr->codecs[i]))
885
0
            continue;
886
887
0
        bnum1 = cram_codec_to_id(c, &bnum2);
888
889
0
        old_n_id = n_id;
890
0
        if (bnum1 == id) {
891
0
            n_id++;
892
0
            e_type = c->codec;
893
0
        }
894
0
        if (bnum2 == id) {
895
0
            n_id++;
896
0
            e_type = c->codec;
897
0
        }
898
899
0
        if (n_id == old_n_id+2)
900
0
            n_id--; // len/val in same place counts once only.
901
0
    }
902
903
0
    return n_id == 1 ? e_type : 0;
904
0
}
905
906
/*
907
 * Attempts to estimate the size of some blocks so we can preallocate them
908
 * before decoding.  Although decoding will automatically grow the blocks,
909
 * it is typically more efficient to preallocate.
910
 */
911
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
912
                                int *qual_size, int *name_size,
913
471
                                int *q_id) {
914
471
    int bnum1, bnum2;
915
471
    cram_codec *cd;
916
917
471
    *qual_size = 0;
918
471
    *name_size = 0;
919
920
    /* Qual */
921
471
    cd = hdr->codecs[DS_QS];
922
471
    if (cd == NULL) return;
923
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
924
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
925
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
926
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
927
0
        if (b) *qual_size = b->uncomp_size;
928
0
        if (q_id && cd->codec == E_EXTERNAL)
929
0
            *q_id = bnum1;
930
0
    }
931
932
    /* Name */
933
0
    cd = hdr->codecs[DS_RN];
934
0
    if (cd == NULL) return;
935
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
936
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
937
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
938
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
939
0
        if (b) *name_size = b->uncomp_size;
940
0
    }
941
0
}
942
943
944
/* ----------------------------------------------------------------------
945
 * CRAM slices
946
 */
947
948
/*
949
 * Decodes a CRAM (un)mapped slice header block.
950
 * Returns slice header ptr on success
951
 *         NULL on failure
952
 */
953
633
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
954
633
    cram_block_slice_hdr *hdr;
955
633
    unsigned char *cp;
956
633
    unsigned char *cp_end;
957
633
    int i, err = 0;
958
959
633
    if (b->method != RAW) {
960
        /* Spec. says slice header should be RAW, but we can future-proof
961
           by trying to decode it if it isn't. */
962
6
        if (cram_uncompress_block(b) < 0)
963
3
            return NULL;
964
6
    }
965
630
    cp =  (unsigned char *)BLOCK_DATA(b);
966
630
    cp_end = cp + b->uncomp_size;
967
968
630
    if (b->content_type != MAPPED_SLICE &&
969
630
        b->content_type != UNMAPPED_SLICE)
970
0
        return NULL;
971
972
630
    if (!(hdr  = calloc(1, sizeof(*hdr))))
973
0
        return NULL;
974
975
630
    hdr->content_type = b->content_type;
976
977
630
    if (b->content_type == MAPPED_SLICE) {
978
603
        hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
979
603
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
980
0
            hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
981
0
            hdr->ref_seq_span  = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
982
603
        } else {
983
603
            hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
984
603
            hdr->ref_seq_span  = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
985
603
        }
986
603
        if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
987
45
            free(hdr);
988
45
            hts_log_error("Negative values not permitted for header "
989
45
                          "sequence start or span fields");
990
45
            return NULL;
991
45
        }
992
603
    }
993
585
    hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
994
585
    hdr->record_counter = 0;
995
585
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
996
0
        hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
997
585
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
998
0
        hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
999
0
    }
1000
585
    hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1001
585
    hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1002
585
    if (hdr->num_content_ids < 1 ||
1003
585
        hdr->num_content_ids >= 10000) {
1004
        // Slice must have at least one data block, and there is no need
1005
        // for more than 2 per possible aux-tag plus ancillary.
1006
15
        free(hdr);
1007
15
        return NULL;
1008
15
    }
1009
570
    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1010
570
    if (!hdr->block_content_ids) {
1011
0
        free(hdr);
1012
0
        return NULL;
1013
0
    }
1014
1015
9.50k
    for (i = 0; i < hdr->num_content_ids; i++)
1016
8.93k
        hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
1017
8.93k
                                                         (char *)cp_end,
1018
8.93k
                                                         &err);
1019
570
    if (err) {
1020
6
        free(hdr->block_content_ids);
1021
6
        free(hdr);
1022
6
        return NULL;
1023
6
    }
1024
1025
564
    if (b->content_type == MAPPED_SLICE)
1026
540
        hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
1027
1028
564
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
1029
0
        if (cp_end - cp < 16) {
1030
0
            free(hdr->block_content_ids);
1031
0
            free(hdr);
1032
0
            return NULL;
1033
0
        }
1034
0
        memcpy(hdr->md5, cp, 16);
1035
564
    } else {
1036
564
        memset(hdr->md5, 0, 16);
1037
564
    }
1038
1039
564
    if (!err)
1040
555
        return hdr;
1041
1042
9
    free(hdr->block_content_ids);
1043
9
    free(hdr);
1044
9
    return NULL;
1045
564
}
1046
1047
1048
#if 0
1049
/* Returns the number of bits set in val; it the highest bit used */
1050
static int nbits(int v) {
1051
    static const int MultiplyDeBruijnBitPosition[32] = {
1052
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1053
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1054
    };
1055
1056
    v |= v >> 1; // first up to set all bits 1 after the first 1 */
1057
    v |= v >> 2;
1058
    v |= v >> 4;
1059
    v |= v >> 8;
1060
    v |= v >> 16;
1061
1062
    // DeBruijn magic to find top bit
1063
    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1064
}
1065
#endif
1066
1067
#if 0
1068
static int sort_freqs(const void *vp1, const void *vp2) {
1069
    const int i1 = *(const int *)vp1;
1070
    const int i2 = *(const int *)vp2;
1071
    return i1-i2;
1072
}
1073
#endif
1074
1075
/* ----------------------------------------------------------------------
1076
 * Primary CRAM sequence decoder
1077
 */
1078
1079
0
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1080
0
    if (decode_md) {
1081
0
        BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1082
0
        BLOCK_APPEND_CHAR(s->aux_blk, c);
1083
0
        *md_dist = 0;
1084
0
    }
1085
0
    return 0;
1086
1087
0
 block_err:
1088
0
    return -1;
1089
0
}
1090
1091
/*
1092
 * Internal part of cram_decode_slice().
1093
 * Generates the sequence, quality and cigar components.
1094
 */
1095
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1096
                           cram_block *blk, cram_record *cr, sam_hdr_t *sh,
1097
                           int cf, char *seq, char *qual,
1098
0
                           int has_MD, int has_NM) {
1099
0
    int prev_pos = 0, f, r = 0, out_sz = 1;
1100
0
    int seq_pos = 1;
1101
0
    int cig_len = 0;
1102
0
    int64_t ref_pos = cr->apos;
1103
0
    int32_t fn, i32;
1104
0
    enum cigar_op cig_op = BAM_CMATCH;
1105
0
    uint32_t *cigar = s->cigar;
1106
0
    uint32_t ncigar = s->ncigar;
1107
0
    uint32_t cigar_alloc = s->cigar_alloc;
1108
0
    uint32_t nm = 0;
1109
0
    int32_t md_dist = 0;
1110
0
    int orig_aux = 0;
1111
    // CRAM <  4.0 decode_md is off/on
1112
    // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
1113
0
    int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
1114
0
        ? (s->decode_md > 0)
1115
0
        : (s->decode_md != 0);
1116
0
    int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
1117
0
    int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
1118
0
    uint32_t ds = s->data_series;
1119
0
    sam_hrecs_t *bfd = sh->hrecs;
1120
1121
0
    cram_codec **codecs = c->comp_hdr->codecs;
1122
1123
0
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1124
0
        memset(qual, 255, cr->len);
1125
0
    }
1126
1127
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1128
0
        decode_md = decode_nm = 0;
1129
1130
0
    if (decode_md) {
1131
0
        orig_aux = BLOCK_SIZE(s->aux_blk);
1132
0
        if (has_MD == 0)
1133
0
            BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1134
0
    }
1135
1136
0
    if (ds & CRAM_FN) {
1137
0
        if (!codecs[DS_FN]) return -1;
1138
0
        r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
1139
0
                                   blk, (char *)&fn, &out_sz);
1140
0
        if (r) return r;
1141
0
    } else {
1142
0
        fn = 0;
1143
0
    }
1144
1145
0
    ref_pos--; // count from 0
1146
0
    cr->cigar = ncigar;
1147
1148
0
    if (!(ds & (CRAM_FC | CRAM_FP)))
1149
0
        goto skip_cigar;
1150
1151
0
    if (fn) {
1152
0
        if ((ds & CRAM_FC) && !codecs[DS_FC])
1153
0
            return -1;
1154
0
        if ((ds & CRAM_FP) && !codecs[DS_FP])
1155
0
            return -1;
1156
0
    }
1157
1158
0
    for (f = 0; f < fn; f++) {
1159
0
        int32_t pos = 0;
1160
0
        char op;
1161
1162
0
        if (ncigar+2 >= cigar_alloc) {
1163
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1164
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1165
0
                return -1;
1166
0
            s->cigar = cigar;
1167
0
        }
1168
1169
0
        if (ds & CRAM_FC) {
1170
0
            r |= codecs[DS_FC]->decode(s,
1171
0
                                       codecs[DS_FC],
1172
0
                                       blk,
1173
0
                                       &op,  &out_sz);
1174
0
            if (r) return r;
1175
0
        }
1176
1177
0
        if (!(ds & CRAM_FP))
1178
0
            continue;
1179
1180
0
        r |= codecs[DS_FP]->decode(s,
1181
0
                                   codecs[DS_FP],
1182
0
                                   blk,
1183
0
                                   (char *)&pos, &out_sz);
1184
0
        if (r) return r;
1185
0
        pos += prev_pos;
1186
1187
0
        if (pos <= 0) {
1188
0
            hts_log_error("Feature position %d before start of read", pos);
1189
0
            return -1;
1190
0
        }
1191
1192
0
        if (pos > seq_pos) {
1193
0
            if (pos > cr->len+1)
1194
0
                return -1;
1195
1196
0
            if (s->ref && cr->ref_id >= 0) {
1197
0
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1198
0
                    static int whinged = 0;
1199
0
                    int rlen;
1200
0
                    if (!whinged)
1201
0
                        hts_log_warning("Ref pos outside of ref sequence boundary");
1202
0
                    whinged = 1;
1203
0
                    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1204
                    // May miss MD/NM cases where both seq/ref are N, but this is a
1205
                    // malformed cram file anyway.
1206
0
                    if (rlen > 0) {
1207
0
                        if (ref_pos + rlen > s->ref_end)
1208
0
                            goto beyond_slice;
1209
1210
0
                        memcpy(&seq[seq_pos-1],
1211
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1212
0
                        if ((pos - seq_pos) - rlen > 0)
1213
0
                            memset(&seq[seq_pos-1+rlen], 'N',
1214
0
                                   (pos - seq_pos) - rlen);
1215
0
                    } else {
1216
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1217
0
                    }
1218
0
                    if (md_dist >= 0)
1219
0
                        md_dist += pos - seq_pos;
1220
0
                } else {
1221
                    // 'N' in both ref and seq is also mismatch for NM/MD
1222
0
                    if (ref_pos + pos-seq_pos > s->ref_end)
1223
0
                        goto beyond_slice;
1224
1225
0
                    const char *refp = s->ref + ref_pos - s->ref_start + 1;
1226
0
                    const int frag_len = pos - seq_pos;
1227
0
                    int do_cpy = 1;
1228
0
                    if (decode_md || decode_nm) {
1229
0
                        char *N = memchr(refp, 'N', frag_len);
1230
0
                        if (N) {
1231
0
                            int i;
1232
0
                            for (i = 0; i < frag_len; i++) {
1233
0
                                char base = refp[i];
1234
0
                                if (base == 'N') {
1235
0
                                    if (add_md_char(s, decode_md,
1236
0
                                                    'N', &md_dist) < 0)
1237
0
                                        return -1;
1238
0
                                    nm++;
1239
0
                                } else {
1240
0
                                    md_dist++;
1241
0
                                }
1242
0
                                seq[seq_pos-1+i] = base;
1243
0
                            }
1244
0
                            do_cpy = 0;
1245
0
                        } else {
1246
0
                            md_dist += frag_len;
1247
0
                        }
1248
0
                    }
1249
0
                    if (do_cpy)
1250
0
                        memcpy(&seq[seq_pos-1], refp, frag_len);
1251
0
                }
1252
0
            }
1253
#ifdef USE_X
1254
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
1255
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1256
                cig_len = 0;
1257
            }
1258
            cig_op = BAM_CBASE_MATCH;
1259
#else
1260
0
            if (cig_len && cig_op != BAM_CMATCH) {
1261
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1262
0
                cig_len = 0;
1263
0
            }
1264
0
            cig_op = BAM_CMATCH;
1265
0
#endif
1266
0
            cig_len += pos - seq_pos;
1267
0
            ref_pos += pos - seq_pos;
1268
0
            seq_pos = pos;
1269
0
        }
1270
1271
0
        prev_pos = pos;
1272
1273
0
        if (!(ds & CRAM_FC))
1274
0
            goto skip_cigar;
1275
1276
0
        switch(op) {
1277
0
        case 'S': { // soft clip: IN
1278
0
            int32_t out_sz2 = 1;
1279
0
            int have_sc = 0;
1280
1281
0
            if (cig_len) {
1282
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1283
0
                cig_len = 0;
1284
0
            }
1285
0
            switch (CRAM_MAJOR_VERS(fd->version)) {
1286
0
            case 1:
1287
0
                if (ds & CRAM_IN) {
1288
0
                    r |= codecs[DS_IN]
1289
0
                        ? codecs[DS_IN]->decode(s, codecs[DS_IN],
1290
0
                                                blk,
1291
0
                                                cr->len ? &seq[pos-1] : NULL,
1292
0
                                                &out_sz2)
1293
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1294
0
                    have_sc = 1;
1295
0
                }
1296
0
                break;
1297
0
            case 2:
1298
0
            default:
1299
0
                if (ds & CRAM_SC) {
1300
0
                    r |= codecs[DS_SC]
1301
0
                        ? codecs[DS_SC]->decode(s, codecs[DS_SC],
1302
0
                                                blk,
1303
0
                                                cr->len ? &seq[pos-1] : NULL,
1304
0
                                                &out_sz2)
1305
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1306
0
                    have_sc = 1;
1307
0
                }
1308
0
                break;
1309
1310
                //default:
1311
                //    r |= codecs[DS_BB]
1312
                //        ? codecs[DS_BB]->decode(s, codecs[DS_BB],
1313
                //                                blk, &seq[pos-1], &out_sz2)
1314
                //        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1315
0
            }
1316
0
            if (have_sc) {
1317
0
                if (r) return r;
1318
0
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1319
0
                cig_op = BAM_CSOFT_CLIP;
1320
0
                seq_pos += out_sz2;
1321
0
            }
1322
0
            break;
1323
0
        }
1324
1325
0
        case 'X': { // Substitution; BS
1326
0
            unsigned char base;
1327
#ifdef USE_X
1328
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1329
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1330
                cig_len = 0;
1331
            }
1332
            if (ds & CRAM_BS) {
1333
                if (!codecs[DS_BS]) return -1;
1334
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1335
                                           (char *)&base, &out_sz);
1336
                if (pos-1 < cr->len)
1337
                    seq[pos-1] = 'N'; // FIXME look up BS=base value
1338
            }
1339
            cig_op = BAM_CBASE_MISMATCH;
1340
#else
1341
0
            int ref_base;
1342
0
            if (cig_len && cig_op != BAM_CMATCH) {
1343
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1344
0
                cig_len = 0;
1345
0
            }
1346
0
            if (ds & CRAM_BS) {
1347
0
                if (!codecs[DS_BS]) return -1;
1348
0
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1349
0
                                           (char *)&base, &out_sz);
1350
0
                if (r) return -1;
1351
0
                if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1352
0
                    if (pos-1 < cr->len)
1353
0
                        seq[pos-1] = c->comp_hdr->
1354
0
                            substitution_matrix[fd->L1['N']][base];
1355
0
                    if (decode_md || decode_nm) {
1356
0
                        if (md_dist >= 0 && decode_md)
1357
0
                            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1358
0
                        md_dist = -1;
1359
0
                        nm--;
1360
0
                    }
1361
0
                } else {
1362
0
                    unsigned char ref_call = ref_pos < s->ref_end
1363
0
                        ? (uc)s->ref[ref_pos - s->ref_start +1]
1364
0
                        : 'N';
1365
0
                    ref_base = fd->L1[ref_call];
1366
0
                    if (pos-1 < cr->len)
1367
0
                        seq[pos-1] = c->comp_hdr->
1368
0
                            substitution_matrix[ref_base][base];
1369
0
                    if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
1370
0
                        return -1;
1371
0
                }
1372
0
            }
1373
0
            cig_op = BAM_CMATCH;
1374
0
#endif
1375
0
            nm++;
1376
0
            cig_len++;
1377
0
            seq_pos++;
1378
0
            ref_pos++;
1379
0
            break;
1380
0
        }
1381
1382
0
        case 'D': { // Deletion; DL
1383
0
            if (cig_len && cig_op != BAM_CDEL) {
1384
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1385
0
                cig_len = 0;
1386
0
            }
1387
0
            if (ds & CRAM_DL) {
1388
0
                if (!codecs[DS_DL]) return -1;
1389
0
                r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
1390
0
                                           (char *)&i32, &out_sz);
1391
0
                if (r) return r;
1392
0
                if (decode_md || decode_nm) {
1393
0
                    if (ref_pos + i32 > s->ref_end)
1394
0
                        goto beyond_slice;
1395
0
                    if (md_dist >= 0 && decode_md)
1396
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1397
0
                    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1398
0
                        if (decode_md) {
1399
0
                            BLOCK_APPEND_CHAR(s->aux_blk, '^');
1400
0
                            BLOCK_APPEND(s->aux_blk,
1401
0
                                         &s->ref[ref_pos - s->ref_start +1],
1402
0
                                         i32);
1403
0
                            md_dist = 0;
1404
0
                        }
1405
0
                        nm += i32;
1406
0
                    } else {
1407
0
                        uint32_t dlen;
1408
0
                        if (bfd->ref[cr->ref_id].len >= ref_pos) {
1409
0
                            if (decode_md) {
1410
0
                                BLOCK_APPEND_CHAR(s->aux_blk, '^');
1411
0
                                BLOCK_APPEND(s->aux_blk,
1412
0
                                             &s->ref[ref_pos - s->ref_start+1],
1413
0
                                             bfd->ref[cr->ref_id].len-ref_pos);
1414
0
                                BLOCK_APPEND_UINT(s->aux_blk, 0);
1415
0
                            }
1416
0
                            dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1417
0
                            nm += i32 - dlen;
1418
0
                        } else {
1419
0
                            dlen = i32;
1420
0
                        }
1421
1422
0
                        md_dist = -1;
1423
0
                    }
1424
0
                }
1425
0
                cig_op = BAM_CDEL;
1426
0
                cig_len += i32;
1427
0
                ref_pos += i32;
1428
                //printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1429
0
            }
1430
0
            break;
1431
0
        }
1432
1433
0
        case 'I': { // Insertion (several bases); IN
1434
0
            int32_t out_sz2 = 1;
1435
1436
0
            if (cig_len && cig_op != BAM_CINS) {
1437
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1438
0
                cig_len = 0;
1439
0
            }
1440
1441
0
            if (ds & CRAM_IN) {
1442
0
                if (!codecs[DS_IN]) return -1;
1443
0
                r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
1444
0
                                           cr->len ? &seq[pos-1] : NULL,
1445
0
                                           &out_sz2);
1446
0
                if (r) return r;
1447
0
                cig_op = BAM_CINS;
1448
0
                cig_len += out_sz2;
1449
0
                seq_pos += out_sz2;
1450
0
                nm      += out_sz2;
1451
                //printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1452
0
            }
1453
0
            break;
1454
0
        }
1455
1456
0
        case 'i': { // Insertion (single base); BA
1457
0
            if (cig_len && cig_op != BAM_CINS) {
1458
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1459
0
                cig_len = 0;
1460
0
            }
1461
0
            if (ds & CRAM_BA) {
1462
0
                if (!codecs[DS_BA]) return -1;
1463
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1464
0
                                           cr->len ? &seq[pos-1] : NULL,
1465
0
                                           &out_sz);
1466
0
                if (r) return r;
1467
0
            }
1468
0
            cig_op = BAM_CINS;
1469
0
            cig_len++;
1470
0
            seq_pos++;
1471
0
            nm++;
1472
0
            break;
1473
0
        }
1474
1475
0
        case 'b': { // Several bases
1476
0
            int32_t len = 1;
1477
1478
0
            if (cig_len && cig_op != BAM_CMATCH) {
1479
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1480
0
                cig_len = 0;
1481
0
            }
1482
1483
0
            if (ds & CRAM_BB) {
1484
0
                if (!codecs[DS_BB]) return -1;
1485
0
                r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
1486
0
                                           cr->len ? &seq[pos-1] : NULL,
1487
0
                                           &len);
1488
0
                if (r) return r;
1489
1490
0
                if (decode_md || decode_nm) {
1491
0
                    int x;
1492
0
                    if (md_dist >= 0 && decode_md)
1493
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1494
1495
0
                    for (x = 0; x < len; x++) {
1496
0
                        if (x && decode_md)
1497
0
                            BLOCK_APPEND_UINT(s->aux_blk, 0);
1498
0
                        if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1499
0
                            md_dist = -1;
1500
0
                            break;
1501
0
                        } else {
1502
0
                            if (decode_md) {
1503
0
                                if (ref_pos + x > s->ref_end)
1504
0
                                    goto beyond_slice;
1505
0
                                char r = s->ref[ref_pos+x-s->ref_start +1];
1506
0
                                BLOCK_APPEND_CHAR(s->aux_blk, r);
1507
0
                            }
1508
0
                        }
1509
0
                    }
1510
1511
0
                    nm += x;
1512
0
                    md_dist = 0;
1513
0
                }
1514
0
            }
1515
1516
0
            cig_op = BAM_CMATCH;
1517
1518
0
            cig_len+=len;
1519
0
            seq_pos+=len;
1520
0
            ref_pos+=len;
1521
            //prev_pos+=len;
1522
0
            break;
1523
0
        }
1524
1525
0
        case 'q': { // Several quality values
1526
0
            int32_t len = 1;
1527
1528
0
            if (cig_len && cig_op != BAM_CMATCH) {
1529
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1530
0
                cig_len = 0;
1531
0
            }
1532
1533
0
            if (ds & CRAM_QQ) {
1534
0
                if (!codecs[DS_QQ]) return -1;
1535
0
                if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1536
0
                    && (unsigned char)*qual == 255)
1537
0
                    memset(qual, 30, cr->len); // ?
1538
0
                r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
1539
0
                                           (char *)&qual[pos-1], &len);
1540
0
                if (r) return r;
1541
0
            }
1542
1543
0
            cig_op = BAM_CMATCH;
1544
1545
            //prev_pos+=len;
1546
0
            break;
1547
0
        }
1548
1549
0
        case 'B': { // Read base; BA, QS
1550
#ifdef USE_X
1551
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1552
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1553
                cig_len = 0;
1554
            }
1555
#else
1556
0
            if (cig_len && cig_op != BAM_CMATCH) {
1557
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1558
0
                cig_len = 0;
1559
0
            }
1560
0
#endif
1561
0
            if (ds & CRAM_BA) {
1562
0
                if (!codecs[DS_BA]) return -1;
1563
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1564
0
                                           cr->len ? &seq[pos-1] : NULL,
1565
0
                                           &out_sz);
1566
1567
0
                if (decode_md || decode_nm) {
1568
0
                    if (md_dist >= 0 && decode_md)
1569
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1570
0
                    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1571
0
                        md_dist = -1;
1572
0
                    } else {
1573
0
                        if (decode_md) {
1574
0
                            if (ref_pos > s->ref_end)
1575
0
                                goto beyond_slice;
1576
0
                            BLOCK_APPEND_CHAR(s->aux_blk,
1577
0
                                              s->ref[ref_pos-s->ref_start +1]);
1578
0
                        }
1579
0
                        nm++;
1580
0
                        md_dist = 0;
1581
0
                    }
1582
0
                }
1583
0
            }
1584
0
            if (ds & CRAM_QS) {
1585
0
                if (!codecs[DS_QS]) return -1;
1586
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1587
0
                    && (unsigned char)*qual == 255)
1588
0
                    memset(qual, 30, cr->len); // ASCII ?.  Same as htsjdk
1589
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1590
0
                                           (char *)&qual[pos-1], &out_sz);
1591
0
            }
1592
#ifdef USE_X
1593
            cig_op = BAM_CBASE_MISMATCH;
1594
#else
1595
0
            cig_op = BAM_CMATCH;
1596
0
#endif
1597
0
            cig_len++;
1598
0
            seq_pos++;
1599
0
            ref_pos++;
1600
            //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1601
0
            break;
1602
0
        }
1603
1604
0
        case 'Q': { // Quality score; QS
1605
0
            if (ds & CRAM_QS) {
1606
0
                if (!codecs[DS_QS]) return -1;
1607
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
1608
0
                    (unsigned char)*qual == 255)
1609
0
                    memset(qual, 30, cr->len); // ?
1610
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1611
0
                                           (char *)&qual[pos-1], &out_sz);
1612
                //printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1613
0
            }
1614
0
            break;
1615
0
        }
1616
1617
0
        case 'H': { // hard clip; HC
1618
0
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
1619
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1620
0
                cig_len = 0;
1621
0
            }
1622
0
            if (ds & CRAM_HC) {
1623
0
                if (!codecs[DS_HC]) return -1;
1624
0
                r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
1625
0
                                           (char *)&i32, &out_sz);
1626
0
                if (r) return r;
1627
0
                cig_op = BAM_CHARD_CLIP;
1628
0
                cig_len += i32;
1629
0
            }
1630
0
            break;
1631
0
        }
1632
1633
0
        case 'P': { // padding; PD
1634
0
            if (cig_len && cig_op != BAM_CPAD) {
1635
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1636
0
                cig_len = 0;
1637
0
            }
1638
0
            if (ds & CRAM_PD) {
1639
0
                if (!codecs[DS_PD]) return -1;
1640
0
                r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
1641
0
                                           (char *)&i32, &out_sz);
1642
0
                if (r) return r;
1643
0
                cig_op = BAM_CPAD;
1644
0
                cig_len += i32;
1645
0
            }
1646
0
            break;
1647
0
        }
1648
1649
0
        case 'N': { // Ref skip; RS
1650
0
            if (cig_len && cig_op != BAM_CREF_SKIP) {
1651
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1652
0
                cig_len = 0;
1653
0
            }
1654
0
            if (ds & CRAM_RS) {
1655
0
                if (!codecs[DS_RS]) return -1;
1656
0
                r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
1657
0
                                           (char *)&i32, &out_sz);
1658
0
                if (r) return r;
1659
0
                cig_op = BAM_CREF_SKIP;
1660
0
                cig_len += i32;
1661
0
                ref_pos += i32;
1662
0
            }
1663
0
            break;
1664
0
        }
1665
1666
0
        default:
1667
0
            hts_log_error("Unknown feature code '%c'", op);
1668
0
            return -1;
1669
0
        }
1670
0
    }
1671
1672
0
    if (!(ds & CRAM_FC))
1673
0
        goto skip_cigar;
1674
1675
    /* An implicit match op for any unaccounted for bases */
1676
0
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1677
0
        if (s->ref && cr->ref_id >= 0) {
1678
0
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1679
0
                static int whinged = 0;
1680
0
                int rlen;
1681
0
                if (!whinged)
1682
0
                    hts_log_warning("Ref pos outside of ref sequence boundary");
1683
0
                whinged = 1;
1684
0
                rlen = bfd->ref[cr->ref_id].len - ref_pos;
1685
                // May miss MD/NM cases where both seq/ref are N, but this is a
1686
                // malformed cram file anyway.
1687
0
                if (rlen > 0) {
1688
0
                    if (seq_pos-1 + rlen < cr->len)
1689
0
                        memcpy(&seq[seq_pos-1],
1690
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1691
0
                    if ((cr->len - seq_pos + 1) - rlen > 0)
1692
0
                        memset(&seq[seq_pos-1+rlen], 'N',
1693
0
                               (cr->len - seq_pos + 1) - rlen);
1694
0
                } else {
1695
0
                    if (cr->len - seq_pos + 1 > 0)
1696
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1697
0
                }
1698
0
                if (md_dist >= 0)
1699
0
                    md_dist += cr->len - seq_pos + 1;
1700
0
            } else {
1701
0
                if (cr->len - seq_pos + 1 > 0) {
1702
0
                    if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
1703
0
                        goto beyond_slice;
1704
0
                    int remainder = cr->len - (seq_pos-1);
1705
0
                    int j = ref_pos - s->ref_start + 1;
1706
0
                    if (decode_md || decode_nm) {
1707
0
                        int i;
1708
0
                        char *N = memchr(&s->ref[j], 'N', remainder);
1709
0
                        if (!N) {
1710
                            // short cut the common case
1711
0
                            md_dist += cr->len - (seq_pos-1);
1712
0
                        } else {
1713
0
                            char *refp = &s->ref[j-(seq_pos-1)];
1714
0
                            md_dist += N-&s->ref[j];
1715
0
                            int i_start = seq_pos-1 + (N - &s->ref[j]);
1716
0
                            for (i = i_start; i < cr->len; i++) {
1717
0
                                char base = refp[i];
1718
0
                                if (base == 'N') {
1719
0
                                    if (add_md_char(s, decode_md, 'N',
1720
0
                                                    &md_dist) < 0)
1721
0
                                        return -1;
1722
0
                                    nm++;
1723
0
                                } else {
1724
0
                                    md_dist++;
1725
0
                                }
1726
0
                            }
1727
0
                        }
1728
0
                    }
1729
0
                    memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
1730
0
                }
1731
0
                ref_pos += cr->len - seq_pos + 1;
1732
0
            }
1733
0
        } else if (cr->ref_id >= 0) {
1734
            // So alignment end can be computed even when not decoding sequence
1735
0
            ref_pos += cr->len - seq_pos + 1;
1736
0
        }
1737
1738
0
        if (ncigar+1 >= cigar_alloc) {
1739
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1740
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1741
0
                return -1;
1742
0
            s->cigar = cigar;
1743
0
        }
1744
#ifdef USE_X
1745
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
1746
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1747
            cig_len = 0;
1748
        }
1749
        cig_op = BAM_CBASE_MATCH;
1750
#else
1751
0
        if (cig_len && cig_op != BAM_CMATCH) {
1752
0
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1753
0
            cig_len = 0;
1754
0
        }
1755
0
        cig_op = BAM_CMATCH;
1756
0
#endif
1757
0
        cig_len += cr->len - seq_pos+1;
1758
0
    }
1759
1760
0
 skip_cigar:
1761
1762
0
    if ((ds & CRAM_FN) && decode_md) {
1763
0
        if (md_dist >= 0)
1764
0
            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1765
0
    }
1766
1767
0
    if (cig_len) {
1768
0
        if (ncigar >= cigar_alloc) {
1769
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1770
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1771
0
                return -1;
1772
0
            s->cigar = cigar;
1773
0
        }
1774
1775
0
        cigar[ncigar++] = (cig_len<<4) + cig_op;
1776
0
    }
1777
1778
0
    cr->ncigar = ncigar - cr->cigar;
1779
0
    cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
1780
1781
    //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1782
1783
0
    if (ds & CRAM_MQ) {
1784
0
        if (!codecs[DS_MQ]) return -1;
1785
0
        r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
1786
0
                                   (char *)&cr->mqual, &out_sz);
1787
0
    } else {
1788
0
        cr->mqual = 40;
1789
0
    }
1790
1791
0
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1792
0
        int32_t out_sz2 = cr->len;
1793
1794
0
        if (!codecs[DS_QS]) return -1;
1795
0
        r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1796
0
                                   qual, &out_sz2);
1797
0
    }
1798
1799
0
    s->cigar = cigar;
1800
0
    s->cigar_alloc = cigar_alloc;
1801
0
    s->ncigar = ncigar;
1802
1803
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1804
0
        cr->len = 0;
1805
1806
0
    if (decode_md) {
1807
0
        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1808
0
        size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
1809
0
        if (has_MD < 0) {
1810
            // has_MD < 0; already have MDZ allocated in aux at -has_MD,
1811
            // but wrote MD to end of aux (at orig_aux).
1812
            // We need some memmoves to shuffle it around.
1813
0
            char tmp_MD_[1024], *tmp_MD = tmp_MD_;
1814
0
            unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
1815
0
            if (sz > 1024) {
1816
0
                tmp_MD = malloc(sz);
1817
0
                if (!tmp_MD)
1818
0
                    return -1;
1819
0
            }
1820
0
            memcpy(tmp_MD, orig_aux_p, sz);
1821
0
            memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
1822
0
                    &BLOCK_DATA(s->aux_blk)[-has_MD],
1823
0
                    orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
1824
0
            memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
1825
0
            if (tmp_MD != tmp_MD_)
1826
0
                free(tmp_MD);
1827
1828
0
            if (-has_NM > -has_MD)
1829
                // we inserted before NM, so move it up a bit
1830
0
                has_NM -= sz;
1831
0
        }
1832
        // else has_MD == 0 and we've already appended MD to the end.
1833
1834
0
        cr->aux_size += sz;
1835
0
    }
1836
1837
0
    if (decode_nm) {
1838
0
        if (has_NM == 0) {
1839
0
            char buf[7];
1840
0
            size_t buf_size;
1841
0
            buf[0] = 'N'; buf[1] = 'M';
1842
0
            if (nm <= UINT8_MAX) {
1843
0
                buf_size = 4;
1844
0
                buf[2] = 'C';
1845
0
                buf[3] = (nm>> 0) & 0xff;
1846
0
            } else if (nm <= UINT16_MAX) {
1847
0
                buf_size = 5;
1848
0
                buf[2] = 'S';
1849
0
                buf[3] = (nm>> 0) & 0xff;
1850
0
                buf[4] = (nm>> 8) & 0xff;
1851
0
            } else {
1852
0
                buf_size = 7;
1853
0
                buf[2] = 'I';
1854
0
                buf[3] = (nm>> 0) & 0xff;
1855
0
                buf[4] = (nm>> 8) & 0xff;
1856
0
                buf[5] = (nm>>16) & 0xff;
1857
0
                buf[6] = (nm>>24) & 0xff;
1858
0
            }
1859
0
            BLOCK_APPEND(s->aux_blk, buf, buf_size);
1860
0
            cr->aux_size += buf_size;
1861
0
        } else {
1862
            // Preallocated space for NM at -has_NM into aux block
1863
0
            unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
1864
0
            buf[0] = (nm>> 0) & 0xff;
1865
0
            buf[1] = (nm>> 8) & 0xff;
1866
0
            buf[2] = (nm>>16) & 0xff;
1867
0
            buf[3] = (nm>>24) & 0xff;
1868
0
        }
1869
0
    }
1870
1871
0
    return r;
1872
1873
0
 beyond_slice:
1874
    // Cramtools can create CRAMs that have sequence features outside the
1875
    // stated range of the container & slice reference extents (start + span).
1876
    // We have to check for these in many places, but for brevity have the
1877
    // error reporting in only one.
1878
0
    hts_log_error("CRAM CIGAR extends beyond slice reference extents");
1879
0
    return -1;
1880
1881
0
 block_err:
1882
0
    return -1;
1883
0
}
1884
1885
/*
1886
 * Quick and simple hash lookup for cram_map arrays
1887
 */
1888
0
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1889
0
    cram_map *m;
1890
1891
0
    m = map[CRAM_MAP(key[0],key[1])];
1892
0
    while (m && m->key != id)
1893
0
        m= m->next;
1894
1895
0
    return m;
1896
0
}
1897
1898
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1899
1900
1901
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1902
0
                               cram_block *blk, cram_record *cr) {
1903
0
    int i, r = 0, out_sz = 1;
1904
0
    unsigned char ntags;
1905
1906
0
    if (!c->comp_hdr->codecs[DS_TC]) return -1;
1907
0
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1908
0
                                            (char *)&ntags, &out_sz);
1909
0
    cr->ntags = ntags;
1910
1911
    //printf("TC=%d\n", cr->ntags);
1912
0
    cr->aux_size = 0;
1913
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1914
1915
0
    for (i = 0; i < cr->ntags; i++) {
1916
0
        int32_t id, out_sz = 1;
1917
0
        unsigned char tag_data[3];
1918
0
        cram_map *m;
1919
1920
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1921
0
        if (!c->comp_hdr->codecs[DS_TN]) return -1;
1922
0
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1923
0
                                                blk, (char *)&id, &out_sz);
1924
0
        if (out_sz == 3) {
1925
            // Tag name stored as 3 chars instead of an int?
1926
0
            memcpy(tag_data, &id, 3);
1927
0
        } else {
1928
0
            tag_data[0] = (id>>16) & 0xff;
1929
0
            tag_data[1] = (id>>8)  & 0xff;
1930
0
            tag_data[2] = id       & 0xff;
1931
0
        }
1932
1933
0
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1934
0
        if (!m)
1935
0
            return -1;
1936
0
        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1937
1938
0
        if (!m->codec) return -1;
1939
0
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1940
1941
0
        cr->aux_size += out_sz + 3;
1942
0
    }
1943
1944
0
    return r;
1945
1946
0
 block_err:
1947
0
    return -1;
1948
0
}
1949
1950
// has_MD and has_NM are filled out with 0 for none present,
1951
// 1 for present and verbatim, and -pos for present as placeholder
1952
// (MD*, NM*) to be generated and filled out at offset +pos.
1953
static int cram_decode_aux(cram_fd *fd,
1954
                           cram_container *c, cram_slice *s,
1955
                           cram_block *blk, cram_record *cr,
1956
0
                           int *has_MD, int *has_NM) {
1957
0
    int i, r = 0, out_sz = 1;
1958
0
    int32_t TL = 0;
1959
0
    unsigned char *TN;
1960
0
    uint32_t ds = s->data_series;
1961
1962
0
    if (!(ds & (CRAM_TL|CRAM_aux))) {
1963
0
        cr->aux = 0;
1964
0
        cr->aux_size = 0;
1965
0
        return 0;
1966
0
    }
1967
1968
0
    if (!c->comp_hdr->codecs[DS_TL]) return -1;
1969
0
    r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1970
0
                                            (char *)&TL, &out_sz);
1971
0
    if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1972
0
        return -1;
1973
1974
0
    TN = c->comp_hdr->TL[TL];
1975
0
    cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1976
1977
    //printf("TC=%d\n", cr->ntags);
1978
0
    cr->aux_size = 0;
1979
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1980
1981
0
    if (!(ds & CRAM_aux))
1982
0
        return 0;
1983
1984
0
    for (i = 0; i < cr->ntags; i++) {
1985
0
        int32_t id, out_sz = 1;
1986
0
        unsigned char tag_data[7];
1987
0
        cram_map *m;
1988
1989
0
        if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1990
0
            *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
1991
0
        if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1992
0
            *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
1993
1994
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1995
0
        tag_data[0] = TN[0];
1996
0
        tag_data[1] = TN[1];
1997
0
        tag_data[2] = TN[2];
1998
0
        id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
1999
2000
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
2001
            // Place holder, fill out contents later.
2002
0
            int tag_data_size;
2003
0
            if (TN[0] == 'N' && TN[1] == 'M') {
2004
                // Use a fixed size, so we can allocate room for it now.
2005
0
                memcpy(&tag_data[2], "I\0\0\0\0", 5);
2006
0
                tag_data_size = 7;
2007
0
            } else if (TN[0] == 'R' && TN[1] == 'G') {
2008
                // RG is variable size, but known already.  Insert now
2009
0
                TN += 3;
2010
                // Equiv to fd->header->hrecs->rg[cr->rg], but this is the
2011
                // new header API equivalent.
2012
0
                const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
2013
0
                if (!rg)
2014
0
                    continue;
2015
2016
0
                size_t rg_len = strlen(rg);
2017
0
                tag_data[2] = 'Z';
2018
0
                BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2019
0
                BLOCK_APPEND(s->aux_blk, rg, rg_len);
2020
0
                BLOCK_APPEND_CHAR(s->aux_blk, '\0');
2021
0
                cr->aux_size += 3 + rg_len + 1;
2022
0
                cr->rg = -1; // prevents auto-add later
2023
0
                continue;
2024
0
            } else {
2025
                // Unknown size.  We'll insert MD into stream later.
2026
0
                tag_data[2] = 'Z';
2027
0
                tag_data_size = 3;
2028
0
            }
2029
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
2030
0
            cr->aux_size += tag_data_size;
2031
0
            TN += 3;
2032
0
        } else {
2033
0
            TN += 3;
2034
0
            m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2035
0
            if (!m)
2036
0
                return -1;
2037
2038
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2039
2040
0
            if (!m->codec) return -1;
2041
0
            r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2042
0
            if (r) break;
2043
0
            cr->aux_size += out_sz + 3;
2044
2045
            // cF CRAM flags.
2046
0
            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
2047
                // Remove cF tag
2048
0
                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
2049
0
                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
2050
0
                cr->aux_size -= out_sz+3;
2051
2052
                // bit 1 => don't auto-decode MD.
2053
                // Pretend MD is present verbatim, so we don't auto-generate
2054
0
                if ((cF & 1) && has_MD && *has_MD == 0)
2055
0
                    *has_MD = 1;
2056
2057
                // bit 1 => don't auto-decode NM
2058
0
                if ((cF & 2) && has_NM && *has_NM == 0)
2059
0
                    *has_NM = 1;
2060
0
            }
2061
0
        }
2062
2063
        // We could go to 2^32 fine, but we shouldn't be hitting this anyway,
2064
        // and it's protecting against memory hogs too.
2065
0
        if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
2066
0
            hts_log_error("CRAM->BAM aux block size overflow");
2067
0
            goto block_err;
2068
0
        }
2069
0
    }
2070
2071
0
    return r;
2072
2073
0
 block_err:
2074
0
    return -1;
2075
0
}
2076
2077
/* Resolve mate pair cross-references between recs within this slice */
2078
408
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2079
408
    int rec;
2080
2081
408
    if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2082
0
        for (rec = 0; rec < s->hdr->num_records; rec++) {
2083
0
            cram_record *cr = &s->crecs[rec];
2084
2085
0
            cr->tlen = 0;
2086
0
            cr->mate_pos = 0;
2087
0
            cr->mate_ref_id = -1;
2088
0
        }
2089
2090
0
        return 0;
2091
0
    }
2092
2093
408
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2094
0
        cram_record *cr = &s->crecs[rec];
2095
2096
0
        if (cr->mate_line >= 0) {
2097
0
            if (cr->mate_line < s->hdr->num_records) {
2098
                /*
2099
                 * On the first read, loop through computing lengths.
2100
                 * It's not perfect as we have one slice per reference so we
2101
                 * cannot detect when TLEN should be zero due to seqs that
2102
                 * map to multiple references.
2103
                 *
2104
                 * We also cannot set tlen correct when it spans a slice for
2105
                 * other reasons. This may make tlen too small. Should we
2106
                 * fix this by forcing TLEN to be stored verbatim in such cases?
2107
                 *
2108
                 * Or do we just admit defeat and output 0 for tlen? It's the
2109
                 * safe option...
2110
                 */
2111
0
                if (cr->tlen == INT64_MIN) {
2112
0
                    int id1 = rec, id2 = rec;
2113
0
                    int64_t aleft = cr->apos, aright = cr->aend;
2114
0
                    int64_t tlen;
2115
0
                    int ref = cr->ref_id;
2116
2117
                    // number of segments starting at the same point.
2118
0
                    int left_cnt = 0;
2119
2120
0
                    do {
2121
0
                        if (aleft > s->crecs[id2].apos)
2122
0
                            aleft = s->crecs[id2].apos, left_cnt = 1;
2123
0
                        else if (aleft == s->crecs[id2].apos)
2124
0
                            left_cnt++;
2125
0
                        if (aright < s->crecs[id2].aend)
2126
0
                            aright = s->crecs[id2].aend;
2127
0
                        if (s->crecs[id2].mate_line == -1) {
2128
0
                            s->crecs[id2].mate_line = rec;
2129
0
                            break;
2130
0
                        }
2131
0
                        if (s->crecs[id2].mate_line <= id2 ||
2132
0
                            s->crecs[id2].mate_line >= s->hdr->num_records)
2133
0
                            return -1;
2134
0
                        id2 = s->crecs[id2].mate_line;
2135
2136
0
                        if (s->crecs[id2].ref_id != ref)
2137
0
                            ref = -1;
2138
0
                    } while (id2 != id1);
2139
2140
0
                    if (ref != -1) {
2141
0
                        tlen = aright - aleft + 1;
2142
0
                        id1 = id2 = rec;
2143
2144
                        /*
2145
                         * When we have two seqs with identical start and
2146
                         * end coordinates, set +/- tlen based on 1st/last
2147
                         * bit flags instead, as a tie breaker.
2148
                         */
2149
0
                        if (s->crecs[id2].apos == aleft) {
2150
0
                            if (left_cnt == 1 ||
2151
0
                                (s->crecs[id2].flags & BAM_FREAD1))
2152
0
                                s->crecs[id2].tlen = tlen;
2153
0
                            else
2154
0
                                s->crecs[id2].tlen = -tlen;
2155
0
                        } else {
2156
0
                            s->crecs[id2].tlen = -tlen;
2157
0
                        }
2158
2159
0
                        id2 = s->crecs[id2].mate_line;
2160
0
                        while (id2 != id1) {
2161
0
                            if (s->crecs[id2].apos == aleft) {
2162
0
                                if (left_cnt == 1 ||
2163
0
                                    (s->crecs[id2].flags & BAM_FREAD1))
2164
0
                                    s->crecs[id2].tlen = tlen;
2165
0
                                else
2166
0
                                    s->crecs[id2].tlen = -tlen;
2167
0
                            } else {
2168
0
                                s->crecs[id2].tlen = -tlen;
2169
0
                            }
2170
0
                            id2 = s->crecs[id2].mate_line;
2171
0
                        }
2172
0
                    } else {
2173
0
                        id1 = id2 = rec;
2174
2175
0
                        s->crecs[id2].tlen = 0;
2176
0
                        id2 = s->crecs[id2].mate_line;
2177
0
                        while (id2 != id1) {
2178
0
                            s->crecs[id2].tlen = 0;
2179
0
                            id2 = s->crecs[id2].mate_line;
2180
0
                        }
2181
0
                    }
2182
0
                }
2183
2184
0
                cr->mate_pos = s->crecs[cr->mate_line].apos;
2185
0
                cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2186
2187
                // paired
2188
0
                cr->flags |= BAM_FPAIRED;
2189
2190
                // set mate unmapped if needed
2191
0
                if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2192
0
                    cr->flags |= BAM_FMUNMAP;
2193
0
                    cr->tlen = 0;
2194
0
                }
2195
0
                if (cr->flags & BAM_FUNMAP) {
2196
0
                    cr->tlen = 0;
2197
0
                }
2198
2199
                // set mate reversed if needed
2200
0
                if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2201
0
                    cr->flags |= BAM_FMREVERSE;
2202
0
            } else {
2203
0
                hts_log_error("Mate line out of bounds: %d vs [0, %d]",
2204
0
                              cr->mate_line, s->hdr->num_records-1);
2205
0
            }
2206
2207
            /* FIXME: construct read names here too if needed */
2208
0
        } else {
2209
0
            if (cr->mate_flags & CRAM_M_REVERSE) {
2210
0
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2211
0
            }
2212
0
            if (cr->mate_flags & CRAM_M_UNMAP) {
2213
0
                cr->flags |= BAM_FMUNMAP;
2214
                //cr->mate_ref_id = -1;
2215
0
            }
2216
0
            if (!(cr->flags & BAM_FPAIRED))
2217
0
                cr->mate_ref_id = -1;
2218
0
        }
2219
2220
0
        if (cr->tlen == INT64_MIN)
2221
0
            cr->tlen = 0; // Just incase
2222
0
    }
2223
2224
408
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2225
0
        cram_record *cr = &s->crecs[rec];
2226
0
        if (cr->explicit_tlen != INT64_MIN)
2227
0
            cr->tlen = cr->explicit_tlen;
2228
0
    }
2229
2230
408
    return 0;
2231
408
}
2232
2233
0
static char *md5_print(unsigned char *md5, char *out) {
2234
0
    int i;
2235
0
    for (i = 0; i < 16; i++) {
2236
0
        out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2237
0
        out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2238
0
    }
2239
0
    out[32] = 0;
2240
2241
0
    return out;
2242
0
}
2243
2244
/*
2245
 * Utility function to decode tlen (ISIZE), as it's called
2246
 * in multiple places.
2247
 *
2248
 * Returns codec return value (0 on success).
2249
 */
2250
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
2251
0
                            cram_block *blk, int64_t *tlen) {
2252
0
    int out_sz = 1, r = 0;
2253
2254
0
    if (!c->comp_hdr->codecs[DS_TS]) return -1;
2255
0
    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2256
0
        int32_t i32;
2257
0
        r |= c->comp_hdr->codecs[DS_TS]
2258
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2259
0
                     (char *)&i32, &out_sz);
2260
0
        *tlen = i32;
2261
0
    } else {
2262
0
        r |= c->comp_hdr->codecs[DS_TS]
2263
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2264
0
                     (char *)tlen, &out_sz);
2265
0
    }
2266
0
    return r;
2267
0
}
2268
2269
/*
2270
 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2271
 * Returns 0 on success
2272
 *        -1 on failure
2273
 */
2274
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2275
480
                      sam_hdr_t *sh) {
2276
480
    cram_block *blk = s->block[0];
2277
480
    int32_t bf, ref_id;
2278
480
    unsigned char cf;
2279
480
    int out_sz, r = 0;
2280
480
    int rec;
2281
480
    char *seq = NULL, *qual = NULL;
2282
480
    int unknown_rg = -1;
2283
480
    int embed_ref;
2284
480
    char **refs = NULL;
2285
480
    uint32_t ds;
2286
480
    sam_hrecs_t *bfd = sh->hrecs;
2287
2288
480
    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2289
9
        return -1;
2290
2291
471
    ds = s->data_series;
2292
2293
471
    blk->bit = 7; // MSB first
2294
2295
    // Study the blocks and estimate approx sizes to preallocate.
2296
    // This looks to speed up decoding by around 8-9%.
2297
    // We can always shrink back down at the end if we overestimated.
2298
    // However it's likely that this also saves memory as own growth
2299
    // factor (*=1.5) is never applied.
2300
471
    {
2301
471
        int qsize, nsize, q_id;
2302
471
        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2303
        //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2304
2305
471
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2306
471
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2307
471
        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2308
2309
        // To do - consider using q_id here to usurp the quality block and
2310
        // avoid a memcpy during decode.
2311
        // Specifically when quality is an external block uniquely used by
2312
        // DS_QS only, then we can set s->qual_blk directly to this
2313
        // block and save the codec->decode() calls. (Approx 3% cpu saving)
2314
471
    }
2315
2316
    /* Look for unknown RG, added as last by Java CRAM? */
2317
471
    if (bfd->nrg > 0 &&
2318
471
        bfd->rg[bfd->nrg-1].name != NULL &&
2319
471
        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2320
0
        unknown_rg = bfd->nrg-1;
2321
2322
471
    if (blk->content_type != CORE)
2323
6
        return -1;
2324
2325
465
    if (s->crecs)
2326
0
        free(s->crecs);
2327
465
    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2328
0
        return -1;
2329
2330
465
    ref_id = s->hdr->ref_seq_id;
2331
465
    if (CRAM_MAJOR_VERS(fd->version) < 4)
2332
465
       embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2333
0
    else
2334
0
       embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
2335
2336
465
    if (ref_id >= 0) {
2337
405
        if (embed_ref) {
2338
399
            cram_block *b;
2339
399
            if (s->hdr->ref_base_id < 0) {
2340
0
                hts_log_error("No reference specified and no embedded reference is available"
2341
0
                              " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2342
0
                              s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2343
0
                return -1;
2344
0
            }
2345
399
            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2346
399
            if (!b)
2347
27
                return -1;
2348
372
            if (cram_uncompress_block(b) != 0)
2349
0
                return -1;
2350
372
            s->ref = (char *)BLOCK_DATA(b);
2351
372
            s->ref_start = s->hdr->ref_seq_start;
2352
372
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2353
372
            if (s->hdr->ref_seq_span > b->uncomp_size) {
2354
3
                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2355
3
                              ref_id, s->ref_start, s->ref_end);
2356
3
                return -1;
2357
3
            }
2358
372
        } else if (!c->comp_hdr->no_ref) {
2359
            //// Avoid Java cramtools bug by loading entire reference seq
2360
            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2361
            //s->ref_start = 1;
2362
2363
6
            if (fd->required_fields & SAM_SEQ) {
2364
6
                s->ref =
2365
6
                cram_get_ref(fd, s->hdr->ref_seq_id,
2366
6
                             s->hdr->ref_seq_start,
2367
6
                             s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2368
6
            }
2369
6
            s->ref_start = s->hdr->ref_seq_start;
2370
6
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2371
2372
            /* Sanity check */
2373
6
            if (s->ref_start < 0) {
2374
0
                hts_log_warning("Slice starts before base 1"
2375
0
                                " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2376
0
                                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2377
0
                s->ref_start = 0;
2378
0
            }
2379
6
            pthread_mutex_lock(&fd->ref_lock);
2380
6
            pthread_mutex_lock(&fd->refs->lock);
2381
6
            if ((fd->required_fields & SAM_SEQ) &&
2382
6
                ref_id < fd->refs->nref && fd->refs->ref_id &&
2383
6
                s->ref_end > fd->refs->ref_id[ref_id]->length) {
2384
0
                s->ref_end = fd->refs->ref_id[ref_id]->length;
2385
0
            }
2386
6
            pthread_mutex_unlock(&fd->refs->lock);
2387
6
            pthread_mutex_unlock(&fd->ref_lock);
2388
6
        }
2389
405
    }
2390
2391
435
    if ((fd->required_fields & SAM_SEQ) &&
2392
435
        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2393
6
        hts_log_error("Unable to fetch reference #%d:%"PRId64"-%"PRId64"\n",
2394
6
                      ref_id, s->hdr->ref_seq_start,
2395
6
                      s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2396
6
        return -1;
2397
6
    }
2398
2399
429
    if (CRAM_MAJOR_VERS(fd->version) != 1
2400
429
        && (fd->required_fields & SAM_SEQ)
2401
429
        && s->hdr->ref_seq_id >= 0
2402
429
        && !fd->ignore_md5
2403
429
        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2404
0
        hts_md5_context *md5;
2405
0
        unsigned char digest[16];
2406
2407
0
        if (s->ref && s->hdr->ref_seq_id >= 0) {
2408
0
            int start, len;
2409
2410
0
            if (s->hdr->ref_seq_start >= s->ref_start) {
2411
0
                start = s->hdr->ref_seq_start - s->ref_start;
2412
0
            } else {
2413
0
                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2414
0
                                ref_id, s->ref_start, s->ref_end);
2415
0
                start = 0;
2416
0
            }
2417
2418
0
            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2419
0
                len = s->hdr->ref_seq_span;
2420
0
            } else {
2421
0
                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2422
0
                                ref_id, s->ref_start, s->ref_end);
2423
0
                len = s->ref_end - s->ref_start + 1;
2424
0
            }
2425
2426
0
            if (!(md5 = hts_md5_init()))
2427
0
                return -1;
2428
0
            if (start + len > s->ref_end - s->ref_start + 1)
2429
0
                len = s->ref_end - s->ref_start + 1 - start;
2430
0
            if (len >= 0)
2431
0
                hts_md5_update(md5, s->ref + start, len);
2432
0
            hts_md5_final(digest, md5);
2433
0
            hts_md5_destroy(md5);
2434
0
        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2435
0
            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2436
0
            if (b) {
2437
0
                if (!(md5 = hts_md5_init()))
2438
0
                    return -1;
2439
0
                hts_md5_update(md5, b->data, b->uncomp_size);
2440
0
                hts_md5_final(digest, md5);
2441
0
                hts_md5_destroy(md5);
2442
0
            }
2443
0
        }
2444
2445
0
        if (!c->comp_hdr->no_ref &&
2446
0
            ((!s->ref && s->hdr->ref_base_id < 0)
2447
0
             || memcmp(digest, s->hdr->md5, 16) != 0)) {
2448
0
            char M[33];
2449
0
            const char *rname = sam_hdr_tid2name(sh, ref_id);
2450
0
            if (!rname) rname="?"; // cannot happen normally
2451
0
            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
2452
0
                          rname, s->ref_start, s->ref_end);
2453
0
            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
2454
0
            hts_log_error("Ref   : %s", md5_print(digest, M));
2455
0
            kstring_t ks = KS_INITIALIZE;
2456
0
            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
2457
0
                hts_log_error("@SQ M5: %s", ks.s);
2458
0
            hts_log_error("Please check the reference given is correct");
2459
0
            ks_free(&ks);
2460
0
            return -1;
2461
0
        }
2462
0
    }
2463
2464
429
    if (ref_id == -2) {
2465
33
        pthread_mutex_lock(&fd->ref_lock);
2466
33
        pthread_mutex_lock(&fd->refs->lock);
2467
33
        refs = calloc(fd->refs->nref, sizeof(char *));
2468
33
        pthread_mutex_unlock(&fd->refs->lock);
2469
33
        pthread_mutex_unlock(&fd->ref_lock);
2470
33
        if (!refs)
2471
0
            return -1;
2472
33
    }
2473
2474
429
    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2475
429
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2476
21
        cram_record *cr = &s->crecs[rec];
2477
21
        int has_MD, has_NM;
2478
2479
        //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2480
2481
21
        cr->s = s;
2482
2483
21
        out_sz = 1; /* decode 1 item */
2484
21
        if (ds & CRAM_BF) {
2485
21
            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
2486
0
            r |= c->comp_hdr->codecs[DS_BF]
2487
0
                            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2488
0
                                     (char *)&bf, &out_sz);
2489
0
            if (r || bf < 0 ||
2490
0
                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2491
0
                goto block_err;
2492
0
            bf = fd->bam_flag_swap[bf];
2493
0
            cr->flags = bf;
2494
0
        } else {
2495
0
            cr->flags = bf = 0x4; // unmapped
2496
0
        }
2497
2498
0
        if (ds & CRAM_CF) {
2499
0
            if (CRAM_MAJOR_VERS(fd->version) == 1) {
2500
                /* CF is byte in 1.0, int32 in 2.0 */
2501
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2502
0
                r |= c->comp_hdr->codecs[DS_CF]
2503
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2504
0
                                         (char *)&cf, &out_sz);
2505
0
                if (r) goto block_err;
2506
0
                cr->cram_flags = cf;
2507
0
            } else {
2508
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2509
0
                r |= c->comp_hdr->codecs[DS_CF]
2510
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2511
0
                                         (char *)&cr->cram_flags, &out_sz);
2512
0
                if (r) goto block_err;
2513
0
                cf = cr->cram_flags;
2514
0
            }
2515
0
        } else {
2516
0
            cf = cr->cram_flags = 0;
2517
0
        }
2518
2519
0
        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2520
0
            if (ds & CRAM_RI) {
2521
0
                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
2522
0
                r |= c->comp_hdr->codecs[DS_RI]
2523
0
                                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2524
0
                                         (char *)&cr->ref_id, &out_sz);
2525
0
                if (r) goto block_err;
2526
0
                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2527
0
                    && cr->ref_id >= 0
2528
0
                    && cr->ref_id != last_ref_id) {
2529
0
                    if (!c->comp_hdr->no_ref) {
2530
                        // Range(fd):  seq >= 0, unmapped -1, unspecified   -2
2531
                        // Slice(s):   seq >= 0, unmapped -1, multiple refs -2
2532
                        // Record(cr): seq >= 0, unmapped -1
2533
0
                        pthread_mutex_lock(&fd->range_lock);
2534
0
                        int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
2535
0
                        pthread_mutex_unlock(&fd->range_lock);
2536
0
                        if  (need_ref) {
2537
0
                            if (!refs[cr->ref_id])
2538
0
                                refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
2539
0
                            if (!(s->ref = refs[cr->ref_id]))
2540
0
                                goto block_err;
2541
0
                        } else {
2542
                            // For multi-ref containers, we don't need to fetch all
2543
                            // refs if we're only querying one.
2544
0
                            s->ref = NULL;
2545
0
                        }
2546
2547
0
                        pthread_mutex_lock(&fd->range_lock);
2548
0
                        int discard_last_ref = (last_ref_id >= 0 &&
2549
0
                                                refs[last_ref_id] &&
2550
0
                                                (fd->range.refid == -2 ||
2551
0
                                                 last_ref_id == fd->range.refid));
2552
0
                        pthread_mutex_unlock(&fd->range_lock);
2553
0
                        if (discard_last_ref) {
2554
0
                            pthread_mutex_lock(&fd->ref_lock);
2555
0
                            discard_last_ref = !fd->unsorted;
2556
0
                            pthread_mutex_unlock(&fd->ref_lock);
2557
0
                        }
2558
0
                        if (discard_last_ref) {
2559
0
                            cram_ref_decr(fd->refs, last_ref_id);
2560
0
                            refs[last_ref_id] = NULL;
2561
0
                        }
2562
0
                    }
2563
0
                    s->ref_start = 1;
2564
0
                    pthread_mutex_lock(&fd->ref_lock);
2565
0
                    pthread_mutex_lock(&fd->refs->lock);
2566
0
                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2567
0
                    pthread_mutex_unlock(&fd->refs->lock);
2568
0
                    pthread_mutex_unlock(&fd->ref_lock);
2569
2570
0
                    last_ref_id = cr->ref_id;
2571
0
                }
2572
0
            } else {
2573
0
                cr->ref_id = -1;
2574
0
            }
2575
0
        } else {
2576
0
            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2577
0
        }
2578
0
        if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
2579
0
            hts_log_error("Requested unknown reference ID %d", cr->ref_id);
2580
0
            goto block_err;
2581
0
        }
2582
2583
0
        if (ds & CRAM_RL) {
2584
0
            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
2585
0
            r |= c->comp_hdr->codecs[DS_RL]
2586
0
                            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2587
0
                                     (char *)&cr->len, &out_sz);
2588
0
            if (r) goto block_err;
2589
0
            if (cr->len < 0) {
2590
0
                hts_log_error("Read has negative length");
2591
0
                goto block_err;
2592
0
            }
2593
0
        }
2594
2595
0
        if (ds & CRAM_AP) {
2596
0
            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
2597
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
2598
0
                r |= c->comp_hdr->codecs[DS_AP]
2599
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2600
0
                                         (char *)&cr->apos, &out_sz);
2601
0
            } else  {
2602
0
                int32_t i32;
2603
0
                r |= c->comp_hdr->codecs[DS_AP]
2604
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2605
0
                                         (char *)&i32, &out_sz);
2606
0
                cr->apos = i32;
2607
0
            }
2608
0
            if (r) goto block_err;;
2609
0
            if (c->comp_hdr->AP_delta) {
2610
0
                if (cr->apos < 0 && c->unsorted == 0) {
2611
                    // cache locally in c->unsorted so we don't have an
2612
                    // excessive number of locks
2613
0
                    pthread_mutex_lock(&fd->ref_lock);
2614
0
                    c->unsorted = fd->unsorted = 1;
2615
0
                    pthread_mutex_unlock(&fd->ref_lock);
2616
0
                }
2617
0
                cr->apos += s->last_apos;
2618
0
            }
2619
0
            s->last_apos=  cr->apos;
2620
0
        } else {
2621
0
            cr->apos = c->ref_seq_start;
2622
0
        }
2623
2624
0
        if (ds & CRAM_RG) {
2625
0
            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
2626
0
            r |= c->comp_hdr->codecs[DS_RG]
2627
0
                           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2628
0
                                    (char *)&cr->rg, &out_sz);
2629
0
            if (r) goto block_err;
2630
0
            if (cr->rg == unknown_rg)
2631
0
                cr->rg = -1;
2632
0
        } else {
2633
0
            cr->rg = -1;
2634
0
        }
2635
2636
0
        cr->name_len = 0;
2637
2638
0
        if (c->comp_hdr->read_names_included) {
2639
0
            int32_t out_sz2 = 1;
2640
2641
            // Read directly into name cram_block
2642
0
            cr->name = BLOCK_SIZE(s->name_blk);
2643
0
            if (ds & CRAM_RN) {
2644
0
                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2645
0
                r |= c->comp_hdr->codecs[DS_RN]
2646
0
                                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2647
0
                                         (char *)s->name_blk, &out_sz2);
2648
0
                if (r) goto block_err;
2649
0
                cr->name_len = out_sz2;
2650
0
            }
2651
0
        }
2652
2653
0
        cr->mate_pos = 0;
2654
0
        cr->mate_line = -1;
2655
0
        cr->mate_ref_id = -1;
2656
0
        cr->explicit_tlen = INT64_MIN;
2657
0
        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2658
0
            if (ds & CRAM_MF) {
2659
0
                if (CRAM_MAJOR_VERS(fd->version) == 1) {
2660
                    /* MF is byte in 1.0, int32 in 2.0 */
2661
0
                    unsigned char mf;
2662
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2663
0
                    r |= c->comp_hdr->codecs[DS_MF]
2664
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2665
0
                                             blk, (char *)&mf, &out_sz);
2666
0
                    if (r) goto block_err;
2667
0
                    cr->mate_flags = mf;
2668
0
                } else {
2669
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2670
0
                    r |= c->comp_hdr->codecs[DS_MF]
2671
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2672
0
                                             blk,
2673
0
                                             (char *)&cr->mate_flags,
2674
0
                                             &out_sz);
2675
0
                    if (r) goto block_err;
2676
0
                }
2677
0
            } else {
2678
0
                cr->mate_flags = 0;
2679
0
            }
2680
2681
0
            if (!c->comp_hdr->read_names_included) {
2682
0
                int32_t out_sz2 = 1;
2683
2684
                // Read directly into name cram_block
2685
0
                cr->name = BLOCK_SIZE(s->name_blk);
2686
0
                if (ds & CRAM_RN) {
2687
0
                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2688
0
                    r |= c->comp_hdr->codecs[DS_RN]
2689
0
                                    ->decode(s, c->comp_hdr->codecs[DS_RN],
2690
0
                                             blk, (char *)s->name_blk,
2691
0
                                             &out_sz2);
2692
0
                    if (r) goto block_err;
2693
0
                    cr->name_len = out_sz2;
2694
0
                }
2695
0
            }
2696
2697
0
            if (ds & CRAM_NS) {
2698
0
                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
2699
0
                r |= c->comp_hdr->codecs[DS_NS]
2700
0
                                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2701
0
                                         (char *)&cr->mate_ref_id, &out_sz);
2702
0
                if (r) goto block_err;
2703
0
            }
2704
2705
            // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2706
            // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2707
            //     /* Paired, but unmapped */
2708
            //     cr->flags |= BAM_FMUNMAP;
2709
            // }
2710
2711
0
            if (ds & CRAM_NP) {
2712
0
                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
2713
0
                if (CRAM_MAJOR_VERS(fd->version) < 4) {
2714
0
                    int32_t i32;
2715
0
                    r |= c->comp_hdr->codecs[DS_NP]
2716
0
                                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2717
0
                                             (char *)&i32, &out_sz);
2718
0
                    cr->mate_pos = i32;
2719
0
                } else {
2720
0
                    r |= c->comp_hdr->codecs[DS_NP]
2721
0
                        ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2722
0
                                 (char *)&cr->mate_pos, &out_sz);
2723
0
                }
2724
0
                if (r) goto block_err;
2725
0
            }
2726
2727
0
            if (ds & CRAM_TS) {
2728
0
                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
2729
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
2730
0
                if (r) goto block_err;
2731
0
            } else {
2732
0
                cr->tlen = INT64_MIN;
2733
0
            }
2734
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2735
            // else not detached
2736
0
            if (ds & CRAM_NF) {
2737
0
                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
2738
0
                r |= c->comp_hdr->codecs[DS_NF]
2739
0
                                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2740
0
                                         (char *)&cr->mate_line, &out_sz);
2741
0
                if (r) goto block_err;
2742
0
                cr->mate_line += rec + 1;
2743
2744
                //cr->name_len = sprintf(name, "%d", name_id++);
2745
                //cr->name = DSTRING_LEN(name_ds);
2746
                //dstring_nappend(name_ds, name, cr->name_len);
2747
2748
0
                cr->mate_ref_id = -1;
2749
0
                cr->tlen = INT64_MIN;
2750
0
                cr->mate_pos = 0;
2751
0
            } else  {
2752
0
                cr->mate_flags = 0;
2753
0
                cr->tlen = INT64_MIN;
2754
0
            }
2755
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2756
0
                if (ds & CRAM_TS) {
2757
0
                    r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2758
0
                    if (r) return r;
2759
0
                } else {
2760
0
                    cr->mate_flags = 0;
2761
0
                    cr->tlen = INT64_MIN;
2762
0
                }
2763
0
            }
2764
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2765
0
            if (ds & CRAM_TS) {
2766
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2767
0
                if (r) return r;
2768
0
            } else {
2769
0
                cr->mate_flags = 0;
2770
0
                cr->tlen = INT64_MIN;
2771
0
            }
2772
0
        } else {
2773
0
            cr->mate_flags = 0;
2774
0
            cr->tlen = INT64_MIN;
2775
0
        }
2776
        /*
2777
        else if (!name[0]) {
2778
            //name[0] = '?'; name[1] = 0;
2779
            //cr->name_len = 1;
2780
            //cr->name=  DSTRING_LEN(s->name_ds);
2781
            //dstring_nappend(s->name_ds, "?", 1);
2782
2783
            cr->mate_ref_id = -1;
2784
            cr->tlen = 0;
2785
            cr->mate_pos = 0;
2786
        }
2787
        */
2788
2789
        /* Auxiliary tags */
2790
0
        has_MD = has_NM = 0;
2791
0
        if (CRAM_MAJOR_VERS(fd->version) == 1)
2792
0
            r |= cram_decode_aux_1_0(c, s, blk, cr);
2793
0
        else
2794
0
            r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
2795
0
        if (r) goto block_err;
2796
2797
        /* Fake up dynamic string growth and appending */
2798
0
        if (ds & CRAM_RL) {
2799
0
            cr->seq = BLOCK_SIZE(s->seqs_blk);
2800
0
            BLOCK_GROW(s->seqs_blk, cr->len);
2801
0
            seq = (char *)BLOCK_END(s->seqs_blk);
2802
0
            BLOCK_SIZE(s->seqs_blk) += cr->len;
2803
2804
0
            if (!seq)
2805
0
                goto block_err;
2806
2807
0
            cr->qual = BLOCK_SIZE(s->qual_blk);
2808
0
            BLOCK_GROW(s->qual_blk, cr->len);
2809
0
            qual = (char *)BLOCK_END(s->qual_blk);
2810
0
            BLOCK_SIZE(s->qual_blk) += cr->len;
2811
2812
0
            if (!s->ref)
2813
0
                memset(seq, '=', cr->len);
2814
0
        }
2815
2816
0
        if (!(bf & BAM_FUNMAP)) {
2817
0
            if ((ds & CRAM_AP) && cr->apos <= 0) {
2818
0
                hts_log_error("Read has alignment position %"PRId64
2819
0
                              " but no unmapped flag",
2820
0
                              cr->apos);
2821
0
                goto block_err;
2822
0
            }
2823
            /* Decode sequence and generate CIGAR */
2824
0
            if (ds & (CRAM_SEQ | CRAM_MQ)) {
2825
0
                r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
2826
0
                                     has_MD, has_NM);
2827
0
                if (r) goto block_err;
2828
0
            } else {
2829
0
                cr->cigar = 0;
2830
0
                cr->ncigar = 0;
2831
0
                cr->aend = cr->apos;
2832
0
                cr->mqual = 0;
2833
0
            }
2834
0
        } else {
2835
0
            int out_sz2 = cr->len;
2836
2837
            //puts("Unmapped");
2838
0
            cr->cigar = 0;
2839
0
            cr->ncigar = 0;
2840
0
            cr->aend = cr->apos;
2841
0
            cr->mqual = 0;
2842
2843
0
            if (ds & CRAM_BA && cr->len) {
2844
0
                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
2845
0
                r |= c->comp_hdr->codecs[DS_BA]
2846
0
                                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2847
0
                                         (char *)seq, &out_sz2);
2848
0
                if (r) goto block_err;
2849
0
            }
2850
2851
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2852
0
                out_sz2 = cr->len;
2853
0
                if (ds & CRAM_QS && cr->len >= 0) {
2854
0
                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
2855
0
                    r |= c->comp_hdr->codecs[DS_QS]
2856
0
                                    ->decode(s, c->comp_hdr->codecs[DS_QS],
2857
0
                                             blk, qual, &out_sz2);
2858
0
                    if (r) goto block_err;
2859
0
                }
2860
0
            } else {
2861
0
                if (ds & CRAM_RL)
2862
0
                    memset(qual, 255, cr->len);
2863
0
            }
2864
0
        }
2865
2866
0
        if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
2867
0
            int i, j;
2868
0
            for (i = 0, j = cr->len-1; i < j; i++, j--) {
2869
0
                unsigned char c;
2870
0
                c = qual[i];
2871
0
                qual[i] = qual[j];
2872
0
                qual[j] = c;
2873
0
            }
2874
0
        }
2875
0
    }
2876
2877
408
    pthread_mutex_lock(&fd->ref_lock);
2878
408
    if (refs) {
2879
18
        int i;
2880
27
        for (i = 0; i < fd->refs->nref; i++) {
2881
9
            if (refs[i])
2882
0
                cram_ref_decr(fd->refs, i);
2883
9
        }
2884
18
        free(refs);
2885
18
        refs = NULL;
2886
390
    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2887
0
        cram_ref_decr(fd->refs, ref_id);
2888
0
    }
2889
408
    pthread_mutex_unlock(&fd->ref_lock);
2890
2891
    /* Resolve mate pair cross-references between recs within this slice */
2892
408
    r |= cram_decode_slice_xref(s, fd->required_fields);
2893
2894
    // Free the original blocks as we no longer need these.
2895
408
    {
2896
408
        int i;
2897
2.05k
        for (i = 0; i < s->hdr->num_blocks; i++) {
2898
1.64k
            cram_block *b = s->block[i];
2899
1.64k
            cram_free_block(b);
2900
1.64k
            s->block[i] = NULL;
2901
1.64k
        }
2902
408
    }
2903
2904
    // Also see initial BLOCK_RESIZE_EXACT at top of function.
2905
    // As we grow blocks we overallocate by up to 50%. So shrink
2906
    // back to their final sizes here.
2907
    //
2908
    //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2909
    //      (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2910
    //      (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2911
    //      (int)s->name_blk->byte, (int)s->name_blk->alloc,
2912
    //      (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2913
408
    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2914
408
    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2915
408
    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2916
408
    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2917
2918
408
    return r;
2919
2920
21
 block_err:
2921
21
    if (refs) {
2922
15
        int i;
2923
15
        pthread_mutex_lock(&fd->ref_lock);
2924
45
        for (i = 0; i < fd->refs->nref; i++) {
2925
30
            if (refs[i])
2926
0
                cram_ref_decr(fd->refs, i);
2927
30
        }
2928
15
        free(refs);
2929
15
        pthread_mutex_unlock(&fd->ref_lock);
2930
15
    }
2931
2932
21
    return -1;
2933
408
}
2934
2935
typedef struct {
2936
    cram_fd *fd;
2937
    cram_container *c;
2938
    cram_slice *s;
2939
    sam_hdr_t *h;
2940
    int exit_code;
2941
} cram_decode_job;
2942
2943
0
void *cram_decode_slice_thread(void *arg) {
2944
0
    cram_decode_job *j = (cram_decode_job *)arg;
2945
2946
0
    j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2947
2948
0
    return j;
2949
0
}
2950
2951
/*
2952
 * Spawn a multi-threaded version of cram_decode_slice().
2953
 */
2954
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2955
480
                         sam_hdr_t *bfd) {
2956
480
    cram_decode_job *j;
2957
480
    int nonblock;
2958
2959
480
    if (!fd->pool)
2960
480
        return cram_decode_slice(fd, c, s, bfd);
2961
2962
0
    if (!(j = malloc(sizeof(*j))))
2963
0
        return -1;
2964
2965
0
    j->fd = fd;
2966
0
    j->c  = c;
2967
0
    j->s  = s;
2968
0
    j->h  = bfd;
2969
2970
0
    nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2971
2972
0
    int saved_errno = errno;
2973
0
    errno = 0;
2974
0
    if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2975
0
                                  j, nonblock)) {
2976
        /* Would block */
2977
0
        if (errno != EAGAIN)
2978
0
            return -1;
2979
0
        fd->job_pending = j;
2980
0
    } else {
2981
0
        fd->job_pending = NULL;
2982
0
    }
2983
0
    errno = saved_errno;
2984
2985
    // flush too
2986
0
    return 0;
2987
0
}
2988
2989
2990
/* ----------------------------------------------------------------------
2991
 * CRAM sequence iterators.
2992
 */
2993
2994
/*
2995
 * Converts a cram in-memory record into a bam in-memory record. We
2996
 * pass a pointer to a bam_seq_t pointer along with the a pointer to
2997
 * the allocated size. These can initially be pointers to NULL and zero.
2998
 *
2999
 * This function will reallocate the bam buffer as required and update
3000
 * (*bam)->alloc accordingly, allowing it to be used within a loop
3001
 * efficiently without needing to allocate new bam objects over and
3002
 * over again.
3003
 *
3004
 * Returns the used size of the bam record on success
3005
 *         -1 on failure.
3006
 */
3007
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
3008
0
                cram_record *cr, int rec, bam_seq_t **bam) {
3009
0
    int ret, rg_len;
3010
0
    char name_a[1024], *name;
3011
0
    int name_len;
3012
0
    char *aux;
3013
0
    char *seq, *qual;
3014
0
    sam_hrecs_t *bfd = sh->hrecs;
3015
3016
    /* Assign names if not explicitly set */
3017
0
    if (fd->required_fields & SAM_QNAME) {
3018
0
        if (cr->name_len) {
3019
0
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3020
0
            name_len = cr->name_len;
3021
0
        } else {
3022
0
            name = name_a;
3023
0
            if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
3024
0
                s->crecs[cr->mate_line].name_len > 0) {
3025
                // Copy our mate if non-zero.
3026
0
                memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
3027
0
                       s->crecs[cr->mate_line].name_len);
3028
0
                name = name_a + s->crecs[cr->mate_line].name_len;
3029
0
            } else {
3030
                // Otherwise generate a name based on prefix
3031
0
                name_len = strlen(fd->prefix);
3032
0
                memcpy(name, fd->prefix, name_len);
3033
0
                name += name_len;
3034
0
                *name++ = ':';
3035
0
                if (cr->mate_line >= 0 && cr->mate_line < rec) {
3036
0
                    name = (char *)append_uint64((unsigned char *)name,
3037
0
                                                 s->hdr->record_counter +
3038
0
                                                 cr->mate_line + 1);
3039
0
                } else {
3040
0
                    name = (char *)append_uint64((unsigned char *)name,
3041
0
                                                 s->hdr->record_counter +
3042
0
                                                 rec + 1);
3043
0
                }
3044
0
            }
3045
0
            name_len = name - name_a;
3046
0
            name = name_a;
3047
0
        }
3048
0
    } else {
3049
0
        name = "?";
3050
0
        name_len = 1;
3051
0
    }
3052
3053
    /* Generate BAM record */
3054
0
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
3055
0
        return -1;
3056
0
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3057
3058
0
    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3059
0
        if (!BLOCK_DATA(s->seqs_blk))
3060
0
            return -1;
3061
0
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3062
0
    } else {
3063
0
        seq = "*";
3064
0
        cr->len = 0;
3065
0
    }
3066
3067
0
    if (fd->required_fields & SAM_QUAL) {
3068
0
        if (!BLOCK_DATA(s->qual_blk))
3069
0
            return -1;
3070
0
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3071
0
    } else {
3072
0
        qual = NULL;
3073
0
    }
3074
3075
0
    ret = bam_set1(*bam,
3076
0
                   name_len, name,
3077
0
                   cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
3078
0
                   cr->ncigar, &s->cigar[cr->cigar],
3079
0
                   cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
3080
0
                   cr->len, seq, qual,
3081
0
                   cr->aux_size + rg_len);
3082
0
    if (ret < 0) {
3083
0
        return ret;
3084
0
    }
3085
3086
0
    aux = (char *)bam_aux(*bam);
3087
3088
    /* Auxiliary strings */
3089
0
    if (cr->aux_size != 0) {
3090
0
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3091
0
        aux += cr->aux_size;
3092
0
        (*bam)->l_data += cr->aux_size;
3093
0
    }
3094
3095
    /* RG:Z: */
3096
0
    if (rg_len > 0) {
3097
0
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3098
0
        int len = bfd->rg[cr->rg].name_len;
3099
0
        memcpy(aux, bfd->rg[cr->rg].name, len);
3100
0
        aux += len;
3101
0
        *aux++ = 0;
3102
0
        (*bam)->l_data += rg_len;
3103
0
    }
3104
3105
0
    return (*bam)->l_data;
3106
0
}
3107
3108
/*
3109
 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3110
 */
3111
3112
/*
3113
 * Load first container.
3114
 * Called when fd->ctr is NULL>
3115
 *
3116
 * Returns container on success
3117
 *        NULL on failure.
3118
 */
3119
8.48k
static cram_container *cram_first_slice(cram_fd *fd) {
3120
8.48k
    cram_container *c;
3121
3122
17.2k
    do {
3123
17.2k
        if (fd->ctr)
3124
8.76k
            cram_free_container(fd->ctr);
3125
3126
17.2k
        if (!(c = fd->ctr = cram_read_container(fd)))
3127
647
            return NULL;
3128
16.6k
        c->curr_slice_mt = c->curr_slice;
3129
16.6k
    } while (c->length == 0);
3130
3131
    /*
3132
     * The first container may be a result of a sub-range query.
3133
     * In which case it may still not be the optimal starting point
3134
     * due to skipped containers/slices in the index.
3135
     */
3136
    // No need for locks here as we're in the main thread.
3137
7.84k
    if (fd->range.refid != -2) {
3138
0
        while (c->ref_seq_id != -2 &&
3139
0
               (c->ref_seq_id < fd->range.refid ||
3140
0
                (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3141
0
                 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3142
0
            if (0 != cram_seek(fd, c->length, SEEK_CUR))
3143
0
                return NULL;
3144
0
            cram_free_container(fd->ctr);
3145
0
            do {
3146
0
                if (!(c = fd->ctr = cram_read_container(fd)))
3147
0
                    return NULL;
3148
0
            } while (c->length == 0);
3149
0
        }
3150
3151
0
        if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3152
0
            fd->eof = 1;
3153
0
            return NULL;
3154
0
        }
3155
0
    }
3156
3157
7.84k
    if (!(c->comp_hdr_block = cram_read_block(fd)))
3158
255
        return NULL;
3159
7.58k
    if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3160
56
        return NULL;
3161
3162
7.52k
    c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3163
7.52k
    if (!c->comp_hdr)
3164
7.13k
        return NULL;
3165
393
    if (!c->comp_hdr->AP_delta &&
3166
393
        sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3167
0
        pthread_mutex_lock(&fd->ref_lock);
3168
0
        fd->unsorted = 1;
3169
0
        pthread_mutex_unlock(&fd->ref_lock);
3170
0
    }
3171
3172
393
    return c;
3173
7.52k
}
3174
3175
8.89k
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3176
8.89k
    cram_container *c_curr;  // container being consumed via cram_get_seq()
3177
8.89k
    cram_slice *s_curr = NULL;
3178
3179
    // Populate the first container if unknown.
3180
8.89k
    if (!(c_curr = fd->ctr)) {
3181
8.48k
        if (!(c_curr = cram_first_slice(fd)))
3182
8.09k
            return NULL;
3183
8.48k
    }
3184
3185
    // Discard previous slice
3186
801
    if ((s_curr = c_curr->slice)) {
3187
408
        c_curr->slice = NULL;
3188
408
        cram_free_slice(s_curr);
3189
408
        s_curr = NULL;
3190
408
    }
3191
3192
    // If we've consumed all slices in this container, also discard
3193
    // the container too.
3194
801
    if (c_curr->curr_slice == c_curr->max_slice) {
3195
99
        if (fd->ctr == c_curr)
3196
99
            fd->ctr = NULL;
3197
99
        if (fd->ctr_mt == c_curr)
3198
18
            fd->ctr_mt = NULL;
3199
99
        cram_free_container(c_curr);
3200
99
        c_curr = NULL;
3201
99
    }
3202
3203
801
    if (!fd->ctr_mt)
3204
411
        fd->ctr_mt = c_curr;
3205
3206
    // Fetch the next slice (and the container if necessary).
3207
    //
3208
    // If single threaded this loop bails out as soon as it finds
3209
    // a slice in range.  In this case c_next and c_curr end up being
3210
    // the same thing.
3211
    //
3212
    // If multi-threaded, we loop until we have filled out
3213
    // thread pool input queue.  Here c_next and c_curr *may* differ, as
3214
    // can fd->ctr and fd->ctr_mt.
3215
801
    for (;;) {
3216
801
        cram_container *c_next = fd->ctr_mt;
3217
801
        cram_slice *s_next = NULL;
3218
3219
        // Next slice; either from the last job we failed to push
3220
        // to the input queue or via more I/O.
3221
801
        if (fd->job_pending) {
3222
0
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3223
0
            c_next = j->c;
3224
0
            s_next = j->s;
3225
0
            free(fd->job_pending);
3226
0
            fd->job_pending = NULL;
3227
801
        } else if (!fd->ooc) {
3228
1.05k
        empty_container:
3229
1.05k
            if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3230
                // new container
3231
1.42k
                for(;;) {
3232
1.42k
                    if (!(c_next = cram_read_container(fd))) {
3233
30
                        if (fd->pool) {
3234
0
                            fd->ooc = 1;
3235
0
                            break;
3236
0
                        }
3237
3238
30
                        return NULL;
3239
30
                    }
3240
1.39k
                    c_next->curr_slice_mt = c_next->curr_slice;
3241
3242
1.39k
                    if (c_next->length != 0)
3243
327
                        break;
3244
3245
1.07k
                    cram_free_container(c_next);
3246
1.07k
                }
3247
327
                if (fd->ooc)
3248
0
                    break;
3249
3250
//                printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd,
3251
//                       c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1,
3252
//                       fd->range.refid, fd->range.start, fd->range.end);
3253
3254
                /* Skip containers not yet spanning our range */
3255
327
                if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3256
                    // ref_id beyond end of range; bail out
3257
0
                    if (c_next->ref_seq_id != fd->range.refid) {
3258
0
                        cram_free_container(c_next);
3259
0
                        fd->ctr_mt = NULL;
3260
0
                        fd->ooc = 1;
3261
0
                        break;
3262
0
                    }
3263
3264
                    // position beyond end of range; bail out
3265
0
                    if (fd->range.refid != -1 &&
3266
0
                        c_next->ref_seq_start > fd->range.end) {
3267
0
                        cram_free_container(c_next);
3268
0
                        fd->ctr_mt = NULL;
3269
0
                        fd->ooc = 1;
3270
0
                        break;
3271
0
                    }
3272
3273
                    // before start of range; skip to next container
3274
0
                    if (fd->range.refid != -1 &&
3275
0
                        c_next->ref_seq_start + c_next->ref_seq_span-1 <
3276
0
                        fd->range.start) {
3277
0
                        c_next->curr_slice_mt = c_next->max_slice;
3278
0
                        cram_seek(fd, c_next->length, SEEK_CUR);
3279
0
                        cram_free_container(c_next);
3280
0
                        c_next = NULL;
3281
0
                        continue;
3282
0
                    }
3283
0
                }
3284
3285
                // Container is valid range, so remember it for restarting
3286
                // this function.
3287
327
                fd->ctr_mt = c_next;
3288
3289
327
                if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3290
36
                    return NULL;
3291
291
                if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3292
18
                    return NULL;
3293
3294
273
                c_next->comp_hdr =
3295
273
                    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3296
273
                if (!c_next->comp_hdr)
3297
39
                    return NULL;
3298
3299
234
                if (!c_next->comp_hdr->AP_delta &&
3300
234
                    sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3301
0
                    pthread_mutex_lock(&fd->ref_lock);
3302
0
                    fd->unsorted = 1;
3303
0
                    pthread_mutex_unlock(&fd->ref_lock);
3304
0
                }
3305
234
            }
3306
3307
936
            if (c_next->num_records == 0) {
3308
258
                if (fd->ctr == c_next)
3309
36
                    fd->ctr = NULL;
3310
258
                if (c_curr == c_next)
3311
36
                    c_curr = NULL;
3312
258
                if (fd->ctr_mt == c_next)
3313
258
                    fd->ctr_mt = NULL;
3314
258
                cram_free_container(c_next);
3315
258
                c_next = NULL;
3316
258
                goto empty_container;
3317
258
            }
3318
3319
678
            if (!(s_next = c_next->slice = cram_read_slice(fd)))
3320
198
                return NULL;
3321
3322
480
            s_next->slice_num = ++c_next->curr_slice_mt;
3323
480
            s_next->curr_rec = 0;
3324
480
            s_next->max_rec = s_next->hdr->num_records;
3325
3326
480
            s_next->last_apos = s_next->hdr->ref_seq_start;
3327
3328
            // We know the container overlaps our range, but with multi-slice
3329
            // containers we may have slices that do not.  Skip these also.
3330
480
            if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3331
                // ref_id beyond end of range; bail out
3332
0
                if (s_next->hdr->ref_seq_id != fd->range.refid) {
3333
0
                    fd->ooc = 1;
3334
0
                    cram_free_slice(s_next);
3335
0
                    c_next->slice = s_next = NULL;
3336
0
                    break;
3337
0
                }
3338
3339
                // position beyond end of range; bail out
3340
0
                if (fd->range.refid != -1 &&
3341
0
                    s_next->hdr->ref_seq_start > fd->range.end) {
3342
0
                    fd->ooc = 1;
3343
0
                    cram_free_slice(s_next);
3344
0
                    c_next->slice = s_next = NULL;
3345
0
                    break;
3346
0
                }
3347
3348
                // before start of range; skip to next slice
3349
0
                if (fd->range.refid != -1 &&
3350
0
                    s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3351
0
                    fd->range.start) {
3352
0
                    cram_free_slice(s_next);
3353
0
                    c_next->slice = s_next = NULL;
3354
0
                    continue;
3355
0
                }
3356
0
            }
3357
480
        } // end: if (!fd->ooc)
3358
3359
480
        if (!c_next || !s_next)
3360
0
            break;
3361
3362
        // Decode the slice, either right now (non-threaded) or by pushing
3363
        // it to the a decode queue (threaded).
3364
480
        if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3365
72
            hts_log_error("Failure to decode slice");
3366
72
            cram_free_slice(s_next);
3367
72
            c_next->slice = NULL;
3368
72
            return NULL;
3369
72
        }
3370
3371
        // No thread pool, so don't loop again
3372
408
        if (!fd->pool) {
3373
408
            c_curr = c_next;
3374
408
            s_curr = s_next;
3375
408
            break;
3376
408
        }
3377
3378
        // With thread pool, but we have a job pending so our decode queue
3379
        // is full.
3380
0
        if (fd->job_pending)
3381
0
            break;
3382
3383
        // Otherwise we're threaded with room in the decode input queue, so
3384
        // keep reading slices for decode.
3385
        // Push it a bit far, to qsize in queue rather than pending arrival,
3386
        // as cram tends to be a bit bursty in decode timings.
3387
0
        if (hts_tpool_process_len(fd->rqueue) >
3388
0
            hts_tpool_process_qsize(fd->rqueue))
3389
0
            break;
3390
0
    } // end of for(;;)
3391
3392
3393
    // When not threaded we've already have c_curr and s_curr.
3394
    // Otherwise we need get them by pulling off the decode output queue.
3395
408
    if (fd->pool) {
3396
0
        hts_tpool_result *res;
3397
0
        cram_decode_job *j;
3398
3399
0
        if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
3400
0
            fd->eof = 1;
3401
0
            return NULL;
3402
0
        }
3403
3404
0
        res = hts_tpool_next_result_wait(fd->rqueue);
3405
3406
0
        if (!res || !hts_tpool_result_data(res)) {
3407
0
            hts_log_error("Call to hts_tpool_next_result failed");
3408
0
            return NULL;
3409
0
        }
3410
3411
0
        j = (cram_decode_job *)hts_tpool_result_data(res);
3412
0
        c_curr = j->c;
3413
0
        s_curr = j->s;
3414
3415
0
        if (j->exit_code != 0) {
3416
0
            hts_log_error("Slice decode failure");
3417
0
            fd->eof = 0;
3418
0
            hts_tpool_delete_result(res, 1);
3419
0
            return NULL;
3420
0
        }
3421
3422
0
        hts_tpool_delete_result(res, 1);
3423
0
    }
3424
3425
408
    *cp = c_curr;
3426
3427
    // Update current slice being processed (as opposed to current
3428
    // slice in the multi-threaded reahead.
3429
408
    fd->ctr = c_curr;
3430
408
    if (c_curr) {
3431
408
        c_curr->slice = s_curr;
3432
408
        if (s_curr)
3433
408
            c_curr->curr_slice = s_curr->slice_num;
3434
408
    }
3435
408
    if (s_curr)
3436
408
        s_curr->curr_rec = 0;
3437
0
    else
3438
0
        fd->eof = 1;
3439
3440
408
    return s_curr;
3441
408
}
3442
3443
/*
3444
 * Read the next cram record and return it.
3445
 * Note that to decode cram_record the caller will need to look up some data
3446
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3447
 * the next call to cram_get_seq (which may invalidate it).
3448
 *
3449
 * Returns record pointer on success (do not free)
3450
 *        NULL on failure
3451
 */
3452
8.48k
cram_record *cram_get_seq(cram_fd *fd) {
3453
8.48k
    cram_container *c;
3454
8.48k
    cram_slice *s;
3455
3456
8.89k
    for (;;) {
3457
8.89k
        c = fd->ctr;
3458
8.89k
        if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3459
0
            s = c->slice;
3460
8.89k
        } else {
3461
8.89k
            if (!(s = cram_next_slice(fd, &c)))
3462
8.48k
                return NULL;
3463
408
            continue; /* In case slice contains no records */
3464
8.89k
        }
3465
3466
        // No need to lock here as get_seq is running in the main thread,
3467
        // which is also the same one that does the range modifications.
3468
0
        if (fd->range.refid != -2) {
3469
0
            if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3470
                // Special case when looking for unmapped blocks at end.
3471
                // If these are mixed in with mapped data (c->ref_id == -2)
3472
                // then we need skip until we find the unmapped data, if at all
3473
0
                s->curr_rec++;
3474
0
                continue;
3475
0
            }
3476
0
            if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3477
0
                s->crecs[s->curr_rec].ref_id != -1) {
3478
                // Looking for a mapped read, but not there yet.  Special case
3479
                // as -1 (unmapped) shouldn't be considered < refid.
3480
0
                s->curr_rec++;
3481
0
                continue;
3482
0
            }
3483
3484
0
            if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3485
0
                fd->eof = 1;
3486
0
                cram_free_slice(s);
3487
0
                c->slice = NULL;
3488
0
                return NULL;
3489
0
            }
3490
3491
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3492
0
                fd->eof = 1;
3493
0
                cram_free_slice(s);
3494
0
                c->slice = NULL;
3495
0
                return NULL;
3496
0
            }
3497
3498
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3499
0
                s->curr_rec++;
3500
0
                continue;
3501
0
            }
3502
0
        }
3503
3504
0
        break;
3505
0
    }
3506
3507
0
    fd->ctr = c;
3508
0
    c->slice = s;
3509
0
    return &s->crecs[s->curr_rec++];
3510
8.48k
}
3511
3512
/*
3513
 * Read the next cram record and convert it to a bam_seq_t struct.
3514
 *
3515
 * Returns >= 0 success (number of bytes written to *bam)
3516
 *        -1 on EOF or failure (check fd->err)
3517
 */
3518
8.48k
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3519
8.48k
    cram_record *cr;
3520
8.48k
    cram_container *c;
3521
8.48k
    cram_slice *s;
3522
3523
8.48k
    if (!(cr = cram_get_seq(fd)))
3524
8.48k
        return -1;
3525
3526
0
    c = fd->ctr;
3527
0
    s = c->slice;
3528
3529
0
    return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
3530
8.48k
}
3531
3532
/*
3533
 * Drains and frees the decode read-queue for a multi-threaded reader.
3534
 */
3535
11.3k
void cram_drain_rqueue(cram_fd *fd) {
3536
11.3k
    cram_container *lc = NULL;
3537
3538
11.3k
    if (!fd->pool || !fd->rqueue)
3539
11.3k
        return;
3540
3541
    // drain queue of any in-flight decode jobs
3542
0
    while (!hts_tpool_process_empty(fd->rqueue)) {
3543
0
        hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
3544
0
        if (!r)
3545
0
            break;
3546
0
        cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
3547
0
        if (j->c->slice == j->s)
3548
0
            j->c->slice = NULL;
3549
0
        if (j->c != lc) {
3550
0
            if (lc) {
3551
0
                if (fd->ctr == lc)
3552
0
                    fd->ctr = NULL;
3553
0
                if (fd->ctr_mt == lc)
3554
0
                    fd->ctr_mt = NULL;
3555
0
                cram_free_container(lc);
3556
0
            }
3557
0
            lc = j->c;
3558
0
        }
3559
0
        cram_free_slice(j->s);
3560
0
        hts_tpool_delete_result(r, 1);
3561
0
    }
3562
3563
    // Also tidy up any pending decode job that we didn't submit to the workers
3564
    // due to the input queue being full.
3565
0
    if (fd->job_pending) {
3566
0
        cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3567
0
        if (j->c->slice == j->s)
3568
0
            j->c->slice = NULL;
3569
0
        if (j->c != lc) {
3570
0
            if (lc) {
3571
0
                if (fd->ctr == lc)
3572
0
                    fd->ctr = NULL;
3573
0
                if (fd->ctr_mt == lc)
3574
0
                    fd->ctr_mt = NULL;
3575
0
                cram_free_container(lc);
3576
0
            }
3577
0
            lc = j->c;
3578
0
        }
3579
0
        cram_free_slice(j->s);
3580
0
        free(j);
3581
0
        fd->job_pending = NULL;
3582
0
    }
3583
3584
0
    if (lc) {
3585
0
        if (fd->ctr == lc)
3586
0
            fd->ctr = NULL;
3587
0
        if (fd->ctr_mt == lc)
3588
0
            fd->ctr_mt = NULL;
3589
0
        cram_free_container(lc);
3590
0
    }
3591
0
}