Coverage Report

Created: 2025-09-27 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_decode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*
32
 * - In-memory decoding of CRAM data structures.
33
 * - Iterator for reading CRAM record by record.
34
 */
35
36
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
37
#include <config.h>
38
39
#include <stdio.h>
40
#include <errno.h>
41
#include <assert.h>
42
#include <stdlib.h>
43
#include <string.h>
44
#include <zlib.h>
45
#include <sys/types.h>
46
#include <sys/stat.h>
47
#include <math.h>
48
#include <stdint.h>
49
#include <inttypes.h>
50
51
#include "cram.h"
52
#include "os.h"
53
#include "../htslib/hts.h"
54
55
//Whether CIGAR has just M or uses = and X to indicate match and mismatch
56
//#define USE_X
57
58
/* ----------------------------------------------------------------------
59
 * CRAM compression headers
60
 */
61
62
/*
63
 * Decodes the Tag Dictionary record in the preservation map
64
 * Updates the cram compression header.
65
 *
66
 * Returns number of bytes decoded on success
67
 *        -1 on failure
68
 */
69
int cram_decode_TD(cram_fd *fd, char *cp, const char *endp,
70
376
                   cram_block_compression_hdr *h) {
71
376
    char *op = cp;
72
376
    unsigned char *dat;
73
376
    cram_block *b;
74
376
    int32_t blk_size = 0;
75
376
    int nTL, i, sz, err = 0;
76
77
376
    if (!(b = cram_new_block(0, 0)))
78
0
        return -1;
79
80
376
    if (h->TD_blk || h->TL) {
81
210
        hts_log_warning("More than one TD block found in compression header");
82
210
        cram_free_block(h->TD_blk);
83
210
        free(h->TL);
84
210
        h->TD_blk = NULL;
85
210
        h->TL = NULL;
86
210
    }
87
88
    /* Decode */
89
376
    blk_size = fd->vv.varint_get32(&cp, endp, &err);
90
376
    if (!blk_size) {
91
85
        h->nTL = 0;
92
85
        cram_free_block(b);
93
85
        return cp - op;
94
85
    }
95
96
291
    if (err || blk_size < 0 || endp - cp < blk_size) {
97
5
        cram_free_block(b);
98
5
        return -1;
99
5
    }
100
101
286
    BLOCK_APPEND(b, cp, blk_size);
102
286
    cp += blk_size;
103
286
    sz = cp - op;
104
    // Force nul termination if missing
105
286
    if (BLOCK_DATA(b)[BLOCK_SIZE(b)-1])
106
236
        BLOCK_APPEND_CHAR(b, '\0');
107
108
    /* Set up TL lookup table */
109
286
    dat = BLOCK_DATA(b);
110
111
    // Count
112
6.32k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
113
6.03k
        nTL++;
114
32.6k
        while (dat[i])
115
26.6k
            i++;
116
6.03k
    }
117
118
    // Copy
119
286
    if (!(h->TL = calloc(nTL, sizeof(*h->TL)))) {
120
0
        cram_free_block(b);
121
0
        return -1;
122
0
    }
123
6.32k
    for (nTL = i = 0; i < BLOCK_SIZE(b); i++) {
124
6.03k
        h->TL[nTL++] = &dat[i];
125
32.6k
        while (dat[i])
126
26.6k
            i++;
127
6.03k
    }
128
286
    h->TD_blk = b;
129
286
    h->nTL = nTL;
130
131
286
    return sz;
132
133
0
 block_err:
134
0
    cram_free_block(b);
135
0
    return -1;
136
286
}
137
138
/*
139
 * Decodes a CRAM block compression header.
140
 * Returns header ptr on success
141
 *         NULL on failure
142
 */
143
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd,
144
1.04k
                                                           cram_block *b) {
145
1.04k
    char *cp, *endp, *cp_copy;
146
1.04k
    cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr));
147
1.04k
    int i, err = 0;
148
1.04k
    int32_t map_size = 0, map_count = 0;
149
150
1.04k
    if (!hdr)
151
0
        return NULL;
152
153
1.04k
    if (b->method != RAW) {
154
392
        if (cram_uncompress_block(b)) {
155
364
            free(hdr);
156
364
            return NULL;
157
364
        }
158
392
    }
159
160
680
    cp = (char *)b->data;
161
680
    endp = cp + b->uncomp_size;
162
163
680
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
164
680
        hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err);
165
680
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
166
0
            hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err);
167
0
            hdr->ref_seq_span  = fd->vv.varint_get64(&cp, endp, &err);
168
680
        } else {
169
680
            hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err);
170
680
            hdr->ref_seq_span  = fd->vv.varint_get32(&cp, endp, &err);
171
680
        }
172
680
        hdr->num_records   = fd->vv.varint_get32(&cp, endp, &err);
173
680
        hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err);
174
680
        if (hdr->num_landmarks < 0 ||
175
675
            hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) ||
176
675
            endp - cp < hdr->num_landmarks) {
177
7
            free(hdr);
178
7
            return NULL;
179
7
        }
180
673
        if (!(hdr->landmark = malloc(hdr->num_landmarks * sizeof(int32_t)))) {
181
0
            free(hdr);
182
0
            return NULL;
183
0
        }
184
67.5k
        for (i = 0; i < hdr->num_landmarks; i++)
185
66.8k
            hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);;
186
673
    }
187
188
673
    hdr->preservation_map = kh_init(map);
189
190
673
    memset(hdr->rec_encoding_map, 0,
191
673
           CRAM_MAP_HASH * sizeof(hdr->rec_encoding_map[0]));
192
673
    memset(hdr->tag_encoding_map, 0,
193
673
           CRAM_MAP_HASH * sizeof(hdr->tag_encoding_map[0]));
194
195
673
    if (!hdr->preservation_map) {
196
0
        cram_free_compression_header(hdr);
197
0
        return NULL;
198
0
    }
199
200
    /* Initialise defaults for preservation map */
201
673
    hdr->read_names_included = 0;
202
673
    hdr->AP_delta = 1;
203
673
    hdr->qs_seq_orient = 1;
204
673
    memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20);
205
206
    /* Preservation map */
207
673
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
208
673
    map_count = fd->vv.varint_get32(&cp, endp, &err);
209
69.7k
    for (i = 0; i < map_count; i++) {
210
69.2k
        pmap_t hd;
211
69.2k
        khint_t k;
212
69.2k
        int r;
213
214
69.2k
        if (endp - cp < 3) {
215
136
            cram_free_compression_header(hdr);
216
136
            return NULL;
217
136
        }
218
69.1k
        cp += 2;
219
69.1k
        switch(CRAM_KEY(cp[-2],cp[-1])) {
220
24
        case CRAM_KEY('M','I'): // was mapped QS included in V1.0
221
99
        case CRAM_KEY('U','I'): // was unmapped QS included in V1.0
222
117
        case CRAM_KEY('P','I'): // was unmapped placed in V1.0
223
117
            hd.i = *cp++;
224
117
            break;
225
226
89
        case CRAM_KEY('R','N'):
227
89
            hd.i = *cp++;
228
89
            k = kh_put(map, hdr->preservation_map, "RN", &r);
229
89
            if (-1 == r) {
230
0
                cram_free_compression_header(hdr);
231
0
                return NULL;
232
0
            }
233
234
89
            kh_val(hdr->preservation_map, k) = hd;
235
89
            hdr->read_names_included = hd.i;
236
89
            break;
237
238
99
        case CRAM_KEY('A','P'):
239
99
            hd.i = *cp++;
240
99
            k = kh_put(map, hdr->preservation_map, "AP", &r);
241
99
            if (-1 == r) {
242
0
                cram_free_compression_header(hdr);
243
0
                return NULL;
244
0
            }
245
246
99
            kh_val(hdr->preservation_map, k) = hd;
247
99
            hdr->AP_delta = hd.i;
248
99
            break;
249
250
5.18k
        case CRAM_KEY('R','R'):
251
5.18k
            hd.i = *cp++;
252
5.18k
            k = kh_put(map, hdr->preservation_map, "RR", &r);
253
5.18k
            if (-1 == r) {
254
0
                cram_free_compression_header(hdr);
255
0
                return NULL;
256
0
            }
257
258
5.18k
            kh_val(hdr->preservation_map, k) = hd;
259
5.18k
            hdr->no_ref = !hd.i;
260
5.18k
            break;
261
262
108
        case CRAM_KEY('Q','O'):
263
108
            hd.i = *cp++;
264
108
            k = kh_put(map, hdr->preservation_map, "QO", &r);
265
108
            if (-1 == r) {
266
0
                cram_free_compression_header(hdr);
267
0
                return NULL;
268
0
            }
269
270
108
            kh_val(hdr->preservation_map, k) = hd;
271
108
            hdr->qs_seq_orient = hd.i;
272
108
            break;
273
274
491
        case CRAM_KEY('S','M'):
275
491
            if (endp - cp < 5) {
276
0
                cram_free_compression_header(hdr);
277
0
                return NULL;
278
0
            }
279
491
            hdr->substitution_matrix[0][(cp[0]>>6)&3] = 'C';
280
491
            hdr->substitution_matrix[0][(cp[0]>>4)&3] = 'G';
281
491
            hdr->substitution_matrix[0][(cp[0]>>2)&3] = 'T';
282
491
            hdr->substitution_matrix[0][(cp[0]>>0)&3] = 'N';
283
284
491
            hdr->substitution_matrix[1][(cp[1]>>6)&3] = 'A';
285
491
            hdr->substitution_matrix[1][(cp[1]>>4)&3] = 'G';
286
491
            hdr->substitution_matrix[1][(cp[1]>>2)&3] = 'T';
287
491
            hdr->substitution_matrix[1][(cp[1]>>0)&3] = 'N';
288
289
491
            hdr->substitution_matrix[2][(cp[2]>>6)&3] = 'A';
290
491
            hdr->substitution_matrix[2][(cp[2]>>4)&3] = 'C';
291
491
            hdr->substitution_matrix[2][(cp[2]>>2)&3] = 'T';
292
491
            hdr->substitution_matrix[2][(cp[2]>>0)&3] = 'N';
293
294
491
            hdr->substitution_matrix[3][(cp[3]>>6)&3] = 'A';
295
491
            hdr->substitution_matrix[3][(cp[3]>>4)&3] = 'C';
296
491
            hdr->substitution_matrix[3][(cp[3]>>2)&3] = 'G';
297
491
            hdr->substitution_matrix[3][(cp[3]>>0)&3] = 'N';
298
299
491
            hdr->substitution_matrix[4][(cp[4]>>6)&3] = 'A';
300
491
            hdr->substitution_matrix[4][(cp[4]>>4)&3] = 'C';
301
491
            hdr->substitution_matrix[4][(cp[4]>>2)&3] = 'G';
302
491
            hdr->substitution_matrix[4][(cp[4]>>0)&3] = 'T';
303
304
491
            hd.p = cp;
305
491
            cp += 5;
306
307
491
            k = kh_put(map, hdr->preservation_map, "SM", &r);
308
491
            if (-1 == r) {
309
0
                cram_free_compression_header(hdr);
310
0
                return NULL;
311
0
            }
312
491
            kh_val(hdr->preservation_map, k) = hd;
313
491
            break;
314
315
376
        case CRAM_KEY('T','D'): {
316
376
            int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary
317
376
            if (sz < 0) {
318
5
                cram_free_compression_header(hdr);
319
5
                return NULL;
320
5
            }
321
322
371
            hd.p = cp;
323
371
            cp += sz;
324
325
371
            k = kh_put(map, hdr->preservation_map, "TD", &r);
326
371
            if (-1 == r) {
327
0
                cram_free_compression_header(hdr);
328
0
                return NULL;
329
0
            }
330
371
            kh_val(hdr->preservation_map, k) = hd;
331
371
            break;
332
371
        }
333
334
62.6k
        default:
335
62.6k
            hts_log_warning("Unrecognised preservation map key %c%c", cp[-2], cp[-1]);
336
            // guess byte;
337
62.6k
            cp++;
338
62.6k
            break;
339
69.1k
        }
340
69.1k
    }
341
532
    if (cp - cp_copy != map_size) {
342
26
        cram_free_compression_header(hdr);
343
26
        return NULL;
344
26
    }
345
346
    /* Record encoding map */
347
506
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
348
506
    map_count = fd->vv.varint_get32(&cp, endp, &err);
349
506
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
350
32.4k
    for (i = 0; i < map_count; i++) {
351
32.3k
        char *key = cp;
352
32.3k
        int32_t encoding = E_NULL;
353
32.3k
        int32_t size = 0;
354
32.3k
        ptrdiff_t offset;
355
32.3k
        cram_map *m;
356
32.3k
        enum cram_DS_ID ds_id;
357
32.3k
        enum cram_external_type type;
358
359
32.3k
        if (endp - cp < 4) {
360
31
            cram_free_compression_header(hdr);
361
31
            return NULL;
362
31
        }
363
364
32.3k
        cp += 2;
365
32.3k
        encoding = fd->vv.varint_get32(&cp, endp, &err);
366
32.3k
        size     = fd->vv.varint_get32(&cp, endp, &err);
367
368
32.3k
        offset = cp - (char *)b->data;
369
370
32.3k
        if (encoding == E_NULL)
371
11.0k
            continue;
372
373
21.2k
        if (size < 0 || endp - cp < size) {
374
160
            cram_free_compression_header(hdr);
375
160
            return NULL;
376
160
        }
377
378
        //printf("%s codes for %.2s\n", cram_encoding2str(encoding), key);
379
380
        /*
381
         * For CRAM1.0 CF and BF are Byte and not Int.
382
         * Practically speaking it makes no difference unless we have a
383
         * 1.0 format file that stores these in EXTERNAL as only then
384
         * does Byte vs Int matter.
385
         *
386
         * Neither this C code nor Java reference implementations did this,
387
         * so we gloss over it and treat them as int.
388
         */
389
21.1k
        ds_id = DS_CORE;
390
21.1k
        if (key[0] == 'B' && key[1] == 'F') {
391
53
            ds_id = DS_BF; type = E_INT;
392
21.0k
        } else if (key[0] == 'C' && key[1] == 'F') {
393
220
            ds_id = DS_CF; type = E_INT;
394
20.8k
        } else if (key[0] == 'R' && key[1] == 'I') {
395
138
            ds_id = DS_RI; type = E_INT;
396
20.7k
        } else if (key[0] == 'R' && key[1] == 'L') {
397
17
            ds_id = DS_RL; type = E_INT;
398
20.6k
        } else if (key[0] == 'A' && key[1] == 'P') {
399
321
            ds_id = DS_AP;
400
321
            type = is_v4 ? E_SLONG : E_INT;
401
20.3k
        } else if (key[0] == 'R' && key[1] == 'G') {
402
6
            ds_id = DS_RG;
403
6
            type = E_INT;
404
20.3k
        } else if (key[0] == 'M' && key[1] == 'F') {
405
42
            ds_id = DS_MF; type = E_INT;
406
20.3k
        } else if (key[0] == 'N' && key[1] == 'S') {
407
143
            ds_id = DS_NS; type = E_INT;
408
20.1k
        } else if (key[0] == 'N' && key[1] == 'P') {
409
569
            ds_id = DS_NP;
410
569
            type = is_v4 ? E_LONG : E_INT;
411
19.6k
        } else if (key[0] == 'T' && key[1] == 'S') {
412
71
            ds_id = DS_TS;
413
71
            type = is_v4 ? E_SLONG : E_INT;
414
19.5k
        } else if (key[0] == 'N' && key[1] == 'F') {
415
125
            ds_id = DS_NF; type = E_INT;
416
19.4k
        } else if (key[0] == 'T' && key[1] == 'C') {
417
194
            ds_id = DS_TC; type = E_BYTE;
418
19.2k
        } else if (key[0] == 'T' && key[1] == 'N') {
419
0
            ds_id = DS_TN; type = E_INT;
420
19.2k
        } else if (key[0] == 'F' && key[1] == 'N') {
421
0
            ds_id = DS_FN; type = E_INT;
422
19.2k
        } else if (key[0] == 'F' && key[1] == 'C') {
423
192
            ds_id = DS_FC; type = E_BYTE;
424
19.0k
        } else if (key[0] == 'F' && key[1] == 'P') {
425
0
            ds_id = DS_FP; type = E_INT;
426
19.0k
        } else if (key[0] == 'B' && key[1] == 'S') {
427
6
            ds_id = DS_BS; type = E_BYTE;
428
19.0k
        } else if (key[0] == 'I' && key[1] == 'N') {
429
11
            ds_id = DS_IN; type = E_BYTE_ARRAY;
430
19.0k
        } else if (key[0] == 'S' && key[1] == 'C') {
431
11
            ds_id = DS_SC; type = E_BYTE_ARRAY;
432
19.0k
        } else if (key[0] == 'D' && key[1] == 'L') {
433
104
            ds_id = DS_DL; type = E_INT;
434
18.8k
        } else if (key[0] == 'B' && key[1] == 'A') {
435
15
            ds_id = DS_BA; type = E_BYTE;
436
18.8k
        } else if (key[0] == 'B' && key[1] == 'B') {
437
15
            ds_id = DS_BB; type = E_BYTE_ARRAY;
438
18.8k
        } else if (key[0] == 'R' && key[1] == 'S') {
439
356
            ds_id = DS_RS; type = E_INT;
440
18.5k
        } else if (key[0] == 'P' && key[1] == 'D') {
441
0
            ds_id = DS_PD; type = E_INT;
442
18.5k
        } else if (key[0] == 'H' && key[1] == 'C') {
443
5
            ds_id = DS_HC; type = E_INT;
444
18.5k
        } else if (key[0] == 'M' && key[1] == 'Q') {
445
1.20k
            ds_id = DS_MQ; type = E_INT;
446
17.2k
        } else if (key[0] == 'R' && key[1] == 'N') {
447
18
            ds_id = DS_RN; type = E_BYTE_ARRAY_BLOCK;
448
17.2k
        } else if (key[0] == 'Q' && key[1] == 'S') {
449
294
            ds_id = DS_QS; type = E_BYTE;
450
16.9k
        } else if (key[0] == 'Q' && key[1] == 'Q') {
451
5
            ds_id = DS_QQ; type = E_BYTE_ARRAY;
452
16.9k
        } else if (key[0] == 'T' && key[1] == 'L') {
453
6
            ds_id = DS_TL; type = E_INT;
454
16.9k
        } else if (key[0] == 'T' && key[1] == 'M') {
455
16.9k
        } else if (key[0] == 'T' && key[1] == 'V') {
456
16.9k
        } else {
457
16.9k
            hts_log_warning("Unrecognised key: %.2s", key);
458
16.9k
        }
459
460
21.1k
        if (ds_id != DS_CORE) {
461
4.14k
            if (hdr->codecs[ds_id] != NULL) {
462
3.46k
                hts_log_warning("Codec for key %.2s defined more than once",
463
3.46k
                                key);
464
3.46k
                hdr->codecs[ds_id]->free(hdr->codecs[ds_id]);
465
3.46k
            }
466
4.14k
            hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size,
467
4.14k
                                                   type, fd->version, &fd->vv);
468
4.14k
            if (!hdr->codecs[ds_id]) {
469
170
                cram_free_compression_header(hdr);
470
170
                return NULL;
471
170
            }
472
4.14k
        }
473
474
20.9k
        cp += size;
475
476
        // Fill out cram_map purely for cram_dump to dump out.
477
20.9k
        m = malloc(sizeof(*m));
478
20.9k
        if (!m) {
479
0
            cram_free_compression_header(hdr);
480
0
            return NULL;
481
0
        }
482
20.9k
        m->key = CRAM_KEY(key[0], key[1]);
483
20.9k
        m->encoding = encoding;
484
20.9k
        m->size     = size;
485
20.9k
        m->offset   = offset;
486
20.9k
        m->codec = NULL;
487
488
20.9k
        m->next = hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])];
489
20.9k
        hdr->rec_encoding_map[CRAM_MAP(key[0], key[1])] = m;
490
20.9k
    }
491
145
    if (cp - cp_copy != map_size) {
492
7
        cram_free_compression_header(hdr);
493
7
        return NULL;
494
7
    }
495
496
    /* Tag encoding map */
497
138
    map_size  = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp;
498
138
    map_count = fd->vv.varint_get32(&cp, endp, &err);
499
1.04k
    for (i = 0; i < map_count; i++) {
500
932
        int32_t encoding = E_NULL;
501
932
        int32_t size = 0;
502
932
        cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc
503
932
        uint8_t key[3];
504
505
932
        if (!m || endp - cp < 6) {
506
0
            free(m);
507
0
            cram_free_compression_header(hdr);
508
0
            return NULL;
509
0
        }
510
511
932
        m->key = fd->vv.varint_get32(&cp, endp, &err);
512
932
        key[0] = m->key>>16;
513
932
        key[1] = m->key>>8;
514
932
        key[2] = m->key;
515
932
        encoding = fd->vv.varint_get32(&cp, endp, &err);
516
932
        size     = fd->vv.varint_get32(&cp, endp, &err);
517
518
932
        m->encoding = encoding;
519
932
        m->size     = size;
520
932
        m->offset   = cp - (char *)b->data;
521
932
        if (size < 0 || endp - cp < size ||
522
932
            !(m->codec = cram_decoder_init(hdr, encoding, cp, size,
523
932
                                           E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) {
524
29
            cram_free_compression_header(hdr);
525
29
            free(m);
526
29
            return NULL;
527
29
        }
528
529
903
        cp += size;
530
531
903
        m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])];
532
903
        hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m;
533
903
    }
534
109
    if (err || cp - cp_copy != map_size) {
535
22
        cram_free_compression_header(hdr);
536
22
        return NULL;
537
22
    }
538
539
87
    return hdr;
540
109
}
541
542
/*
543
 * Note we also need to scan through the record encoding map to
544
 * see which data series share the same block, either external or
545
 * CORE. For example if we need the BF data series but MQ and CF
546
 * are also encoded in the same block then we need to add those in
547
 * as a dependency in order to correctly decode BF.
548
 *
549
 * Returns 0 on success
550
 *        -1 on failure
551
 */
552
int cram_dependent_data_series(cram_fd *fd,
553
                               cram_block_compression_hdr *hdr,
554
570
                               cram_slice *s) {
555
570
    int *block_used;
556
570
    int core_used = 0;
557
570
    int i;
558
570
    static int i_to_id[] = {
559
570
        DS_BF, DS_AP, DS_FP, DS_RL, DS_DL, DS_NF, DS_BA, DS_QS,
560
570
        DS_FC, DS_FN, DS_BS, DS_IN, DS_RG, DS_MQ, DS_TL, DS_RN,
561
570
        DS_NS, DS_NP, DS_TS, DS_MF, DS_CF, DS_RI, DS_RS, DS_PD,
562
570
        DS_HC, DS_SC, DS_BB, DS_QQ,
563
570
    };
564
570
    uint32_t orig_ds;
565
566
    /*
567
     * Set the data_series bit field based on fd->required_fields
568
     * contents.
569
     */
570
570
    if (fd->required_fields && fd->required_fields != INT_MAX) {
571
0
        s->data_series = 0;
572
573
0
        if (fd->required_fields & SAM_QNAME)
574
0
            s->data_series |= CRAM_RN;
575
576
0
        if (fd->required_fields & SAM_FLAG)
577
0
            s->data_series |= CRAM_BF;
578
579
0
        if (fd->required_fields & SAM_RNAME)
580
0
            s->data_series |= CRAM_RI | CRAM_BF;
581
582
0
        if (fd->required_fields & SAM_POS)
583
0
            s->data_series |= CRAM_AP | CRAM_BF;
584
585
0
        if (fd->required_fields & SAM_MAPQ)
586
0
            s->data_series |= CRAM_MQ;
587
588
0
        if (fd->required_fields & SAM_CIGAR)
589
0
            s->data_series |= CRAM_CIGAR;
590
591
0
        if (fd->required_fields & SAM_RNEXT)
592
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_RI | CRAM_NS |CRAM_BF;
593
594
0
        if (fd->required_fields & SAM_PNEXT)
595
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_NP | CRAM_BF;
596
597
0
        if (fd->required_fields & SAM_TLEN)
598
0
            s->data_series |= CRAM_CF | CRAM_NF | CRAM_AP | CRAM_TS |
599
0
                CRAM_BF | CRAM_MF | CRAM_RI | CRAM_CIGAR;
600
601
0
        if (fd->required_fields & SAM_SEQ)
602
0
            s->data_series |= CRAM_SEQ;
603
604
0
        if (!(fd->required_fields & SAM_AUX))
605
            // No easy way to get MD/NM without other tags at present
606
0
            s->decode_md = 0;
607
608
0
        if (fd->required_fields & SAM_QUAL)
609
0
            s->data_series |= CRAM_QUAL;
610
611
0
        if (fd->required_fields & SAM_AUX)
612
0
            s->data_series |= CRAM_RG | CRAM_TL | CRAM_aux;
613
614
0
        if (fd->required_fields & SAM_RGAUX)
615
0
            s->data_series |= CRAM_RG | CRAM_BF;
616
617
        // Always uncompress CORE block
618
0
        if (cram_uncompress_block(s->block[0]))
619
0
            return -1;
620
570
    } else {
621
570
        s->data_series = CRAM_ALL;
622
623
2.81k
        for (i = 0; i < s->hdr->num_blocks; i++) {
624
2.25k
            if (cram_uncompress_block(s->block[i]))
625
9
                return -1;
626
2.25k
        }
627
628
561
        return 0;
629
570
    }
630
631
0
    block_used = calloc(s->hdr->num_blocks+1, sizeof(int));
632
0
    if (!block_used)
633
0
        return -1;
634
635
0
    do {
636
        /*
637
         * Also set data_series based on code prerequisites. Eg if we need
638
         * CRAM_QS then we also need to know CRAM_RL so we know how long it
639
         * is, or if we need FC/FP then we also need FN (number of features).
640
         *
641
         * It's not reciprocal though. We may be needing to decode FN
642
         * but have no need to decode FC, FP and cigar ops.
643
         */
644
0
        if (s->data_series & CRAM_RS)    s->data_series |= CRAM_FC|CRAM_FP;
645
0
        if (s->data_series & CRAM_PD)    s->data_series |= CRAM_FC|CRAM_FP;
646
0
        if (s->data_series & CRAM_HC)    s->data_series |= CRAM_FC|CRAM_FP;
647
0
        if (s->data_series & CRAM_QS)    s->data_series |= CRAM_FC|CRAM_FP;
648
0
        if (s->data_series & CRAM_IN)    s->data_series |= CRAM_FC|CRAM_FP;
649
0
        if (s->data_series & CRAM_SC)    s->data_series |= CRAM_FC|CRAM_FP;
650
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_FC|CRAM_FP;
651
0
        if (s->data_series & CRAM_DL)    s->data_series |= CRAM_FC|CRAM_FP;
652
0
        if (s->data_series & CRAM_BA)    s->data_series |= CRAM_FC|CRAM_FP;
653
0
        if (s->data_series & CRAM_BB)    s->data_series |= CRAM_FC|CRAM_FP;
654
0
        if (s->data_series & CRAM_QQ)    s->data_series |= CRAM_FC|CRAM_FP;
655
656
        // cram_decode_seq() needs seq[] array
657
0
        if (s->data_series & (CRAM_SEQ|CRAM_CIGAR)) s->data_series |= CRAM_RL;
658
659
0
        if (s->data_series & CRAM_FP)    s->data_series |= CRAM_FC;
660
0
        if (s->data_series & CRAM_FC)    s->data_series |= CRAM_FN;
661
0
        if (s->data_series & CRAM_aux)   s->data_series |= CRAM_TL;
662
0
        if (s->data_series & CRAM_MF)    s->data_series |= CRAM_CF;
663
0
        if (s->data_series & CRAM_MQ)    s->data_series |= CRAM_BF;
664
0
        if (s->data_series & CRAM_BS)    s->data_series |= CRAM_RI;
665
0
        if (s->data_series & (CRAM_MF |CRAM_NS |CRAM_NP |CRAM_TS |CRAM_NF))
666
0
            s->data_series |= CRAM_CF;
667
0
        if (!hdr->read_names_included && s->data_series & CRAM_RN)
668
0
            s->data_series |= CRAM_CF | CRAM_NF;
669
0
        if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ))
670
0
            s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL;
671
0
        if (s->data_series & CRAM_FN) {
672
            // The CRAM_FN loop checks for reference length boundaries,
673
            // which needs a working seq_pos.  Some fields are fixed size
674
            // irrespective of if we decode (BS), but others need to know
675
            // the size of the string fetched back (SC, IN, BB).
676
0
            s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB;
677
0
        }
678
679
0
        orig_ds = s->data_series;
680
681
        // Find which blocks are in use.
682
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
683
0
            int bnum1, bnum2, j;
684
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
685
686
0
            if (!(s->data_series & (1<<i)))
687
0
                continue;
688
689
0
            if (!c)
690
0
                continue;
691
692
0
            bnum1 = cram_codec_to_id(c, &bnum2);
693
694
0
            for (;;) {
695
0
                switch (bnum1) {
696
0
                case -2:
697
0
                    break;
698
699
0
                case -1:
700
0
                    core_used = 1;
701
0
                    break;
702
703
0
                default:
704
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
705
0
                        if (s->block[j]->content_type == EXTERNAL &&
706
0
                            s->block[j]->content_id == bnum1) {
707
0
                            block_used[j] = 1;
708
0
                            if (cram_uncompress_block(s->block[j])) {
709
0
                                free(block_used);
710
0
                                return -1;
711
0
                            }
712
0
                        }
713
0
                    }
714
0
                    break;
715
0
                }
716
717
0
                if (bnum2 == -2 || bnum1 == bnum2)
718
0
                    break;
719
720
0
                bnum1 = bnum2; // 2nd pass
721
0
            }
722
0
        }
723
724
        // Tags too
725
0
        if ((fd->required_fields & SAM_AUX) ||
726
0
            (s->data_series & CRAM_aux)) {
727
0
            for (i = 0; i < CRAM_MAP_HASH; i++) {
728
0
                int bnum1, bnum2, j;
729
0
                cram_map *m = hdr->tag_encoding_map[i];
730
731
0
                while (m) {
732
0
                    cram_codec *c = m->codec;
733
0
                    if (!c) {
734
0
                        m = m->next;
735
0
                        continue;
736
0
                    }
737
738
0
                    bnum1 = cram_codec_to_id(c, &bnum2);
739
740
0
                    for (;;) {
741
0
                        switch (bnum1) {
742
0
                        case -2:
743
0
                            break;
744
745
0
                        case -1:
746
0
                            core_used = 1;
747
0
                            break;
748
749
0
                        default:
750
0
                            for (j = 0; j < s->hdr->num_blocks; j++) {
751
0
                                if (s->block[j]->content_type == EXTERNAL &&
752
0
                                    s->block[j]->content_id == bnum1) {
753
0
                                    block_used[j] = 1;
754
0
                                    if (cram_uncompress_block(s->block[j])) {
755
0
                                        free(block_used);
756
0
                                        return -1;
757
0
                                    }
758
0
                                }
759
0
                            }
760
0
                            break;
761
0
                        }
762
763
0
                        if (bnum2 == -2 || bnum1 == bnum2)
764
0
                            break;
765
766
0
                        bnum1 = bnum2; // 2nd pass
767
0
                    }
768
769
0
                    m = m->next;
770
0
                }
771
0
            }
772
0
        }
773
774
        // We now know which blocks are in used, so repeat and find
775
        // which other data series need to be added.
776
0
        for (i = 0; i < sizeof(i_to_id)/sizeof(*i_to_id); i++) {
777
0
            int bnum1, bnum2, j;
778
0
            cram_codec *c = hdr->codecs[i_to_id[i]];
779
780
0
            if (!c)
781
0
                continue;
782
783
0
            bnum1 = cram_codec_to_id(c, &bnum2);
784
785
0
            for (;;) {
786
0
                switch (bnum1) {
787
0
                case -2:
788
0
                    break;
789
790
0
                case -1:
791
0
                    if (core_used) {
792
                        //printf(" + data series %08x:\n", 1<<i);
793
0
                        s->data_series |= 1<<i;
794
0
                    }
795
0
                    break;
796
797
0
                default:
798
0
                    for (j = 0; j < s->hdr->num_blocks; j++) {
799
0
                        if (s->block[j]->content_type == EXTERNAL &&
800
0
                            s->block[j]->content_id == bnum1) {
801
0
                            if (block_used[j]) {
802
                                //printf(" + data series %08x:\n", 1<<i);
803
0
                                s->data_series |= 1<<i;
804
0
                            }
805
0
                        }
806
0
                    }
807
0
                    break;
808
0
                }
809
810
0
                if (bnum2 == -2 || bnum1 == bnum2)
811
0
                    break;
812
813
0
                bnum1 = bnum2; // 2nd pass
814
0
            }
815
0
        }
816
817
        // Tags too
818
0
        for (i = 0; i < CRAM_MAP_HASH; i++) {
819
0
            int bnum1, bnum2, j;
820
0
            cram_map *m = hdr->tag_encoding_map[i];
821
822
0
            while (m) {
823
0
                cram_codec *c = m->codec;
824
0
                if (!c) {
825
0
                    m = m->next;
826
0
                    continue;
827
0
                }
828
829
0
                bnum1 = cram_codec_to_id(c, &bnum2);
830
831
0
                for (;;) {
832
0
                    switch (bnum1) {
833
0
                    case -2:
834
0
                        break;
835
836
0
                    case -1:
837
                        //printf(" + data series %08x:\n", CRAM_aux);
838
0
                        s->data_series |= CRAM_aux;
839
0
                        break;
840
841
0
                    default:
842
0
                        for (j = 0; j < s->hdr->num_blocks; j++) {
843
0
                            if (s->block[j]->content_type == EXTERNAL &&
844
0
                                s->block[j]->content_id == bnum1) {
845
0
                                if (block_used[j]) {
846
                                    //printf(" + data series %08x:\n",
847
                                    //       CRAM_aux);
848
0
                                    s->data_series |= CRAM_aux;
849
0
                                }
850
0
                            }
851
0
                        }
852
0
                        break;
853
0
                    }
854
855
0
                    if (bnum2 == -2 || bnum1 == bnum2)
856
0
                        break;
857
858
0
                    bnum1 = bnum2; // 2nd pass
859
0
                }
860
861
0
                m = m->next;
862
0
            }
863
0
        }
864
0
    } while (orig_ds != s->data_series);
865
866
0
    free(block_used);
867
0
    return 0;
868
0
}
869
870
/*
871
 * Checks whether an external block is used solely by a single data series.
872
 * Returns the codec type if so (EXTERNAL, BYTE_ARRAY_LEN, BYTE_ARRAY_STOP)
873
 *         or 0 if not (E_NULL).
874
 */
875
static int cram_ds_unique(cram_block_compression_hdr *hdr, cram_codec *c,
876
0
                          int id) {
877
0
    int i, n_id = 0;
878
0
    enum cram_encoding e_type = 0;
879
880
0
    for (i = 0; i < DS_END; i++) {
881
0
        cram_codec *c;
882
0
        int bnum1, bnum2, old_n_id;
883
884
0
        if (!(c = hdr->codecs[i]))
885
0
            continue;
886
887
0
        bnum1 = cram_codec_to_id(c, &bnum2);
888
889
0
        old_n_id = n_id;
890
0
        if (bnum1 == id) {
891
0
            n_id++;
892
0
            e_type = c->codec;
893
0
        }
894
0
        if (bnum2 == id) {
895
0
            n_id++;
896
0
            e_type = c->codec;
897
0
        }
898
899
0
        if (n_id == old_n_id+2)
900
0
            n_id--; // len/val in same place counts once only.
901
0
    }
902
903
0
    return n_id == 1 ? e_type : 0;
904
0
}
905
906
/*
907
 * Attempts to estimate the size of some blocks so we can preallocate them
908
 * before decoding.  Although decoding will automatically grow the blocks,
909
 * it is typically more efficient to preallocate.
910
 */
911
void cram_decode_estimate_sizes(cram_block_compression_hdr *hdr, cram_slice *s,
912
                                int *qual_size, int *name_size,
913
0
                                int *q_id) {
914
0
    int bnum1, bnum2;
915
0
    cram_codec *cd;
916
917
0
    *qual_size = 0;
918
0
    *name_size = 0;
919
920
    /* Qual */
921
0
    cd = hdr->codecs[DS_QS];
922
0
    if (cd == NULL) return;
923
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
924
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
925
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
926
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
927
0
        if (b) *qual_size = b->uncomp_size;
928
0
        if (q_id && cd->codec == E_EXTERNAL)
929
0
            *q_id = bnum1;
930
0
    }
931
932
    /* Name */
933
0
    cd = hdr->codecs[DS_RN];
934
0
    if (cd == NULL) return;
935
0
    bnum1 = cram_codec_to_id(cd, &bnum2);
936
0
    if (bnum1 < 0 && bnum2 >= 0) bnum1 = bnum2;
937
0
    if (cram_ds_unique(hdr, cd, bnum1)) {
938
0
        cram_block *b = cram_get_block_by_id(s, bnum1);
939
0
        if (b) *name_size = b->uncomp_size;
940
0
    }
941
0
}
942
943
944
/* ----------------------------------------------------------------------
945
 * CRAM slices
946
 */
947
948
/*
949
 * Decodes a CRAM (un)mapped slice header block.
950
 * Returns slice header ptr on success
951
 *         NULL on failure
952
 */
953
585
cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) {
954
585
    cram_block_slice_hdr *hdr;
955
585
    unsigned char *cp;
956
585
    unsigned char *cp_end;
957
585
    int i, err = 0;
958
959
585
    if (b->method != RAW) {
960
        /* Spec. says slice header should be RAW, but we can future-proof
961
           by trying to decode it if it isn't. */
962
0
        if (cram_uncompress_block(b) < 0)
963
0
            return NULL;
964
0
    }
965
585
    cp =  (unsigned char *)BLOCK_DATA(b);
966
585
    cp_end = cp + b->uncomp_size;
967
968
585
    if (b->content_type != MAPPED_SLICE &&
969
0
        b->content_type != UNMAPPED_SLICE)
970
0
        return NULL;
971
972
585
    if (!(hdr  = calloc(1, sizeof(*hdr))))
973
0
        return NULL;
974
975
585
    hdr->content_type = b->content_type;
976
977
585
    if (b->content_type == MAPPED_SLICE) {
978
585
        hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err);
979
585
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
980
0
            hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
981
0
            hdr->ref_seq_span  = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
982
585
        } else {
983
585
            hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
984
585
            hdr->ref_seq_span  = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
985
585
        }
986
585
        if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) {
987
0
            free(hdr);
988
0
            hts_log_error("Negative values not permitted for header "
989
0
                          "sequence start or span fields");
990
0
            return NULL;
991
0
        }
992
585
    }
993
585
    hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
994
585
    hdr->record_counter = 0;
995
585
    if (CRAM_MAJOR_VERS(fd->version) == 2) {
996
0
        hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
997
585
    } else if (CRAM_MAJOR_VERS(fd->version) >= 3) {
998
0
        hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err);
999
0
    }
1000
585
    hdr->num_blocks      = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1001
585
    hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err);
1002
585
    if (hdr->num_content_ids < 1 ||
1003
585
        hdr->num_content_ids >= 10000) {
1004
        // Slice must have at least one data block, and there is no need
1005
        // for more than 2 per possible aux-tag plus ancillary.
1006
0
        free(hdr);
1007
0
        return NULL;
1008
0
    }
1009
585
    hdr->block_content_ids = malloc(hdr->num_content_ids * sizeof(int32_t));
1010
585
    if (!hdr->block_content_ids) {
1011
0
        free(hdr);
1012
0
        return NULL;
1013
0
    }
1014
1015
11.9k
    for (i = 0; i < hdr->num_content_ids; i++)
1016
11.3k
        hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp,
1017
11.3k
                                                         (char *)cp_end,
1018
11.3k
                                                         &err);
1019
585
    if (err) {
1020
5
        free(hdr->block_content_ids);
1021
5
        free(hdr);
1022
5
        return NULL;
1023
5
    }
1024
1025
580
    if (b->content_type == MAPPED_SLICE)
1026
580
        hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err);
1027
1028
580
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
1029
0
        if (cp_end - cp < 16) {
1030
0
            free(hdr->block_content_ids);
1031
0
            free(hdr);
1032
0
            return NULL;
1033
0
        }
1034
0
        memcpy(hdr->md5, cp, 16);
1035
580
    } else {
1036
580
        memset(hdr->md5, 0, 16);
1037
580
    }
1038
1039
580
    if (!err)
1040
577
        return hdr;
1041
1042
3
    free(hdr->block_content_ids);
1043
3
    free(hdr);
1044
3
    return NULL;
1045
580
}
1046
1047
1048
#if 0
1049
/* Returns the number of bits set in val; it the highest bit used */
1050
static int nbits(int v) {
1051
    static const int MultiplyDeBruijnBitPosition[32] = {
1052
        1, 10, 2, 11, 14, 22, 3, 30, 12, 15, 17, 19, 23, 26, 4, 31,
1053
        9, 13, 21, 29, 16, 18, 25, 8, 20, 28, 24, 7, 27, 6, 5, 32
1054
    };
1055
1056
    v |= v >> 1; // first up to set all bits 1 after the first 1 */
1057
    v |= v >> 2;
1058
    v |= v >> 4;
1059
    v |= v >> 8;
1060
    v |= v >> 16;
1061
1062
    // DeBruijn magic to find top bit
1063
    return MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
1064
}
1065
#endif
1066
1067
#if 0
1068
static int sort_freqs(const void *vp1, const void *vp2) {
1069
    const int i1 = *(const int *)vp1;
1070
    const int i2 = *(const int *)vp2;
1071
    return i1-i2;
1072
}
1073
#endif
1074
1075
/* ----------------------------------------------------------------------
1076
 * Primary CRAM sequence decoder
1077
 */
1078
1079
0
static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_dist) {
1080
0
    if (decode_md) {
1081
0
        BLOCK_APPEND_UINT(s->aux_blk, *md_dist);
1082
0
        BLOCK_APPEND_CHAR(s->aux_blk, c);
1083
0
        *md_dist = 0;
1084
0
    }
1085
0
    return 0;
1086
1087
0
 block_err:
1088
0
    return -1;
1089
0
}
1090
1091
/*
1092
 * Internal part of cram_decode_slice().
1093
 * Generates the sequence, quality and cigar components.
1094
 */
1095
static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
1096
                           cram_block *blk, cram_record *cr, sam_hdr_t *sh,
1097
                           int cf, char *seq, char *qual,
1098
0
                           int has_MD, int has_NM) {
1099
0
    int prev_pos = 0, f, r = 0, out_sz = 1;
1100
0
    int seq_pos = 1;
1101
0
    int cig_len = 0;
1102
0
    int64_t ref_pos = cr->apos;
1103
0
    int32_t fn, i32;
1104
0
    enum cigar_op cig_op = BAM_CMATCH;
1105
0
    uint32_t *cigar = s->cigar;
1106
0
    uint32_t ncigar = s->ncigar;
1107
0
    uint32_t cigar_alloc = s->cigar_alloc;
1108
0
    uint32_t nm = 0;
1109
0
    int32_t md_dist = 0;
1110
0
    int orig_aux = 0;
1111
    // CRAM <  4.0 decode_md is off/on
1112
    // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise)
1113
0
    int do_md = CRAM_MAJOR_VERS(fd->version) >= 4
1114
0
        ? (s->decode_md > 0)
1115
0
        : (s->decode_md != 0);
1116
0
    int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0);
1117
0
    int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0);
1118
0
    uint32_t ds = s->data_series;
1119
0
    sam_hrecs_t *bfd = sh->hrecs;
1120
1121
0
    cram_codec **codecs = c->comp_hdr->codecs;
1122
1123
0
    if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1124
0
        memset(qual, 255, cr->len);
1125
0
    }
1126
1127
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1128
0
        decode_md = decode_nm = 0;
1129
1130
0
    if (decode_md) {
1131
0
        orig_aux = BLOCK_SIZE(s->aux_blk);
1132
0
        if (has_MD == 0)
1133
0
            BLOCK_APPEND(s->aux_blk, "MDZ", 3);
1134
0
    }
1135
1136
0
    if (ds & CRAM_FN) {
1137
0
        if (!codecs[DS_FN]) return -1;
1138
0
        r |= codecs[DS_FN]->decode(s,codecs[DS_FN],
1139
0
                                   blk, (char *)&fn, &out_sz);
1140
0
        if (r) return r;
1141
0
    } else {
1142
0
        fn = 0;
1143
0
    }
1144
1145
0
    ref_pos--; // count from 0
1146
0
    cr->cigar = ncigar;
1147
1148
0
    if (!(ds & (CRAM_FC | CRAM_FP)))
1149
0
        goto skip_cigar;
1150
1151
0
    if (fn) {
1152
0
        if ((ds & CRAM_FC) && !codecs[DS_FC])
1153
0
            return -1;
1154
0
        if ((ds & CRAM_FP) && !codecs[DS_FP])
1155
0
            return -1;
1156
0
    }
1157
1158
0
    for (f = 0; f < fn; f++) {
1159
0
        int32_t pos = 0;
1160
0
        char op;
1161
1162
0
        if (ncigar+2 >= cigar_alloc) {
1163
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1164
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1165
0
                return -1;
1166
0
            s->cigar = cigar;
1167
0
        }
1168
1169
0
        if (ds & CRAM_FC) {
1170
0
            r |= codecs[DS_FC]->decode(s,
1171
0
                                       codecs[DS_FC],
1172
0
                                       blk,
1173
0
                                       &op,  &out_sz);
1174
0
            if (r) return r;
1175
0
        }
1176
1177
0
        if (!(ds & CRAM_FP))
1178
0
            continue;
1179
1180
0
        r |= codecs[DS_FP]->decode(s,
1181
0
                                   codecs[DS_FP],
1182
0
                                   blk,
1183
0
                                   (char *)&pos, &out_sz);
1184
0
        if (r) return r;
1185
0
        pos += prev_pos;
1186
1187
0
        if (pos <= 0) {
1188
0
            hts_log_error("Feature position %d before start of read", pos);
1189
0
            return -1;
1190
0
        }
1191
1192
0
        if (pos > seq_pos) {
1193
0
            if (pos > cr->len+1)
1194
0
                return -1;
1195
1196
0
            if (s->ref && cr->ref_id >= 0) {
1197
0
                if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) {
1198
0
                    static int whinged = 0;
1199
0
                    int rlen;
1200
0
                    if (!whinged)
1201
0
                        hts_log_warning("Ref pos outside of ref sequence boundary");
1202
0
                    whinged = 1;
1203
0
                    rlen = bfd->ref[cr->ref_id].len - ref_pos;
1204
                    // May miss MD/NM cases where both seq/ref are N, but this is a
1205
                    // malformed cram file anyway.
1206
0
                    if (rlen > 0) {
1207
0
                        if (ref_pos + rlen > s->ref_end)
1208
0
                            goto beyond_slice;
1209
1210
0
                        memcpy(&seq[seq_pos-1],
1211
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1212
0
                        if ((pos - seq_pos) - rlen > 0)
1213
0
                            memset(&seq[seq_pos-1+rlen], 'N',
1214
0
                                   (pos - seq_pos) - rlen);
1215
0
                    } else {
1216
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1217
0
                    }
1218
0
                    if (md_dist >= 0)
1219
0
                        md_dist += pos - seq_pos;
1220
0
                } else {
1221
                    // 'N' in both ref and seq is also mismatch for NM/MD
1222
0
                    if (ref_pos + pos-seq_pos > s->ref_end)
1223
0
                        goto beyond_slice;
1224
1225
0
                    const char *refp = s->ref + ref_pos - s->ref_start + 1;
1226
0
                    const int frag_len = pos - seq_pos;
1227
0
                    int do_cpy = 1;
1228
0
                    if (decode_md || decode_nm) {
1229
0
                        char *N = memchr(refp, 'N', frag_len);
1230
0
                        if (N) {
1231
0
                            int i;
1232
0
                            for (i = 0; i < frag_len; i++) {
1233
0
                                char base = refp[i];
1234
0
                                if (base == 'N') {
1235
0
                                    if (add_md_char(s, decode_md,
1236
0
                                                    'N', &md_dist) < 0)
1237
0
                                        return -1;
1238
0
                                    nm++;
1239
0
                                } else {
1240
0
                                    md_dist++;
1241
0
                                }
1242
0
                                seq[seq_pos-1+i] = base;
1243
0
                            }
1244
0
                            do_cpy = 0;
1245
0
                        } else {
1246
0
                            md_dist += frag_len;
1247
0
                        }
1248
0
                    }
1249
0
                    if (do_cpy)
1250
0
                        memcpy(&seq[seq_pos-1], refp, frag_len);
1251
0
                }
1252
0
            }
1253
#ifdef USE_X
1254
            if (cig_len && cig_op != BAM_CBASE_MATCH) {
1255
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1256
                cig_len = 0;
1257
            }
1258
            cig_op = BAM_CBASE_MATCH;
1259
#else
1260
0
            if (cig_len && cig_op != BAM_CMATCH) {
1261
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1262
0
                cig_len = 0;
1263
0
            }
1264
0
            cig_op = BAM_CMATCH;
1265
0
#endif
1266
0
            cig_len += pos - seq_pos;
1267
0
            ref_pos += pos - seq_pos;
1268
0
            seq_pos = pos;
1269
0
        }
1270
1271
0
        prev_pos = pos;
1272
1273
0
        if (!(ds & CRAM_FC))
1274
0
            goto skip_cigar;
1275
1276
0
        switch(op) {
1277
0
        case 'S': { // soft clip: IN
1278
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1279
0
            int have_sc = 0;
1280
1281
0
            if (cig_len) {
1282
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1283
0
                cig_len = 0;
1284
0
            }
1285
0
            switch (CRAM_MAJOR_VERS(fd->version)) {
1286
0
            case 1:
1287
0
                if (ds & CRAM_IN) {
1288
0
                    r |= codecs[DS_IN]
1289
0
                        ? codecs[DS_IN]->decode(s, codecs[DS_IN],
1290
0
                                                blk,
1291
0
                                                cr->len ? &seq[pos-1] : NULL,
1292
0
                                                &out_sz2)
1293
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1294
0
                    have_sc = 1;
1295
0
                }
1296
0
                break;
1297
0
            case 2:
1298
0
            default:
1299
0
                if (ds & CRAM_SC) {
1300
0
                    r |= codecs[DS_SC]
1301
0
                        ? codecs[DS_SC]->decode(s, codecs[DS_SC],
1302
0
                                                blk,
1303
0
                                                cr->len ? &seq[pos-1] : NULL,
1304
0
                                                &out_sz2)
1305
0
                        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1306
0
                    have_sc = 1;
1307
0
                }
1308
0
                break;
1309
1310
                //default:
1311
                //    r |= codecs[DS_BB]
1312
                //        ? codecs[DS_BB]->decode(s, codecs[DS_BB],
1313
                //                                blk, &seq[pos-1], &out_sz2)
1314
                //        : (seq[pos-1] = 'N', out_sz2 = 1, 0);
1315
0
            }
1316
0
            if (have_sc) {
1317
0
                if (r) return r;
1318
0
                cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1319
0
                cig_op = BAM_CSOFT_CLIP;
1320
0
                seq_pos += out_sz2;
1321
0
            }
1322
0
            break;
1323
0
        }
1324
1325
0
        case 'X': { // Substitution; BS
1326
0
            unsigned char base;
1327
#ifdef USE_X
1328
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1329
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1330
                cig_len = 0;
1331
            }
1332
            if (ds & CRAM_BS) {
1333
                if (!codecs[DS_BS]) return -1;
1334
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1335
                                           (char *)&base, &out_sz);
1336
                if (pos-1 < cr->len)
1337
                    seq[pos-1] = 'N'; // FIXME look up BS=base value
1338
            }
1339
            cig_op = BAM_CBASE_MISMATCH;
1340
#else
1341
0
            int ref_base;
1342
0
            if (cig_len && cig_op != BAM_CMATCH) {
1343
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1344
0
                cig_len = 0;
1345
0
            }
1346
0
            if (ds & CRAM_BS) {
1347
0
                if (!codecs[DS_BS]) return -1;
1348
0
                r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk,
1349
0
                                           (char *)&base, &out_sz);
1350
0
                if (r) return -1;
1351
0
                if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1352
0
                    if (pos-1 < cr->len)
1353
0
                        seq[pos-1] = c->comp_hdr->
1354
0
                            substitution_matrix[fd->L1['N']][base];
1355
0
                    if (decode_md || decode_nm) {
1356
0
                        if (md_dist >= 0 && decode_md)
1357
0
                            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1358
0
                        md_dist = -1;
1359
0
                        nm--;
1360
0
                    }
1361
0
                } else {
1362
0
                    unsigned char ref_call = ref_pos < s->ref_end
1363
0
                        ? (uc)s->ref[ref_pos - s->ref_start +1]
1364
0
                        : 'N';
1365
0
                    ref_base = fd->L1[ref_call];
1366
0
                    if (pos-1 < cr->len)
1367
0
                        seq[pos-1] = c->comp_hdr->
1368
0
                            substitution_matrix[ref_base][base];
1369
0
                    if (add_md_char(s, decode_md, ref_call, &md_dist) < 0)
1370
0
                        return -1;
1371
0
                }
1372
0
            }
1373
0
            cig_op = BAM_CMATCH;
1374
0
#endif
1375
0
            nm++;
1376
0
            cig_len++;
1377
0
            seq_pos++;
1378
0
            ref_pos++;
1379
0
            break;
1380
0
        }
1381
1382
0
        case 'D': { // Deletion; DL
1383
0
            if (cig_len && cig_op != BAM_CDEL) {
1384
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1385
0
                cig_len = 0;
1386
0
            }
1387
0
            if (ds & CRAM_DL) {
1388
0
                if (!codecs[DS_DL]) return -1;
1389
0
                r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk,
1390
0
                                           (char *)&i32, &out_sz);
1391
0
                if (r) return r;
1392
0
                if (decode_md || decode_nm) {
1393
0
                    if (ref_pos + i32 > s->ref_end)
1394
0
                        goto beyond_slice;
1395
0
                    if (md_dist >= 0 && decode_md)
1396
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1397
0
                    if (ref_pos + i32 <= bfd->ref[cr->ref_id].len) {
1398
0
                        if (decode_md) {
1399
0
                            BLOCK_APPEND_CHAR(s->aux_blk, '^');
1400
0
                            BLOCK_APPEND(s->aux_blk,
1401
0
                                         &s->ref[ref_pos - s->ref_start +1],
1402
0
                                         i32);
1403
0
                            md_dist = 0;
1404
0
                        }
1405
0
                        nm += i32;
1406
0
                    } else {
1407
0
                        uint32_t dlen;
1408
0
                        if (bfd->ref[cr->ref_id].len >= ref_pos) {
1409
0
                            if (decode_md) {
1410
0
                                BLOCK_APPEND_CHAR(s->aux_blk, '^');
1411
0
                                BLOCK_APPEND(s->aux_blk,
1412
0
                                             &s->ref[ref_pos - s->ref_start+1],
1413
0
                                             bfd->ref[cr->ref_id].len-ref_pos);
1414
0
                                BLOCK_APPEND_UINT(s->aux_blk, 0);
1415
0
                            }
1416
0
                            dlen = i32 - (bfd->ref[cr->ref_id].len - ref_pos);
1417
0
                            nm += i32 - dlen;
1418
0
                        } else {
1419
0
                            dlen = i32;
1420
0
                        }
1421
1422
0
                        md_dist = -1;
1423
0
                    }
1424
0
                }
1425
0
                cig_op = BAM_CDEL;
1426
0
                cig_len += i32;
1427
0
                ref_pos += i32;
1428
                //printf("  %d: DL = %d (ret %d)\n", f, i32, r);
1429
0
            }
1430
0
            break;
1431
0
        }
1432
1433
0
        case 'I': { // Insertion (several bases); IN
1434
0
            int32_t out_sz2 = cr->len ? cr->len-(pos-1) : 1;
1435
1436
0
            if (cig_len && cig_op != BAM_CINS) {
1437
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1438
0
                cig_len = 0;
1439
0
            }
1440
1441
0
            if (ds & CRAM_IN) {
1442
0
                if (!codecs[DS_IN]) return -1;
1443
0
                r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk,
1444
0
                                           cr->len ? &seq[pos-1] : NULL,
1445
0
                                           &out_sz2);
1446
0
                if (r) return r;
1447
0
                cig_op = BAM_CINS;
1448
0
                cig_len += out_sz2;
1449
0
                seq_pos += out_sz2;
1450
0
                nm      += out_sz2;
1451
                //printf("  %d: IN(I) = %.*s (ret %d, out_sz %d)\n", f, out_sz2, dat, r, out_sz2);
1452
0
            }
1453
0
            break;
1454
0
        }
1455
1456
0
        case 'i': { // Insertion (single base); BA
1457
0
            if (cig_len && cig_op != BAM_CINS) {
1458
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1459
0
                cig_len = 0;
1460
0
            }
1461
0
            if (ds & CRAM_BA) {
1462
0
                if (!codecs[DS_BA]) return -1;
1463
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1464
0
                                           cr->len ? &seq[pos-1] : NULL,
1465
0
                                           &out_sz);
1466
0
                if (r) return r;
1467
0
            }
1468
0
            cig_op = BAM_CINS;
1469
0
            cig_len++;
1470
0
            seq_pos++;
1471
0
            nm++;
1472
0
            break;
1473
0
        }
1474
1475
0
        case 'b': { // Several bases
1476
0
            int32_t len = cr->len ? cr->len-(pos-1) : 1;
1477
1478
0
            if (cig_len && cig_op != BAM_CMATCH) {
1479
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1480
0
                cig_len = 0;
1481
0
            }
1482
1483
0
            if (ds & CRAM_BB) {
1484
0
                if (!codecs[DS_BB]) return -1;
1485
0
                r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk,
1486
0
                                           cr->len ? &seq[pos-1] : NULL,
1487
0
                                           &len);
1488
0
                if (r) return r;
1489
1490
0
                if (decode_md || decode_nm) {
1491
0
                    int x;
1492
0
                    if (md_dist >= 0 && decode_md)
1493
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1494
1495
0
                    for (x = 0; x < len; x++) {
1496
0
                        if (x && decode_md)
1497
0
                            BLOCK_APPEND_UINT(s->aux_blk, 0);
1498
0
                        if (ref_pos+x >= bfd->ref[cr->ref_id].len || !s->ref) {
1499
0
                            md_dist = -1;
1500
0
                            break;
1501
0
                        } else {
1502
0
                            if (decode_md) {
1503
0
                                if (ref_pos + x > s->ref_end)
1504
0
                                    goto beyond_slice;
1505
0
                                char r = s->ref[ref_pos+x-s->ref_start +1];
1506
0
                                BLOCK_APPEND_CHAR(s->aux_blk, r);
1507
0
                            }
1508
0
                        }
1509
0
                    }
1510
1511
0
                    nm += x;
1512
0
                    md_dist = 0;
1513
0
                }
1514
0
            }
1515
1516
0
            cig_op = BAM_CMATCH;
1517
1518
0
            cig_len+=len;
1519
0
            seq_pos+=len;
1520
0
            ref_pos+=len;
1521
            //prev_pos+=len;
1522
0
            break;
1523
0
        }
1524
1525
0
        case 'q': { // Several quality values
1526
0
            int32_t len = cr->len ? cr->len - (pos-1) : 1;
1527
1528
0
            if (cig_len && cig_op != BAM_CMATCH) {
1529
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1530
0
                cig_len = 0;
1531
0
            }
1532
1533
0
            if (ds & CRAM_QQ) {
1534
0
                if (!codecs[DS_QQ]) return -1;
1535
0
                if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1536
0
                    && (unsigned char)*qual == 255)
1537
0
                    memset(qual, 30, cr->len); // ?
1538
0
                r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk,
1539
0
                                           (char *)&qual[pos-1], &len);
1540
0
                if (r) return r;
1541
0
            }
1542
1543
0
            cig_op = BAM_CMATCH;
1544
1545
            //prev_pos+=len;
1546
0
            break;
1547
0
        }
1548
1549
0
        case 'B': { // Read base; BA, QS
1550
#ifdef USE_X
1551
            if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1552
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1553
                cig_len = 0;
1554
            }
1555
#else
1556
0
            if (cig_len && cig_op != BAM_CMATCH) {
1557
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1558
0
                cig_len = 0;
1559
0
            }
1560
0
#endif
1561
0
            if (ds & CRAM_BA) {
1562
0
                if (!codecs[DS_BA]) return -1;
1563
0
                r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk,
1564
0
                                           cr->len ? &seq[pos-1] : NULL,
1565
0
                                           &out_sz);
1566
1567
0
                if (decode_md || decode_nm) {
1568
0
                    if (md_dist >= 0 && decode_md)
1569
0
                        BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1570
0
                    if (ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) {
1571
0
                        md_dist = -1;
1572
0
                    } else {
1573
0
                        if (decode_md) {
1574
0
                            if (ref_pos > s->ref_end)
1575
0
                                goto beyond_slice;
1576
0
                            BLOCK_APPEND_CHAR(s->aux_blk,
1577
0
                                              s->ref[ref_pos-s->ref_start +1]);
1578
0
                        }
1579
0
                        nm++;
1580
0
                        md_dist = 0;
1581
0
                    }
1582
0
                }
1583
0
            }
1584
0
            if (ds & CRAM_QS) {
1585
0
                if (!codecs[DS_QS]) return -1;
1586
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)
1587
0
                    && (unsigned char)*qual == 255)
1588
0
                    memset(qual, 30, cr->len); // ASCII ?.  Same as htsjdk
1589
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1590
0
                                           (char *)&qual[pos-1], &out_sz);
1591
0
            }
1592
#ifdef USE_X
1593
            cig_op = BAM_CBASE_MISMATCH;
1594
#else
1595
0
            cig_op = BAM_CMATCH;
1596
0
#endif
1597
0
            cig_len++;
1598
0
            seq_pos++;
1599
0
            ref_pos++;
1600
            //printf("  %d: BA/QS(B) = %c/%d (ret %d)\n", f, i32, qc, r);
1601
0
            break;
1602
0
        }
1603
1604
0
        case 'Q': { // Quality score; QS
1605
0
            if (ds & CRAM_QS) {
1606
0
                if (!codecs[DS_QS]) return -1;
1607
0
                if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) &&
1608
0
                    (unsigned char)*qual == 255)
1609
0
                    memset(qual, 30, cr->len); // ?
1610
0
                r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1611
0
                                           (char *)&qual[pos-1], &out_sz);
1612
                //printf("  %d: QS = %d (ret %d)\n", f, qc, r);
1613
0
            }
1614
0
            break;
1615
0
        }
1616
1617
0
        case 'H': { // hard clip; HC
1618
0
            if (cig_len && cig_op != BAM_CHARD_CLIP) {
1619
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1620
0
                cig_len = 0;
1621
0
            }
1622
0
            if (ds & CRAM_HC) {
1623
0
                if (!codecs[DS_HC]) return -1;
1624
0
                r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk,
1625
0
                                           (char *)&i32, &out_sz);
1626
0
                if (r) return r;
1627
0
                cig_op = BAM_CHARD_CLIP;
1628
0
                cig_len += i32;
1629
0
            }
1630
0
            break;
1631
0
        }
1632
1633
0
        case 'P': { // padding; PD
1634
0
            if (cig_len && cig_op != BAM_CPAD) {
1635
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1636
0
                cig_len = 0;
1637
0
            }
1638
0
            if (ds & CRAM_PD) {
1639
0
                if (!codecs[DS_PD]) return -1;
1640
0
                r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk,
1641
0
                                           (char *)&i32, &out_sz);
1642
0
                if (r) return r;
1643
0
                cig_op = BAM_CPAD;
1644
0
                cig_len += i32;
1645
0
            }
1646
0
            break;
1647
0
        }
1648
1649
0
        case 'N': { // Ref skip; RS
1650
0
            if (cig_len && cig_op != BAM_CREF_SKIP) {
1651
0
                cigar[ncigar++] = (cig_len<<4) + cig_op;
1652
0
                cig_len = 0;
1653
0
            }
1654
0
            if (ds & CRAM_RS) {
1655
0
                if (!codecs[DS_RS]) return -1;
1656
0
                r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk,
1657
0
                                           (char *)&i32, &out_sz);
1658
0
                if (r) return r;
1659
0
                cig_op = BAM_CREF_SKIP;
1660
0
                cig_len += i32;
1661
0
                ref_pos += i32;
1662
0
            }
1663
0
            break;
1664
0
        }
1665
1666
0
        default:
1667
0
            hts_log_error("Unknown feature code '%c'", op);
1668
0
            return -1;
1669
0
        }
1670
0
    }
1671
1672
0
    if (!(ds & CRAM_FC))
1673
0
        goto skip_cigar;
1674
1675
    /* An implicit match op for any unaccounted for bases */
1676
0
    if ((ds & CRAM_FN) && cr->len >= seq_pos) {
1677
0
        if (s->ref && cr->ref_id >= 0) {
1678
0
            if (ref_pos + cr->len - seq_pos + 1 > bfd->ref[cr->ref_id].len) {
1679
0
                static int whinged = 0;
1680
0
                int rlen;
1681
0
                if (!whinged)
1682
0
                    hts_log_warning("Ref pos outside of ref sequence boundary");
1683
0
                whinged = 1;
1684
0
                rlen = bfd->ref[cr->ref_id].len - ref_pos;
1685
                // May miss MD/NM cases where both seq/ref are N, but this is a
1686
                // malformed cram file anyway.
1687
0
                if (rlen > 0) {
1688
0
                    if (seq_pos-1 + rlen < cr->len)
1689
0
                        memcpy(&seq[seq_pos-1],
1690
0
                               &s->ref[ref_pos - s->ref_start +1], rlen);
1691
0
                    if ((cr->len - seq_pos + 1) - rlen > 0)
1692
0
                        memset(&seq[seq_pos-1+rlen], 'N',
1693
0
                               (cr->len - seq_pos + 1) - rlen);
1694
0
                } else {
1695
0
                    if (cr->len - seq_pos + 1 > 0)
1696
0
                        memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1);
1697
0
                }
1698
0
                if (md_dist >= 0)
1699
0
                    md_dist += cr->len - seq_pos + 1;
1700
0
            } else {
1701
0
                if (cr->len - seq_pos + 1 > 0) {
1702
0
                    if (ref_pos + cr->len-seq_pos +1 > s->ref_end)
1703
0
                        goto beyond_slice;
1704
0
                    int remainder = cr->len - (seq_pos-1);
1705
0
                    int j = ref_pos - s->ref_start + 1;
1706
0
                    if (decode_md || decode_nm) {
1707
0
                        int i;
1708
0
                        char *N = memchr(&s->ref[j], 'N', remainder);
1709
0
                        if (!N) {
1710
                            // short cut the common case
1711
0
                            md_dist += cr->len - (seq_pos-1);
1712
0
                        } else {
1713
0
                            char *refp = &s->ref[j-(seq_pos-1)];
1714
0
                            md_dist += N-&s->ref[j];
1715
0
                            int i_start = seq_pos-1 + (N - &s->ref[j]);
1716
0
                            for (i = i_start; i < cr->len; i++) {
1717
0
                                char base = refp[i];
1718
0
                                if (base == 'N') {
1719
0
                                    if (add_md_char(s, decode_md, 'N',
1720
0
                                                    &md_dist) < 0)
1721
0
                                        return -1;
1722
0
                                    nm++;
1723
0
                                } else {
1724
0
                                    md_dist++;
1725
0
                                }
1726
0
                            }
1727
0
                        }
1728
0
                    }
1729
0
                    memcpy(&seq[seq_pos-1], &s->ref[j], remainder);
1730
0
                }
1731
0
                ref_pos += cr->len - seq_pos + 1;
1732
0
            }
1733
0
        } else if (cr->ref_id >= 0) {
1734
            // So alignment end can be computed even when not decoding sequence
1735
0
            ref_pos += cr->len - seq_pos + 1;
1736
0
        }
1737
1738
0
        if (ncigar+1 >= cigar_alloc) {
1739
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1740
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1741
0
                return -1;
1742
0
            s->cigar = cigar;
1743
0
        }
1744
#ifdef USE_X
1745
        if (cig_len && cig_op != BAM_CBASE_MATCH) {
1746
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1747
            cig_len = 0;
1748
        }
1749
        cig_op = BAM_CBASE_MATCH;
1750
#else
1751
0
        if (cig_len && cig_op != BAM_CMATCH) {
1752
0
            cigar[ncigar++] = (cig_len<<4) + cig_op;
1753
0
            cig_len = 0;
1754
0
        }
1755
0
        cig_op = BAM_CMATCH;
1756
0
#endif
1757
0
        cig_len += cr->len - seq_pos+1;
1758
0
    }
1759
1760
0
 skip_cigar:
1761
1762
0
    if ((ds & CRAM_FN) && decode_md) {
1763
0
        if (md_dist >= 0)
1764
0
            BLOCK_APPEND_UINT(s->aux_blk, md_dist);
1765
0
    }
1766
1767
0
    if (cig_len) {
1768
0
        if (ncigar >= cigar_alloc) {
1769
0
            cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1770
0
            if (!(cigar = realloc(s->cigar, cigar_alloc * sizeof(*cigar))))
1771
0
                return -1;
1772
0
            s->cigar = cigar;
1773
0
        }
1774
1775
0
        cigar[ncigar++] = (cig_len<<4) + cig_op;
1776
0
    }
1777
1778
0
    cr->ncigar = ncigar - cr->cigar;
1779
0
    cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos;
1780
1781
    //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
1782
1783
0
    if (ds & CRAM_MQ) {
1784
0
        if (!codecs[DS_MQ]) return -1;
1785
0
        r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk,
1786
0
                                   (char *)&cr->mqual, &out_sz);
1787
0
    } else {
1788
0
        cr->mqual = 40;
1789
0
    }
1790
1791
0
    if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
1792
0
        int32_t out_sz2 = cr->len;
1793
1794
0
        if (!codecs[DS_QS]) return -1;
1795
0
        r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk,
1796
0
                                   qual, &out_sz2);
1797
0
    }
1798
1799
0
    s->cigar = cigar;
1800
0
    s->cigar_alloc = cigar_alloc;
1801
0
    s->ncigar = ncigar;
1802
1803
0
    if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
1804
0
        cr->len = 0;
1805
1806
0
    if (decode_md) {
1807
0
        BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z:
1808
0
        size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux;
1809
0
        if (has_MD < 0) {
1810
            // has_MD < 0; already have MDZ allocated in aux at -has_MD,
1811
            // but wrote MD to end of aux (at orig_aux).
1812
            // We need some memmoves to shuffle it around.
1813
0
            char tmp_MD_[1024], *tmp_MD = tmp_MD_;
1814
0
            unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux;
1815
0
            if (sz > 1024) {
1816
0
                tmp_MD = malloc(sz);
1817
0
                if (!tmp_MD)
1818
0
                    return -1;
1819
0
            }
1820
0
            memcpy(tmp_MD, orig_aux_p, sz);
1821
0
            memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz,
1822
0
                    &BLOCK_DATA(s->aux_blk)[-has_MD],
1823
0
                    orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]);
1824
0
            memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz);
1825
0
            if (tmp_MD != tmp_MD_)
1826
0
                free(tmp_MD);
1827
1828
0
            if (-has_NM > -has_MD)
1829
                // we inserted before NM, so move it up a bit
1830
0
                has_NM -= sz;
1831
0
        }
1832
        // else has_MD == 0 and we've already appended MD to the end.
1833
1834
0
        cr->aux_size += sz;
1835
0
    }
1836
1837
0
    if (decode_nm) {
1838
0
        if (has_NM == 0) {
1839
0
            char buf[7];
1840
0
            size_t buf_size;
1841
0
            buf[0] = 'N'; buf[1] = 'M';
1842
0
            if (nm <= UINT8_MAX) {
1843
0
                buf_size = 4;
1844
0
                buf[2] = 'C';
1845
0
                buf[3] = (nm>> 0) & 0xff;
1846
0
            } else if (nm <= UINT16_MAX) {
1847
0
                buf_size = 5;
1848
0
                buf[2] = 'S';
1849
0
                buf[3] = (nm>> 0) & 0xff;
1850
0
                buf[4] = (nm>> 8) & 0xff;
1851
0
            } else {
1852
0
                buf_size = 7;
1853
0
                buf[2] = 'I';
1854
0
                buf[3] = (nm>> 0) & 0xff;
1855
0
                buf[4] = (nm>> 8) & 0xff;
1856
0
                buf[5] = (nm>>16) & 0xff;
1857
0
                buf[6] = (nm>>24) & 0xff;
1858
0
            }
1859
0
            BLOCK_APPEND(s->aux_blk, buf, buf_size);
1860
0
            cr->aux_size += buf_size;
1861
0
        } else {
1862
            // Preallocated space for NM at -has_NM into aux block
1863
0
            unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM;
1864
0
            buf[0] = (nm>> 0) & 0xff;
1865
0
            buf[1] = (nm>> 8) & 0xff;
1866
0
            buf[2] = (nm>>16) & 0xff;
1867
0
            buf[3] = (nm>>24) & 0xff;
1868
0
        }
1869
0
    }
1870
1871
0
    return r;
1872
1873
0
 beyond_slice:
1874
    // Cramtools can create CRAMs that have sequence features outside the
1875
    // stated range of the container & slice reference extents (start + span).
1876
    // We have to check for these in many places, but for brevity have the
1877
    // error reporting in only one.
1878
0
    hts_log_error("CRAM CIGAR extends beyond slice reference extents");
1879
0
    return -1;
1880
1881
0
 block_err:
1882
0
    return -1;
1883
0
}
1884
1885
/*
1886
 * Quick and simple hash lookup for cram_map arrays
1887
 */
1888
0
static cram_map *map_find(cram_map **map, unsigned char *key, int id) {
1889
0
    cram_map *m;
1890
1891
0
    m = map[CRAM_MAP(key[0],key[1])];
1892
0
    while (m && m->key != id)
1893
0
        m= m->next;
1894
1895
0
    return m;
1896
0
}
1897
1898
//#define map_find(M,K,I) M[CRAM_MAP(K[0],K[1])];while (m && m->key != I);m= m->next
1899
1900
1901
static int cram_decode_aux_1_0(cram_container *c, cram_slice *s,
1902
0
                               cram_block *blk, cram_record *cr) {
1903
0
    int i, r = 0, out_sz = 1;
1904
0
    unsigned char ntags;
1905
1906
0
    if (!c->comp_hdr->codecs[DS_TC]) return -1;
1907
0
    r |= c->comp_hdr->codecs[DS_TC]->decode(s, c->comp_hdr->codecs[DS_TC], blk,
1908
0
                                            (char *)&ntags, &out_sz);
1909
0
    cr->ntags = ntags;
1910
1911
    //printf("TC=%d\n", cr->ntags);
1912
0
    cr->aux_size = 0;
1913
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1914
1915
0
    for (i = 0; i < cr->ntags; i++) {
1916
0
        int32_t id, out_sz = 1;
1917
0
        unsigned char tag_data[3];
1918
0
        cram_map *m;
1919
1920
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1921
0
        if (!c->comp_hdr->codecs[DS_TN]) return -1;
1922
0
        r |= c->comp_hdr->codecs[DS_TN]->decode(s, c->comp_hdr->codecs[DS_TN],
1923
0
                                                blk, (char *)&id, &out_sz);
1924
0
        if (out_sz == 3) {
1925
            // Tag name stored as 3 chars instead of an int?
1926
0
            memcpy(tag_data, &id, 3);
1927
0
        } else {
1928
0
            tag_data[0] = (id>>16) & 0xff;
1929
0
            tag_data[1] = (id>>8)  & 0xff;
1930
0
            tag_data[2] = id       & 0xff;
1931
0
        }
1932
1933
0
        m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
1934
0
        if (!m)
1935
0
            return -1;
1936
0
        BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
1937
1938
0
        if (!m->codec) return -1;
1939
0
        r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
1940
1941
0
        cr->aux_size += out_sz + 3;
1942
0
    }
1943
1944
0
    return r;
1945
1946
0
 block_err:
1947
0
    return -1;
1948
0
}
1949
1950
// has_MD and has_NM are filled out with 0 for none present,
1951
// 1 for present and verbatim, and -pos for present as placeholder
1952
// (MD*, NM*) to be generated and filled out at offset +pos.
1953
static int cram_decode_aux(cram_fd *fd,
1954
                           cram_container *c, cram_slice *s,
1955
                           cram_block *blk, cram_record *cr,
1956
0
                           int *has_MD, int *has_NM) {
1957
0
    int i, r = 0, out_sz = 1;
1958
0
    int32_t TL = 0;
1959
0
    unsigned char *TN;
1960
0
    uint32_t ds = s->data_series;
1961
1962
0
    if (!(ds & (CRAM_TL|CRAM_aux))) {
1963
0
        cr->aux = 0;
1964
0
        cr->aux_size = 0;
1965
0
        return 0;
1966
0
    }
1967
1968
0
    if (!c->comp_hdr->codecs[DS_TL]) return -1;
1969
0
    r |= c->comp_hdr->codecs[DS_TL]->decode(s, c->comp_hdr->codecs[DS_TL], blk,
1970
0
                                            (char *)&TL, &out_sz);
1971
0
    if (r || TL < 0 || TL >= c->comp_hdr->nTL)
1972
0
        return -1;
1973
1974
0
    TN = c->comp_hdr->TL[TL];
1975
0
    cr->ntags = strlen((char *)TN)/3; // optimise to remove strlen
1976
1977
    //printf("TC=%d\n", cr->ntags);
1978
0
    cr->aux_size = 0;
1979
0
    cr->aux = BLOCK_SIZE(s->aux_blk);
1980
1981
0
    if (!(ds & CRAM_aux))
1982
0
        return 0;
1983
1984
0
    for (i = 0; i < cr->ntags; i++) {
1985
0
        int32_t id, out_sz = 1;
1986
0
        unsigned char tag_data[7];
1987
0
        cram_map *m;
1988
1989
0
        if (TN[0] == 'M' && TN[1] == 'D' && has_MD)
1990
0
            *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);
1991
0
        if (TN[0] == 'N' && TN[1] == 'M' && has_NM)
1992
0
            *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);;
1993
1994
        //printf("Tag %d/%d\n", i+1, cr->ntags);
1995
0
        tag_data[0] = TN[0];
1996
0
        tag_data[1] = TN[1];
1997
0
        tag_data[2] = TN[2];
1998
0
        id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2];
1999
2000
0
        if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') {
2001
            // Place holder, fill out contents later.
2002
0
            int tag_data_size;
2003
0
            if (TN[0] == 'N' && TN[1] == 'M') {
2004
                // Use a fixed size, so we can allocate room for it now.
2005
0
                memcpy(&tag_data[2], "I\0\0\0\0", 5);
2006
0
                tag_data_size = 7;
2007
0
            } else if (TN[0] == 'R' && TN[1] == 'G') {
2008
                // RG is variable size, but known already.  Insert now
2009
0
                TN += 3;
2010
                // Equiv to fd->header->hrecs->rg[cr->rg], but this is the
2011
                // new header API equivalent.
2012
0
                const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg);
2013
0
                if (!rg)
2014
0
                    continue;
2015
2016
0
                size_t rg_len = strlen(rg);
2017
0
                tag_data[2] = 'Z';
2018
0
                BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2019
0
                BLOCK_APPEND(s->aux_blk, rg, rg_len);
2020
0
                BLOCK_APPEND_CHAR(s->aux_blk, '\0');
2021
0
                cr->aux_size += 3 + rg_len + 1;
2022
0
                cr->rg = -1; // prevents auto-add later
2023
0
                continue;
2024
0
            } else {
2025
                // Unknown size.  We'll insert MD into stream later.
2026
0
                tag_data[2] = 'Z';
2027
0
                tag_data_size = 3;
2028
0
            }
2029
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size);
2030
0
            cr->aux_size += tag_data_size;
2031
0
            TN += 3;
2032
0
        } else {
2033
0
            TN += 3;
2034
0
            m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id);
2035
0
            if (!m)
2036
0
                return -1;
2037
2038
0
            BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3);
2039
2040
0
            if (!m->codec) return -1;
2041
0
            r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz);
2042
0
            if (r) break;
2043
0
            cr->aux_size += out_sz + 3;
2044
2045
            // cF CRAM flags.
2046
0
            if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) {
2047
                // Remove cF tag
2048
0
                uint8_t cF = BLOCK_END(s->aux_blk)[-1];
2049
0
                BLOCK_SIZE(s->aux_blk) -= out_sz+3;
2050
0
                cr->aux_size -= out_sz+3;
2051
2052
                // bit 1 => don't auto-decode MD.
2053
                // Pretend MD is present verbatim, so we don't auto-generate
2054
0
                if ((cF & 1) && has_MD && *has_MD == 0)
2055
0
                    *has_MD = 1;
2056
2057
                // bit 1 => don't auto-decode NM
2058
0
                if ((cF & 2) && has_NM && *has_NM == 0)
2059
0
                    *has_NM = 1;
2060
0
            }
2061
0
        }
2062
2063
        // We could go to 2^32 fine, but we shouldn't be hitting this anyway,
2064
        // and it's protecting against memory hogs too.
2065
0
        if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) {
2066
0
            hts_log_error("CRAM->BAM aux block size overflow");
2067
0
            goto block_err;
2068
0
        }
2069
0
    }
2070
2071
0
    return r;
2072
2073
0
 block_err:
2074
0
    return -1;
2075
0
}
2076
2077
/* Resolve mate pair cross-references between recs within this slice */
2078
561
static int cram_decode_slice_xref(cram_slice *s, int required_fields) {
2079
561
    int rec;
2080
2081
561
    if (!(required_fields & (SAM_RNEXT | SAM_PNEXT | SAM_TLEN))) {
2082
0
        for (rec = 0; rec < s->hdr->num_records; rec++) {
2083
0
            cram_record *cr = &s->crecs[rec];
2084
2085
0
            cr->tlen = 0;
2086
0
            cr->mate_pos = 0;
2087
0
            cr->mate_ref_id = -1;
2088
0
        }
2089
2090
0
        return 0;
2091
0
    }
2092
2093
561
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2094
0
        cram_record *cr = &s->crecs[rec];
2095
2096
0
        if (cr->mate_line >= 0) {
2097
0
            if (cr->mate_line < s->hdr->num_records) {
2098
                /*
2099
                 * On the first read, loop through computing lengths.
2100
                 * It's not perfect as we have one slice per reference so we
2101
                 * cannot detect when TLEN should be zero due to seqs that
2102
                 * map to multiple references.
2103
                 *
2104
                 * We also cannot set tlen correct when it spans a slice for
2105
                 * other reasons. This may make tlen too small. Should we
2106
                 * fix this by forcing TLEN to be stored verbatim in such cases?
2107
                 *
2108
                 * Or do we just admit defeat and output 0 for tlen? It's the
2109
                 * safe option...
2110
                 */
2111
0
                if (cr->tlen == INT64_MIN) {
2112
0
                    int id1 = rec, id2 = rec;
2113
0
                    int64_t aleft = cr->apos, aright = cr->aend;
2114
0
                    int64_t tlen;
2115
0
                    int ref = cr->ref_id;
2116
2117
                    // number of segments starting at the same point.
2118
0
                    int left_cnt = 0;
2119
0
                    int right_cnt = 0;
2120
2121
0
                    do {
2122
0
                        if (aleft > s->crecs[id2].apos)
2123
0
                            aleft = s->crecs[id2].apos, left_cnt = 1;
2124
0
                        else if (aleft == s->crecs[id2].apos)
2125
0
                            left_cnt++;
2126
0
                        if (aright < s->crecs[id2].aend) {
2127
0
                            aright = s->crecs[id2].aend;
2128
0
                            right_cnt = 1;
2129
0
                        } else if (aright == s->crecs[id2].aend) {
2130
0
                            right_cnt++;
2131
0
                        }
2132
0
                        if (s->crecs[id2].mate_line == -1) {
2133
0
                            s->crecs[id2].mate_line = rec;
2134
0
                            break;
2135
0
                        }
2136
0
                        if (s->crecs[id2].mate_line <= id2 ||
2137
0
                            s->crecs[id2].mate_line >= s->hdr->num_records)
2138
0
                            return -1;
2139
0
                        id2 = s->crecs[id2].mate_line;
2140
2141
0
                        if (s->crecs[id2].ref_id != ref)
2142
0
                            ref = -1;
2143
0
                    } while (id2 != id1);
2144
2145
0
                    if (ref != -1) {
2146
0
                        tlen = aright - aleft + 1;
2147
0
                        id1 = id2 = rec;
2148
2149
                        /*
2150
                         * When we have two seqs with identical start and
2151
                         * end coordinates, set +/- tlen based on 1st/last
2152
                         * bit flags instead, as a tie breaker.
2153
                         */
2154
0
                        if (s->crecs[id2].apos == aleft &&
2155
0
                            (s->crecs[id2].aend < aright ||
2156
0
                             left_cnt <= 1)) {
2157
                            // Leftmost, and not the rightmost
2158
0
                            s->crecs[id2].tlen = tlen;
2159
0
                            tlen = -tlen;
2160
0
                        } else if (s->crecs[id2].apos == aleft &&
2161
0
                                   s->crecs[id2].aend == aright &&
2162
0
                                   left_cnt > 1 && right_cnt > 1) {
2163
                            // Both leftmost and rightmost, resolve tie via
2164
                            // the BAM flags so changing order doesn't change
2165
                            // TLEN signs.
2166
0
                            if (s->crecs[id2].flags & BAM_FREAD1) {
2167
0
                                s->crecs[id2].tlen = tlen;
2168
0
                                tlen = -tlen;
2169
0
                            } else {
2170
0
                                s->crecs[id2].tlen = -tlen;
2171
0
                            }
2172
0
                        } else {
2173
                            // Rightmost or an internal
2174
0
                            s->crecs[id2].tlen = -tlen;
2175
0
                        }
2176
2177
0
                        id2 = s->crecs[id2].mate_line;
2178
0
                        while (id2 != id1) {
2179
0
                            s->crecs[id2].tlen = tlen;
2180
0
                            id2 = s->crecs[id2].mate_line;
2181
0
                        }
2182
0
                    } else {
2183
0
                        id1 = id2 = rec;
2184
2185
0
                        s->crecs[id2].tlen = 0;
2186
0
                        id2 = s->crecs[id2].mate_line;
2187
0
                        while (id2 != id1) {
2188
0
                            s->crecs[id2].tlen = 0;
2189
0
                            id2 = s->crecs[id2].mate_line;
2190
0
                        }
2191
0
                    }
2192
0
                }
2193
2194
0
                cr->mate_pos = s->crecs[cr->mate_line].apos;
2195
0
                cr->mate_ref_id = s->crecs[cr->mate_line].ref_id;
2196
2197
                // paired
2198
0
                cr->flags |= BAM_FPAIRED;
2199
2200
                // set mate unmapped if needed
2201
0
                if (s->crecs[cr->mate_line].flags & BAM_FUNMAP) {
2202
0
                    cr->flags |= BAM_FMUNMAP;
2203
0
                    cr->tlen = 0;
2204
0
                }
2205
0
                if (cr->flags & BAM_FUNMAP) {
2206
0
                    cr->tlen = 0;
2207
0
                }
2208
2209
                // set mate reversed if needed
2210
0
                if (s->crecs[cr->mate_line].flags & BAM_FREVERSE)
2211
0
                    cr->flags |= BAM_FMREVERSE;
2212
0
            } else {
2213
0
                hts_log_error("Mate line out of bounds: %d vs [0, %d]",
2214
0
                              cr->mate_line, s->hdr->num_records-1);
2215
0
            }
2216
2217
            /* FIXME: construct read names here too if needed */
2218
0
        } else {
2219
0
            if (cr->mate_flags & CRAM_M_REVERSE) {
2220
0
                cr->flags |= BAM_FPAIRED | BAM_FMREVERSE;
2221
0
            }
2222
0
            if (cr->mate_flags & CRAM_M_UNMAP) {
2223
0
                cr->flags |= BAM_FMUNMAP;
2224
                //cr->mate_ref_id = -1;
2225
0
            }
2226
0
            if (!(cr->flags & BAM_FPAIRED))
2227
0
                cr->mate_ref_id = -1;
2228
0
        }
2229
2230
0
        if (cr->tlen == INT64_MIN)
2231
0
            cr->tlen = 0; // Just incase
2232
0
    }
2233
2234
561
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2235
0
        cram_record *cr = &s->crecs[rec];
2236
0
        if (cr->explicit_tlen != INT64_MIN)
2237
0
            cr->tlen = cr->explicit_tlen;
2238
0
    }
2239
2240
561
    return 0;
2241
561
}
2242
2243
0
static char *md5_print(unsigned char *md5, char *out) {
2244
0
    int i;
2245
0
    for (i = 0; i < 16; i++) {
2246
0
        out[i*2+0] = "0123456789abcdef"[md5[i]>>4];
2247
0
        out[i*2+1] = "0123456789abcdef"[md5[i]&15];
2248
0
    }
2249
0
    out[32] = 0;
2250
2251
0
    return out;
2252
0
}
2253
2254
/*
2255
 * Utility function to decode tlen (ISIZE), as it's called
2256
 * in multiple places.
2257
 *
2258
 * Returns codec return value (0 on success).
2259
 */
2260
static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s,
2261
0
                            cram_block *blk, int64_t *tlen) {
2262
0
    int out_sz = 1, r = 0;
2263
2264
0
    if (!c->comp_hdr->codecs[DS_TS]) return -1;
2265
0
    if (CRAM_MAJOR_VERS(fd->version) < 4) {
2266
0
        int32_t i32;
2267
0
        r |= c->comp_hdr->codecs[DS_TS]
2268
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2269
0
                     (char *)&i32, &out_sz);
2270
0
        *tlen = i32;
2271
0
    } else {
2272
0
        r |= c->comp_hdr->codecs[DS_TS]
2273
0
            ->decode(s, c->comp_hdr->codecs[DS_TS], blk,
2274
0
                     (char *)tlen, &out_sz);
2275
0
    }
2276
0
    return r;
2277
0
}
2278
2279
/*
2280
 * Decode an entire slice from container blocks. Fills out s->crecs[] array.
2281
 * Returns 0 on success
2282
 *        -1 on failure
2283
 */
2284
int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s,
2285
570
                      sam_hdr_t *sh) {
2286
570
    cram_block *blk = s->block[0];
2287
570
    int32_t bf, ref_id;
2288
570
    unsigned char cf;
2289
570
    int out_sz, r = 0;
2290
570
    int rec;
2291
570
    char *seq = NULL, *qual = NULL;
2292
570
    int unknown_rg = -1;
2293
570
    int embed_ref;
2294
570
    char **refs = NULL;
2295
570
    uint32_t ds;
2296
570
    sam_hrecs_t *bfd = sh->hrecs;
2297
2298
570
    if (cram_dependent_data_series(fd, c->comp_hdr, s) != 0)
2299
9
        return -1;
2300
2301
561
    ds = s->data_series;
2302
2303
561
    blk->bit = 7; // MSB first
2304
2305
    // Study the blocks and estimate approx sizes to preallocate.
2306
    // This looks to speed up decoding by around 8-9%.
2307
    // We can always shrink back down at the end if we overestimated.
2308
    // However it's likely that this also saves memory as own growth
2309
    // factor (*=1.5) is never applied.
2310
561
    {
2311
561
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2312
561
        int qsize=0, nsize=0, q_id=0;
2313
#else
2314
        int qsize, nsize, q_id;
2315
        cram_decode_estimate_sizes(c->comp_hdr, s, &qsize, &nsize, &q_id);
2316
        //fprintf(stderr, "qsize=%d nsize=%d\n", qsize, nsize);
2317
#endif
2318
2319
561
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->seqs_blk, qsize+1);
2320
561
        if (qsize && (ds & CRAM_RL)) BLOCK_RESIZE_EXACT(s->qual_blk, qsize+1);
2321
561
        if (nsize && (ds & CRAM_NS)) BLOCK_RESIZE_EXACT(s->name_blk, nsize+1);
2322
2323
        // To do - consider using q_id here to usurp the quality block and
2324
        // avoid a memcpy during decode.
2325
        // Specifically when quality is an external block uniquely used by
2326
        // DS_QS only, then we can set s->qual_blk directly to this
2327
        // block and save the codec->decode() calls. (Approx 3% cpu saving)
2328
561
    }
2329
2330
    /* Look for unknown RG, added as last by Java CRAM? */
2331
561
    if (bfd->nrg > 0 &&
2332
480
        bfd->rg[bfd->nrg-1].name != NULL &&
2333
480
        !strcmp(bfd->rg[bfd->nrg-1].name, "UNKNOWN"))
2334
0
        unknown_rg = bfd->nrg-1;
2335
2336
561
    if (blk->content_type != CORE)
2337
0
        return -1;
2338
2339
561
    if (s->crecs)
2340
0
        free(s->crecs);
2341
561
    if (!(s->crecs = malloc(s->hdr->num_records * sizeof(*s->crecs))))
2342
0
        return -1;
2343
2344
561
    ref_id = s->hdr->ref_seq_id;
2345
561
    if (CRAM_MAJOR_VERS(fd->version) < 4)
2346
561
       embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0;
2347
0
    else
2348
0
       embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0;
2349
2350
561
    if (ref_id >= 0) {
2351
561
        if (embed_ref) {
2352
561
            cram_block *b;
2353
561
            if (s->hdr->ref_base_id < 0) {
2354
0
                hts_log_error("No reference specified and no embedded reference is available"
2355
0
                              " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2356
0
                              s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2357
0
                return -1;
2358
0
            }
2359
561
            b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2360
561
            if (!b)
2361
0
                return -1;
2362
561
            if (cram_uncompress_block(b) != 0)
2363
0
                return -1;
2364
561
            s->ref = (char *)BLOCK_DATA(b);
2365
561
            s->ref_start = s->hdr->ref_seq_start;
2366
561
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2367
561
            if (s->hdr->ref_seq_span > b->uncomp_size) {
2368
0
                hts_log_error("Embedded reference is too small at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2369
0
                              ref_id, s->ref_start, s->ref_end);
2370
0
                return -1;
2371
0
            }
2372
561
        } else if (!c->comp_hdr->no_ref) {
2373
            //// Avoid Java cramtools bug by loading entire reference seq
2374
            //s->ref = cram_get_ref(fd, s->hdr->ref_seq_id, 1, 0);
2375
            //s->ref_start = 1;
2376
2377
0
            if (fd->required_fields & SAM_SEQ) {
2378
0
                s->ref =
2379
0
                cram_get_ref(fd, s->hdr->ref_seq_id,
2380
0
                             s->hdr->ref_seq_start,
2381
0
                             s->hdr->ref_seq_start + s->hdr->ref_seq_span -1);
2382
0
            }
2383
0
            s->ref_start = s->hdr->ref_seq_start;
2384
0
            s->ref_end   = s->hdr->ref_seq_start + s->hdr->ref_seq_span-1;
2385
2386
            /* Sanity check */
2387
0
            if (s->ref_start < 0) {
2388
0
                hts_log_warning("Slice starts before base 1"
2389
0
                                " at #%d:%"PRId64"-%"PRId64, ref_id, s->hdr->ref_seq_start,
2390
0
                                s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2391
0
                s->ref_start = 0;
2392
0
            }
2393
0
            pthread_mutex_lock(&fd->ref_lock);
2394
0
            pthread_mutex_lock(&fd->refs->lock);
2395
0
            if ((fd->required_fields & SAM_SEQ) &&
2396
0
                ref_id < fd->refs->nref && fd->refs->ref_id &&
2397
0
                s->ref_end > fd->refs->ref_id[ref_id]->length) {
2398
0
                s->ref_end = fd->refs->ref_id[ref_id]->length;
2399
0
            }
2400
0
            pthread_mutex_unlock(&fd->refs->lock);
2401
0
            pthread_mutex_unlock(&fd->ref_lock);
2402
0
        }
2403
561
    }
2404
2405
561
    if ((fd->required_fields & SAM_SEQ) &&
2406
561
        s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) {
2407
0
        hts_log_error("Unable to fetch reference %s:%"PRId64"-%"PRId64,
2408
0
                      fd->refs->ref_id && ref_id >= 0 && ref_id < fd->refs->nref
2409
0
                      ? fd->refs->ref_id[ref_id]->name
2410
0
                      : "unknown",
2411
0
                      s->hdr->ref_seq_start,
2412
0
                      s->hdr->ref_seq_start + s->hdr->ref_seq_span-1);
2413
0
        return -1;
2414
0
    }
2415
2416
561
    if (CRAM_MAJOR_VERS(fd->version) != 1
2417
0
        && (fd->required_fields & SAM_SEQ)
2418
0
        && s->hdr->ref_seq_id >= 0
2419
0
        && !fd->ignore_md5
2420
0
        && memcmp(s->hdr->md5, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 16)) {
2421
0
        hts_md5_context *md5;
2422
0
        unsigned char digest[16];
2423
2424
0
        if (s->ref && s->hdr->ref_seq_id >= 0) {
2425
0
            int start, len;
2426
2427
0
            if (s->hdr->ref_seq_start >= s->ref_start) {
2428
0
                start = s->hdr->ref_seq_start - s->ref_start;
2429
0
            } else {
2430
0
                hts_log_warning("Slice starts before base 1 at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2431
0
                                ref_id, s->ref_start, s->ref_end);
2432
0
                start = 0;
2433
0
            }
2434
2435
0
            if (s->hdr->ref_seq_span <= s->ref_end - s->ref_start + 1) {
2436
0
                len = s->hdr->ref_seq_span;
2437
0
            } else {
2438
0
                hts_log_warning("Slice ends beyond reference end at #%d:%"PRIhts_pos"-%"PRIhts_pos,
2439
0
                                ref_id, s->ref_start, s->ref_end);
2440
0
                len = s->ref_end - s->ref_start + 1;
2441
0
            }
2442
2443
0
            if (!(md5 = hts_md5_init()))
2444
0
                return -1;
2445
0
            if (start + len > s->ref_end - s->ref_start + 1)
2446
0
                len = s->ref_end - s->ref_start + 1 - start;
2447
0
            if (len >= 0)
2448
0
                hts_md5_update(md5, s->ref + start, len);
2449
0
            hts_md5_final(digest, md5);
2450
0
            hts_md5_destroy(md5);
2451
0
        } else if (!s->ref && s->hdr->ref_base_id >= 0) {
2452
0
            cram_block *b = cram_get_block_by_id(s, s->hdr->ref_base_id);
2453
0
            if (b) {
2454
0
                if (!(md5 = hts_md5_init()))
2455
0
                    return -1;
2456
0
                hts_md5_update(md5, b->data, b->uncomp_size);
2457
0
                hts_md5_final(digest, md5);
2458
0
                hts_md5_destroy(md5);
2459
0
            }
2460
0
        }
2461
2462
0
        if (!c->comp_hdr->no_ref &&
2463
0
            ((!s->ref && s->hdr->ref_base_id < 0)
2464
0
             || memcmp(digest, s->hdr->md5, 16) != 0)) {
2465
0
            char M[33];
2466
0
            const char *rname = sam_hdr_tid2name(sh, ref_id);
2467
0
            if (!rname) rname="?"; // cannot happen normally
2468
0
            hts_log_error("MD5 checksum reference mismatch at %s:%"PRIhts_pos"-%"PRIhts_pos,
2469
0
                          rname, s->ref_start, s->ref_end);
2470
0
            hts_log_error("CRAM  : %s", md5_print(s->hdr->md5, M));
2471
0
            hts_log_error("Ref   : %s", md5_print(digest, M));
2472
0
            kstring_t ks = KS_INITIALIZE;
2473
0
            if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0)
2474
0
                hts_log_error("@SQ M5: %s", ks.s);
2475
0
            hts_log_error("Please check the reference given is correct");
2476
0
            ks_free(&ks);
2477
0
            return -1;
2478
0
        }
2479
0
    }
2480
2481
561
    if (ref_id == -2) {
2482
0
        pthread_mutex_lock(&fd->ref_lock);
2483
0
        pthread_mutex_lock(&fd->refs->lock);
2484
0
        refs = calloc(fd->refs->nref, sizeof(char *));
2485
0
        pthread_mutex_unlock(&fd->refs->lock);
2486
0
        pthread_mutex_unlock(&fd->ref_lock);
2487
0
        if (!refs)
2488
0
            return -1;
2489
0
    }
2490
2491
561
    int last_ref_id = -9; // Arbitrary -ve marker for not-yet-set
2492
561
    for (rec = 0; rec < s->hdr->num_records; rec++) {
2493
0
        cram_record *cr = &s->crecs[rec];
2494
0
        int has_MD, has_NM;
2495
2496
        //fprintf(stderr, "Decode seq %d, %d/%d\n", rec, blk->byte, blk->bit);
2497
2498
0
        cr->s = s;
2499
2500
0
        out_sz = 1; /* decode 1 item */
2501
0
        if (ds & CRAM_BF) {
2502
0
            if (!c->comp_hdr->codecs[DS_BF]) goto block_err;
2503
0
            r |= c->comp_hdr->codecs[DS_BF]
2504
0
                            ->decode(s, c->comp_hdr->codecs[DS_BF], blk,
2505
0
                                     (char *)&bf, &out_sz);
2506
0
            if (r || bf < 0 ||
2507
0
                bf >= sizeof(fd->bam_flag_swap)/sizeof(*fd->bam_flag_swap))
2508
0
                goto block_err;
2509
0
            bf = fd->bam_flag_swap[bf];
2510
0
            cr->flags = bf;
2511
0
        } else {
2512
0
            cr->flags = bf = 0x4; // unmapped
2513
0
        }
2514
2515
0
        if (ds & CRAM_CF) {
2516
0
            if (CRAM_MAJOR_VERS(fd->version) == 1) {
2517
                /* CF is byte in 1.0, int32 in 2.0 */
2518
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2519
0
                r |= c->comp_hdr->codecs[DS_CF]
2520
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2521
0
                                         (char *)&cf, &out_sz);
2522
0
                if (r) goto block_err;
2523
0
                cr->cram_flags = cf;
2524
0
            } else {
2525
0
                if (!c->comp_hdr->codecs[DS_CF]) goto block_err;
2526
0
                r |= c->comp_hdr->codecs[DS_CF]
2527
0
                                ->decode(s, c->comp_hdr->codecs[DS_CF], blk,
2528
0
                                         (char *)&cr->cram_flags, &out_sz);
2529
0
                if (r) goto block_err;
2530
0
                cf = cr->cram_flags;
2531
0
            }
2532
0
        } else {
2533
0
            cf = cr->cram_flags = 0;
2534
0
        }
2535
2536
0
        if (CRAM_MAJOR_VERS(fd->version) != 1 && ref_id == -2) {
2537
0
            if (ds & CRAM_RI) {
2538
0
                if (!c->comp_hdr->codecs[DS_RI]) goto block_err;
2539
0
                r |= c->comp_hdr->codecs[DS_RI]
2540
0
                                ->decode(s, c->comp_hdr->codecs[DS_RI], blk,
2541
0
                                         (char *)&cr->ref_id, &out_sz);
2542
0
                if (r) goto block_err;
2543
0
                if ((fd->required_fields & (SAM_SEQ|SAM_TLEN))
2544
0
                    && cr->ref_id >= 0
2545
0
                    && cr->ref_id != last_ref_id) {
2546
0
                    if (!c->comp_hdr->no_ref) {
2547
                        // Range(fd):  seq >= 0, unmapped -1, unspecified   -2
2548
                        // Slice(s):   seq >= 0, unmapped -1, multiple refs -2
2549
                        // Record(cr): seq >= 0, unmapped -1
2550
0
                        pthread_mutex_lock(&fd->range_lock);
2551
0
                        int need_ref = (fd->range.refid == -2 || cr->ref_id == fd->range.refid);
2552
0
                        pthread_mutex_unlock(&fd->range_lock);
2553
0
                        if  (need_ref) {
2554
0
                            if (!refs[cr->ref_id])
2555
0
                                refs[cr->ref_id] = cram_get_ref(fd, cr->ref_id, 1, 0);
2556
0
                            if (!(s->ref = refs[cr->ref_id]))
2557
0
                                goto block_err;
2558
0
                        } else {
2559
                            // For multi-ref containers, we don't need to fetch all
2560
                            // refs if we're only querying one.
2561
0
                            s->ref = NULL;
2562
0
                        }
2563
2564
0
                        pthread_mutex_lock(&fd->range_lock);
2565
0
                        int discard_last_ref = (last_ref_id >= 0 &&
2566
0
                                                refs[last_ref_id] &&
2567
0
                                                (fd->range.refid == -2 ||
2568
0
                                                 last_ref_id == fd->range.refid));
2569
0
                        pthread_mutex_unlock(&fd->range_lock);
2570
0
                        if (discard_last_ref) {
2571
0
                            pthread_mutex_lock(&fd->ref_lock);
2572
0
                            discard_last_ref = !fd->unsorted;
2573
0
                            pthread_mutex_unlock(&fd->ref_lock);
2574
0
                        }
2575
0
                        if (discard_last_ref) {
2576
0
                            cram_ref_decr(fd->refs, last_ref_id);
2577
0
                            refs[last_ref_id] = NULL;
2578
0
                        }
2579
0
                    }
2580
0
                    s->ref_start = 1;
2581
0
                    pthread_mutex_lock(&fd->ref_lock);
2582
0
                    pthread_mutex_lock(&fd->refs->lock);
2583
0
                    s->ref_end = fd->refs->ref_id[cr->ref_id]->length;
2584
0
                    pthread_mutex_unlock(&fd->refs->lock);
2585
0
                    pthread_mutex_unlock(&fd->ref_lock);
2586
2587
0
                    last_ref_id = cr->ref_id;
2588
0
                }
2589
0
            } else {
2590
0
                cr->ref_id = -1;
2591
0
            }
2592
0
        } else {
2593
0
            cr->ref_id = ref_id; // Forced constant in CRAM 1.0
2594
0
        }
2595
0
        if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) {
2596
0
            hts_log_error("Requested unknown reference ID %d", cr->ref_id);
2597
0
            goto block_err;
2598
0
        }
2599
2600
0
        if (ds & CRAM_RL) {
2601
0
            if (!c->comp_hdr->codecs[DS_RL]) goto block_err;
2602
0
            r |= c->comp_hdr->codecs[DS_RL]
2603
0
                            ->decode(s, c->comp_hdr->codecs[DS_RL], blk,
2604
0
                                     (char *)&cr->len, &out_sz);
2605
0
            if (r) goto block_err;
2606
0
            if (cr->len < 0) {
2607
0
                hts_log_error("Read has negative length");
2608
0
                goto block_err;
2609
0
            }
2610
0
        }
2611
2612
0
        if (ds & CRAM_AP) {
2613
0
            if (!c->comp_hdr->codecs[DS_AP]) goto block_err;
2614
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
2615
0
                r |= c->comp_hdr->codecs[DS_AP]
2616
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2617
0
                                         (char *)&cr->apos, &out_sz);
2618
0
            } else  {
2619
0
                int32_t i32;
2620
0
                r |= c->comp_hdr->codecs[DS_AP]
2621
0
                                ->decode(s, c->comp_hdr->codecs[DS_AP], blk,
2622
0
                                         (char *)&i32, &out_sz);
2623
0
                cr->apos = i32;
2624
0
            }
2625
0
            if (r) goto block_err;;
2626
0
            if (c->comp_hdr->AP_delta) {
2627
0
                if (cr->apos < 0 && c->unsorted == 0) {
2628
                    // cache locally in c->unsorted so we don't have an
2629
                    // excessive number of locks
2630
0
                    pthread_mutex_lock(&fd->ref_lock);
2631
0
                    c->unsorted = fd->unsorted = 1;
2632
0
                    pthread_mutex_unlock(&fd->ref_lock);
2633
0
                }
2634
0
                cr->apos += s->last_apos;
2635
0
            }
2636
0
            s->last_apos=  cr->apos;
2637
0
        } else {
2638
0
            cr->apos = c->ref_seq_start;
2639
0
        }
2640
2641
0
        if (ds & CRAM_RG) {
2642
0
            if (!c->comp_hdr->codecs[DS_RG]) goto block_err;
2643
0
            r |= c->comp_hdr->codecs[DS_RG]
2644
0
                           ->decode(s, c->comp_hdr->codecs[DS_RG], blk,
2645
0
                                    (char *)&cr->rg, &out_sz);
2646
0
            if (r) goto block_err;
2647
0
            if (cr->rg == unknown_rg)
2648
0
                cr->rg = -1;
2649
0
        } else {
2650
0
            cr->rg = -1;
2651
0
        }
2652
2653
0
        cr->name_len = 0;
2654
2655
0
        if (c->comp_hdr->read_names_included) {
2656
0
            int32_t out_sz2 = 1; // block auto grows in decode()
2657
2658
            // Read directly into name cram_block
2659
0
            cr->name = BLOCK_SIZE(s->name_blk);
2660
0
            if (ds & CRAM_RN) {
2661
0
                if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2662
0
                r |= c->comp_hdr->codecs[DS_RN]
2663
0
                                ->decode(s, c->comp_hdr->codecs[DS_RN], blk,
2664
0
                                         (char *)s->name_blk, &out_sz2);
2665
0
                if (r) goto block_err;
2666
0
                cr->name_len = out_sz2;
2667
0
            }
2668
0
        }
2669
2670
0
        cr->mate_pos = 0;
2671
0
        cr->mate_line = -1;
2672
0
        cr->mate_ref_id = -1;
2673
0
        cr->explicit_tlen = INT64_MIN;
2674
0
        if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) {
2675
0
            if (ds & CRAM_MF) {
2676
0
                if (CRAM_MAJOR_VERS(fd->version) == 1) {
2677
                    /* MF is byte in 1.0, int32 in 2.0 */
2678
0
                    unsigned char mf;
2679
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2680
0
                    r |= c->comp_hdr->codecs[DS_MF]
2681
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2682
0
                                             blk, (char *)&mf, &out_sz);
2683
0
                    if (r) goto block_err;
2684
0
                    cr->mate_flags = mf;
2685
0
                } else {
2686
0
                    if (!c->comp_hdr->codecs[DS_MF]) goto block_err;
2687
0
                    r |= c->comp_hdr->codecs[DS_MF]
2688
0
                                    ->decode(s, c->comp_hdr->codecs[DS_MF],
2689
0
                                             blk,
2690
0
                                             (char *)&cr->mate_flags,
2691
0
                                             &out_sz);
2692
0
                    if (r) goto block_err;
2693
0
                }
2694
0
            } else {
2695
0
                cr->mate_flags = 0;
2696
0
            }
2697
2698
0
            if (!c->comp_hdr->read_names_included) {
2699
0
                int32_t out_sz2 = 1;
2700
2701
                // Read directly into name cram_block
2702
0
                cr->name = BLOCK_SIZE(s->name_blk);
2703
0
                if (ds & CRAM_RN) {
2704
0
                    if (!c->comp_hdr->codecs[DS_RN]) goto block_err;
2705
0
                    r |= c->comp_hdr->codecs[DS_RN]
2706
0
                                    ->decode(s, c->comp_hdr->codecs[DS_RN],
2707
0
                                             blk, (char *)s->name_blk,
2708
0
                                             &out_sz2);
2709
0
                    if (r) goto block_err;
2710
0
                    cr->name_len = out_sz2;
2711
0
                }
2712
0
            }
2713
2714
0
            if (ds & CRAM_NS) {
2715
0
                if (!c->comp_hdr->codecs[DS_NS]) goto block_err;
2716
0
                r |= c->comp_hdr->codecs[DS_NS]
2717
0
                                ->decode(s, c->comp_hdr->codecs[DS_NS], blk,
2718
0
                                         (char *)&cr->mate_ref_id, &out_sz);
2719
0
                if (r) goto block_err;
2720
0
            }
2721
2722
            // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown.
2723
            // if (cr->mate_ref_id == -1 && cr->flags & 0x01) {
2724
            //     /* Paired, but unmapped */
2725
            //     cr->flags |= BAM_FMUNMAP;
2726
            // }
2727
2728
0
            if (ds & CRAM_NP) {
2729
0
                if (!c->comp_hdr->codecs[DS_NP]) goto block_err;;
2730
0
                if (CRAM_MAJOR_VERS(fd->version) < 4) {
2731
0
                    int32_t i32;
2732
0
                    r |= c->comp_hdr->codecs[DS_NP]
2733
0
                                    ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2734
0
                                             (char *)&i32, &out_sz);
2735
0
                    cr->mate_pos = i32;
2736
0
                } else {
2737
0
                    r |= c->comp_hdr->codecs[DS_NP]
2738
0
                        ->decode(s, c->comp_hdr->codecs[DS_NP], blk,
2739
0
                                 (char *)&cr->mate_pos, &out_sz);
2740
0
                }
2741
0
                if (r) goto block_err;
2742
0
            }
2743
2744
0
            if (ds & CRAM_TS) {
2745
0
                if (!c->comp_hdr->codecs[DS_TS]) goto block_err;
2746
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->tlen);
2747
0
                if (r) goto block_err;
2748
0
            } else {
2749
0
                cr->tlen = INT64_MIN;
2750
0
            }
2751
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) {
2752
            // else not detached
2753
0
            if (ds & CRAM_NF) {
2754
0
                if (!c->comp_hdr->codecs[DS_NF]) goto block_err;
2755
0
                r |= c->comp_hdr->codecs[DS_NF]
2756
0
                                ->decode(s, c->comp_hdr->codecs[DS_NF], blk,
2757
0
                                         (char *)&cr->mate_line, &out_sz);
2758
0
                if (r) goto block_err;
2759
0
                cr->mate_line += rec + 1;
2760
2761
                //cr->name_len = sprintf(name, "%d", name_id++);
2762
                //cr->name = DSTRING_LEN(name_ds);
2763
                //dstring_nappend(name_ds, name, cr->name_len);
2764
2765
0
                cr->mate_ref_id = -1;
2766
0
                cr->tlen = INT64_MIN;
2767
0
                cr->mate_pos = 0;
2768
0
            } else  {
2769
0
                cr->mate_flags = 0;
2770
0
                cr->tlen = INT64_MIN;
2771
0
            }
2772
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2773
0
                if (ds & CRAM_TS) {
2774
0
                    r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2775
0
                    if (r) return r;
2776
0
                } else {
2777
0
                    cr->mate_flags = 0;
2778
0
                    cr->tlen = INT64_MIN;
2779
0
                }
2780
0
            }
2781
0
        } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) {
2782
0
            if (ds & CRAM_TS) {
2783
0
                r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen);
2784
0
                if (r) return r;
2785
0
            } else {
2786
0
                cr->mate_flags = 0;
2787
0
                cr->tlen = INT64_MIN;
2788
0
            }
2789
0
        } else {
2790
0
            cr->mate_flags = 0;
2791
0
            cr->tlen = INT64_MIN;
2792
0
        }
2793
        /*
2794
        else if (!name[0]) {
2795
            //name[0] = '?'; name[1] = 0;
2796
            //cr->name_len = 1;
2797
            //cr->name=  DSTRING_LEN(s->name_ds);
2798
            //dstring_nappend(s->name_ds, "?", 1);
2799
2800
            cr->mate_ref_id = -1;
2801
            cr->tlen = 0;
2802
            cr->mate_pos = 0;
2803
        }
2804
        */
2805
2806
        /* Auxiliary tags */
2807
0
        has_MD = has_NM = 0;
2808
0
        if (CRAM_MAJOR_VERS(fd->version) == 1)
2809
0
            r |= cram_decode_aux_1_0(c, s, blk, cr);
2810
0
        else
2811
0
            r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM);
2812
0
        if (r) goto block_err;
2813
2814
        /* Fake up dynamic string growth and appending */
2815
0
        if (ds & CRAM_RL) {
2816
0
            cr->seq = BLOCK_SIZE(s->seqs_blk);
2817
0
            BLOCK_RESIZE(s->seqs_blk, cr->seq + cr->len);
2818
0
            seq = (char *)BLOCK_END(s->seqs_blk);
2819
0
            BLOCK_SIZE(s->seqs_blk) += cr->len;
2820
2821
0
            if (!seq)
2822
0
                goto block_err;
2823
2824
0
            cr->qual = BLOCK_SIZE(s->qual_blk);
2825
0
            BLOCK_RESIZE(s->qual_blk, cr->qual + cr->len);
2826
0
            qual = (char *)BLOCK_END(s->qual_blk);
2827
0
            BLOCK_SIZE(s->qual_blk) += cr->len;
2828
2829
0
            if (!s->ref)
2830
0
                memset(seq, '=', cr->len);
2831
0
        }
2832
2833
0
        if (!(bf & BAM_FUNMAP)) {
2834
0
            if ((ds & CRAM_AP) && cr->apos <= 0) {
2835
0
                hts_log_error("Read has alignment position %"PRId64
2836
0
                              " but no unmapped flag",
2837
0
                              cr->apos);
2838
0
                goto block_err;
2839
0
            }
2840
            /* Decode sequence and generate CIGAR */
2841
0
            if (ds & (CRAM_SEQ | CRAM_MQ)) {
2842
0
                r |= cram_decode_seq(fd, c, s, blk, cr, sh, cf, seq, qual,
2843
0
                                     has_MD, has_NM);
2844
0
                if (r) goto block_err;
2845
0
            } else {
2846
0
                cr->cigar = 0;
2847
0
                cr->ncigar = 0;
2848
0
                cr->aend = cr->apos;
2849
0
                cr->mqual = 0;
2850
0
            }
2851
0
        } else {
2852
0
            int out_sz2 = cr->len;
2853
2854
            //puts("Unmapped");
2855
0
            cr->cigar = 0;
2856
0
            cr->ncigar = 0;
2857
0
            cr->aend = cr->apos;
2858
0
            cr->mqual = 0;
2859
2860
0
            if (ds & CRAM_BA && cr->len) {
2861
0
                if (!c->comp_hdr->codecs[DS_BA]) goto block_err;
2862
0
                r |= c->comp_hdr->codecs[DS_BA]
2863
0
                                ->decode(s, c->comp_hdr->codecs[DS_BA], blk,
2864
0
                                         (char *)seq, &out_sz2);
2865
0
                if (r) goto block_err;
2866
0
            }
2867
2868
0
            if ((ds & CRAM_CF) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
2869
0
                out_sz2 = cr->len;
2870
0
                if (ds & CRAM_QS && cr->len >= 0) {
2871
0
                    if (!c->comp_hdr->codecs[DS_QS]) goto block_err;
2872
0
                    r |= c->comp_hdr->codecs[DS_QS]
2873
0
                                    ->decode(s, c->comp_hdr->codecs[DS_QS],
2874
0
                                             blk, qual, &out_sz2);
2875
0
                    if (r) goto block_err;
2876
0
                }
2877
0
            } else {
2878
0
                if (ds & CRAM_RL)
2879
0
                    memset(qual, 255, cr->len);
2880
0
            }
2881
0
        }
2882
2883
0
        if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) {
2884
0
            int i, j;
2885
0
            for (i = 0, j = cr->len-1; i < j; i++, j--) {
2886
0
                unsigned char c;
2887
0
                c = qual[i];
2888
0
                qual[i] = qual[j];
2889
0
                qual[j] = c;
2890
0
            }
2891
0
        }
2892
0
    }
2893
2894
561
    pthread_mutex_lock(&fd->ref_lock);
2895
561
    if (refs) {
2896
0
        int i;
2897
0
        for (i = 0; i < fd->refs->nref; i++) {
2898
0
            if (refs[i])
2899
0
                cram_ref_decr(fd->refs, i);
2900
0
        }
2901
0
        free(refs);
2902
0
        refs = NULL;
2903
561
    } else if (ref_id >= 0 && s->ref != fd->ref_free && !embed_ref) {
2904
0
        cram_ref_decr(fd->refs, ref_id);
2905
0
    }
2906
561
    pthread_mutex_unlock(&fd->ref_lock);
2907
2908
    /* Resolve mate pair cross-references between recs within this slice */
2909
561
    r |= cram_decode_slice_xref(s, fd->required_fields);
2910
2911
    // Free the original blocks as we no longer need these.
2912
561
    {
2913
561
        int i;
2914
2.80k
        for (i = 0; i < s->hdr->num_blocks; i++) {
2915
2.24k
            cram_block *b = s->block[i];
2916
2.24k
            cram_free_block(b);
2917
2.24k
            s->block[i] = NULL;
2918
2.24k
        }
2919
561
    }
2920
2921
    // Also see initial BLOCK_RESIZE_EXACT at top of function.
2922
    // As we grow blocks we overallocate by up to 50%. So shrink
2923
    // back to their final sizes here.
2924
    //
2925
    //fprintf(stderr, "%d %d // %d %d // %d %d // %d %d\n",
2926
    //      (int)s->seqs_blk->byte, (int)s->seqs_blk->alloc,
2927
    //      (int)s->qual_blk->byte, (int)s->qual_blk->alloc,
2928
    //      (int)s->name_blk->byte, (int)s->name_blk->alloc,
2929
    //      (int)s->aux_blk->byte,  (int)s->aux_blk->alloc);
2930
561
    BLOCK_RESIZE_EXACT(s->seqs_blk, BLOCK_SIZE(s->seqs_blk)+1);
2931
561
    BLOCK_RESIZE_EXACT(s->qual_blk, BLOCK_SIZE(s->qual_blk)+1);
2932
561
    BLOCK_RESIZE_EXACT(s->name_blk, BLOCK_SIZE(s->name_blk)+1);
2933
561
    BLOCK_RESIZE_EXACT(s->aux_blk,  BLOCK_SIZE(s->aux_blk)+1);
2934
2935
561
    return r;
2936
2937
0
 block_err:
2938
0
    if (refs) {
2939
0
        int i;
2940
0
        pthread_mutex_lock(&fd->ref_lock);
2941
0
        for (i = 0; i < fd->refs->nref; i++) {
2942
0
            if (refs[i])
2943
0
                cram_ref_decr(fd->refs, i);
2944
0
        }
2945
0
        free(refs);
2946
0
        pthread_mutex_unlock(&fd->ref_lock);
2947
0
    }
2948
2949
0
    return -1;
2950
561
}
2951
2952
typedef struct {
2953
    cram_fd *fd;
2954
    cram_container *c;
2955
    cram_slice *s;
2956
    sam_hdr_t *h;
2957
    int exit_code;
2958
} cram_decode_job;
2959
2960
0
void *cram_decode_slice_thread(void *arg) {
2961
0
    cram_decode_job *j = (cram_decode_job *)arg;
2962
2963
0
    j->exit_code = cram_decode_slice(j->fd, j->c, j->s, j->h);
2964
2965
0
    return j;
2966
0
}
2967
2968
/*
2969
 * Spawn a multi-threaded version of cram_decode_slice().
2970
 */
2971
int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s,
2972
570
                         sam_hdr_t *bfd) {
2973
570
    cram_decode_job *j;
2974
570
    int nonblock;
2975
2976
570
    if (!fd->pool)
2977
570
        return cram_decode_slice(fd, c, s, bfd);
2978
2979
0
    if (!(j = malloc(sizeof(*j))))
2980
0
        return -1;
2981
2982
0
    j->fd = fd;
2983
0
    j->c  = c;
2984
0
    j->s  = s;
2985
0
    j->h  = bfd;
2986
2987
0
    nonblock = hts_tpool_process_sz(fd->rqueue) ? 1 : 0;
2988
2989
0
    int saved_errno = errno;
2990
0
    errno = 0;
2991
0
    if (-1 == hts_tpool_dispatch2(fd->pool, fd->rqueue, cram_decode_slice_thread,
2992
0
                                  j, nonblock)) {
2993
        /* Would block */
2994
0
        if (errno != EAGAIN)
2995
0
            return -1;
2996
0
        fd->job_pending = j;
2997
0
    } else {
2998
0
        fd->job_pending = NULL;
2999
0
    }
3000
0
    errno = saved_errno;
3001
3002
    // flush too
3003
0
    return 0;
3004
0
}
3005
3006
3007
/* ----------------------------------------------------------------------
3008
 * CRAM sequence iterators.
3009
 */
3010
3011
/*
3012
 * Converts a cram in-memory record into a bam in-memory record. We
3013
 * pass a pointer to a bam_seq_t pointer along with the a pointer to
3014
 * the allocated size. These can initially be pointers to NULL and zero.
3015
 *
3016
 * This function will reallocate the bam buffer as required and update
3017
 * (*bam)->alloc accordingly, allowing it to be used within a loop
3018
 * efficiently without needing to allocate new bam objects over and
3019
 * over again.
3020
 *
3021
 * Returns the used size of the bam record on success
3022
 *         -1 on failure.
3023
 */
3024
int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s,
3025
0
                cram_record *cr, int rec, bam_seq_t **bam) {
3026
0
    int ret, rg_len;
3027
0
    char name_a[1024], *name;
3028
0
    int name_len;
3029
0
    char *aux;
3030
0
    char *seq, *qual;
3031
0
    sam_hrecs_t *bfd = sh->hrecs;
3032
3033
    /* Assign names if not explicitly set */
3034
0
    if (fd->required_fields & SAM_QNAME) {
3035
0
        if (cr->name_len) {
3036
0
            name = (char *)BLOCK_DATA(s->name_blk) + cr->name;
3037
0
            name_len = cr->name_len;
3038
0
        } else {
3039
0
            name = name_a;
3040
0
            if (cr->mate_line >= 0 && cr->mate_line < s->max_rec &&
3041
0
                s->crecs[cr->mate_line].name_len > 0) {
3042
                // Copy our mate if non-zero.
3043
0
                memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name,
3044
0
                       s->crecs[cr->mate_line].name_len);
3045
0
                name = name_a + s->crecs[cr->mate_line].name_len;
3046
0
            } else {
3047
                // Otherwise generate a name based on prefix
3048
0
                name_len = strlen(fd->prefix);
3049
0
                memcpy(name, fd->prefix, name_len);
3050
0
                name += name_len;
3051
0
                *name++ = ':';
3052
0
                if (cr->mate_line >= 0 && cr->mate_line < rec) {
3053
0
                    name = (char *)append_uint64((unsigned char *)name,
3054
0
                                                 s->hdr->record_counter +
3055
0
                                                 cr->mate_line + 1);
3056
0
                } else {
3057
0
                    name = (char *)append_uint64((unsigned char *)name,
3058
0
                                                 s->hdr->record_counter +
3059
0
                                                 rec + 1);
3060
0
                }
3061
0
            }
3062
0
            name_len = name - name_a;
3063
0
            name = name_a;
3064
0
        }
3065
0
    } else {
3066
0
        name = "?";
3067
0
        name_len = 1;
3068
0
    }
3069
3070
    /* Generate BAM record */
3071
0
    if (cr->rg < -1 || cr->rg >= bfd->nrg)
3072
0
        return -1;
3073
0
    rg_len = (cr->rg != -1) ? bfd->rg[cr->rg].name_len + 4 : 0;
3074
3075
0
    if (fd->required_fields & (SAM_SEQ | SAM_QUAL)) {
3076
0
        if (!BLOCK_DATA(s->seqs_blk))
3077
0
            return -1;
3078
0
        seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
3079
0
    } else {
3080
0
        seq = "*";
3081
0
        cr->len = 0;
3082
0
    }
3083
3084
0
    if (fd->required_fields & SAM_QUAL) {
3085
0
        if (!BLOCK_DATA(s->qual_blk))
3086
0
            return -1;
3087
0
        qual = (char *)BLOCK_DATA(s->qual_blk) + cr->qual;
3088
0
    } else {
3089
0
        qual = NULL;
3090
0
    }
3091
3092
0
    ret = bam_set1(*bam,
3093
0
                   name_len, name,
3094
0
                   cr->flags, cr->ref_id, cr->apos - 1, cr->mqual,
3095
0
                   cr->ncigar, &s->cigar[cr->cigar],
3096
0
                   cr->mate_ref_id, cr->mate_pos - 1, cr->tlen,
3097
0
                   cr->len, seq, qual,
3098
0
                   cr->aux_size + rg_len);
3099
0
    if (ret < 0) {
3100
0
        return ret;
3101
0
    }
3102
3103
0
    aux = (char *)bam_aux(*bam);
3104
3105
    /* Auxiliary strings */
3106
0
    if (cr->aux_size != 0) {
3107
0
        memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size);
3108
0
        aux += cr->aux_size;
3109
0
        (*bam)->l_data += cr->aux_size;
3110
0
    }
3111
3112
    /* RG:Z: */
3113
0
    if (rg_len > 0) {
3114
0
        *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z';
3115
0
        int len = bfd->rg[cr->rg].name_len;
3116
0
        memcpy(aux, bfd->rg[cr->rg].name, len);
3117
0
        aux += len;
3118
0
        *aux++ = 0;
3119
0
        (*bam)->l_data += rg_len;
3120
0
    }
3121
3122
0
    return (*bam)->l_data;
3123
0
}
3124
3125
/*
3126
 * Here be dragons! The multi-threading code in this is crufty beyond belief.
3127
 */
3128
3129
/*
3130
 * Load first container.
3131
 * Called when fd->ctr is NULL>
3132
 *
3133
 * Returns container on success
3134
 *        NULL on failure.
3135
 */
3136
1.07k
static cram_container *cram_first_slice(cram_fd *fd) {
3137
1.07k
    cram_container *c;
3138
3139
8.04k
    do {
3140
8.04k
        if (fd->ctr)
3141
6.97k
            cram_free_container(fd->ctr);
3142
3143
8.04k
        if (!(c = fd->ctr = cram_read_container(fd)))
3144
18
            return NULL;
3145
8.02k
        c->curr_slice_mt = c->curr_slice;
3146
8.02k
    } while (c->length == 0);
3147
3148
    /*
3149
     * The first container may be a result of a sub-range query.
3150
     * In which case it may still not be the optimal starting point
3151
     * due to skipped containers/slices in the index.
3152
     */
3153
    // No need for locks here as we're in the main thread.
3154
1.05k
    if (fd->range.refid != -2) {
3155
0
        while (c->ref_seq_id != -2 &&
3156
0
               (c->ref_seq_id < fd->range.refid ||
3157
0
                (fd->range.refid >= 0 && c->ref_seq_id == fd->range.refid
3158
0
                 && c->ref_seq_start + c->ref_seq_span-1 < fd->range.start))) {
3159
0
            if (0 != cram_seek(fd, c->length, SEEK_CUR))
3160
0
                return NULL;
3161
0
            cram_free_container(fd->ctr);
3162
0
            do {
3163
0
                if (!(c = fd->ctr = cram_read_container(fd)))
3164
0
                    return NULL;
3165
0
            } while (c->length == 0);
3166
0
        }
3167
3168
0
        if (c->ref_seq_id != -2 && c->ref_seq_id != fd->range.refid) {
3169
0
            fd->eof = 1;
3170
0
            return NULL;
3171
0
        }
3172
0
    }
3173
3174
1.05k
    if (!(c->comp_hdr_block = cram_read_block(fd)))
3175
3
        return NULL;
3176
1.05k
    if (c->comp_hdr_block->content_type != COMPRESSION_HEADER)
3177
36
        return NULL;
3178
3179
1.01k
    c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
3180
1.01k
    if (!c->comp_hdr)
3181
957
        return NULL;
3182
57
    if (!c->comp_hdr->AP_delta &&
3183
0
        sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3184
0
        pthread_mutex_lock(&fd->ref_lock);
3185
0
        fd->unsorted = 1;
3186
0
        pthread_mutex_unlock(&fd->ref_lock);
3187
0
    }
3188
3189
57
    return c;
3190
1.01k
}
3191
3192
1.63k
cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) {
3193
1.63k
    cram_container *c_curr;  // container being consumed via cram_get_seq()
3194
1.63k
    cram_slice *s_curr = NULL;
3195
3196
    // Populate the first container if unknown.
3197
1.63k
    if (!(c_curr = fd->ctr)) {
3198
1.07k
        if (!(c_curr = cram_first_slice(fd)))
3199
1.01k
            return NULL;
3200
1.07k
    }
3201
3202
    // Discard previous slice
3203
618
    if ((s_curr = c_curr->slice)) {
3204
561
        c_curr->slice = NULL;
3205
561
        cram_free_slice(s_curr);
3206
561
        s_curr = NULL;
3207
561
    }
3208
3209
    // If we've consumed all slices in this container, also discard
3210
    // the container too.
3211
618
    if (c_curr->curr_slice == c_curr->max_slice) {
3212
48
        if (fd->ctr == c_curr)
3213
48
            fd->ctr = NULL;
3214
48
        if (fd->ctr_mt == c_curr)
3215
45
            fd->ctr_mt = NULL;
3216
48
        cram_free_container(c_curr);
3217
48
        c_curr = NULL;
3218
48
    }
3219
3220
618
    if (!fd->ctr_mt)
3221
102
        fd->ctr_mt = c_curr;
3222
3223
    // Fetch the next slice (and the container if necessary).
3224
    //
3225
    // If single threaded this loop bails out as soon as it finds
3226
    // a slice in range.  In this case c_next and c_curr end up being
3227
    // the same thing.
3228
    //
3229
    // If multi-threaded, we loop until we have filled out
3230
    // thread pool input queue.  Here c_next and c_curr *may* differ, as
3231
    // can fd->ctr and fd->ctr_mt.
3232
618
    for (;;) {
3233
618
        cram_container *c_next = fd->ctr_mt;
3234
618
        cram_slice *s_next = NULL;
3235
3236
        // Next slice; either from the last job we failed to push
3237
        // to the input queue or via more I/O.
3238
618
        if (fd->job_pending) {
3239
0
            cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3240
0
            c_next = j->c;
3241
0
            s_next = j->s;
3242
0
            free(fd->job_pending);
3243
0
            fd->job_pending = NULL;
3244
618
        } else if (!fd->ooc) {
3245
618
        empty_container:
3246
618
            if (!c_next || c_next->curr_slice_mt == c_next->max_slice) {
3247
                // new container
3248
6.10k
                for(;;) {
3249
6.10k
                    if (!(c_next = cram_read_container(fd))) {
3250
9
                        if (fd->pool) {
3251
0
                            fd->ooc = 1;
3252
0
                            break;
3253
0
                        }
3254
3255
9
                        return NULL;
3256
9
                    }
3257
6.09k
                    c_next->curr_slice_mt = c_next->curr_slice;
3258
3259
6.09k
                    if (c_next->length != 0)
3260
39
                        break;
3261
3262
6.06k
                    cram_free_container(c_next);
3263
6.06k
                }
3264
39
                if (fd->ooc)
3265
0
                    break;
3266
3267
//                printf("%p %d:%ld-%ld vs %d:%ld-%ld\n", fd,
3268
//                       c_next->ref_seq_id, c_next->ref_seq_start, c_next->ref_seq_start+c_next->ref_seq_span-1,
3269
//                       fd->range.refid, fd->range.start, fd->range.end);
3270
3271
                /* Skip containers not yet spanning our range */
3272
39
                if (fd->range.refid != -2 && c_next->ref_seq_id != -2) {
3273
                    // ref_id beyond end of range; bail out
3274
0
                    if (c_next->ref_seq_id != fd->range.refid) {
3275
0
                        cram_free_container(c_next);
3276
0
                        fd->ctr_mt = NULL;
3277
0
                        fd->ooc = 1;
3278
0
                        break;
3279
0
                    }
3280
3281
                    // position beyond end of range; bail out
3282
0
                    if (fd->range.refid != -1 &&
3283
0
                        c_next->ref_seq_start > fd->range.end) {
3284
0
                        cram_free_container(c_next);
3285
0
                        fd->ctr_mt = NULL;
3286
0
                        fd->ooc = 1;
3287
0
                        break;
3288
0
                    }
3289
3290
                    // before start of range; skip to next container
3291
0
                    if (fd->range.refid != -1 &&
3292
0
                        c_next->ref_seq_start + c_next->ref_seq_span-1 <
3293
0
                        fd->range.start) {
3294
0
                        c_next->curr_slice_mt = c_next->max_slice;
3295
0
                        cram_seek(fd, c_next->length, SEEK_CUR);
3296
0
                        cram_free_container(c_next);
3297
0
                        c_next = NULL;
3298
0
                        continue;
3299
0
                    }
3300
0
                }
3301
3302
                // Container is valid range, so remember it for restarting
3303
                // this function.
3304
39
                fd->ctr_mt = c_next;
3305
3306
39
                if (!(c_next->comp_hdr_block = cram_read_block(fd)))
3307
9
                    return NULL;
3308
30
                if (c_next->comp_hdr_block->content_type != COMPRESSION_HEADER)
3309
0
                    return NULL;
3310
3311
30
                c_next->comp_hdr =
3312
30
                    cram_decode_compression_header(fd, c_next->comp_hdr_block);
3313
30
                if (!c_next->comp_hdr)
3314
0
                    return NULL;
3315
3316
30
                if (!c_next->comp_hdr->AP_delta &&
3317
0
                    sam_hrecs_sort_order(fd->header->hrecs) != ORDER_COORD) {
3318
0
                    pthread_mutex_lock(&fd->ref_lock);
3319
0
                    fd->unsorted = 1;
3320
0
                    pthread_mutex_unlock(&fd->ref_lock);
3321
0
                }
3322
30
            }
3323
3324
600
            if (c_next->num_records == 0) {
3325
0
                if (fd->ctr == c_next)
3326
0
                    fd->ctr = NULL;
3327
0
                if (c_curr == c_next)
3328
0
                    c_curr = NULL;
3329
0
                if (fd->ctr_mt == c_next)
3330
0
                    fd->ctr_mt = NULL;
3331
0
                cram_free_container(c_next);
3332
0
                c_next = NULL;
3333
0
                goto empty_container;
3334
0
            }
3335
3336
600
            if (!(s_next = c_next->slice = cram_read_slice(fd)))
3337
30
                return NULL;
3338
3339
570
            s_next->slice_num = ++c_next->curr_slice_mt;
3340
570
            s_next->curr_rec = 0;
3341
570
            s_next->max_rec = s_next->hdr->num_records;
3342
3343
570
            s_next->last_apos = s_next->hdr->ref_seq_start;
3344
3345
            // We know the container overlaps our range, but with multi-slice
3346
            // containers we may have slices that do not.  Skip these also.
3347
570
            if (fd->range.refid != -2 && s_next->hdr->ref_seq_id != -2) {
3348
                // ref_id beyond end of range; bail out
3349
0
                if (s_next->hdr->ref_seq_id != fd->range.refid) {
3350
0
                    fd->ooc = 1;
3351
0
                    cram_free_slice(s_next);
3352
0
                    c_next->slice = s_next = NULL;
3353
0
                    break;
3354
0
                }
3355
3356
                // position beyond end of range; bail out
3357
0
                if (fd->range.refid != -1 &&
3358
0
                    s_next->hdr->ref_seq_start > fd->range.end) {
3359
0
                    fd->ooc = 1;
3360
0
                    cram_free_slice(s_next);
3361
0
                    c_next->slice = s_next = NULL;
3362
0
                    break;
3363
0
                }
3364
3365
                // before start of range; skip to next slice
3366
0
                if (fd->range.refid != -1 &&
3367
0
                    s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 <
3368
0
                    fd->range.start) {
3369
0
                    cram_free_slice(s_next);
3370
0
                    c_next->slice = s_next = NULL;
3371
0
                    continue;
3372
0
                }
3373
0
            }
3374
570
        } // end: if (!fd->ooc)
3375
3376
570
        if (!c_next || !s_next)
3377
0
            break;
3378
3379
        // Decode the slice, either right now (non-threaded) or by pushing
3380
        // it to the a decode queue (threaded).
3381
570
        if (cram_decode_slice_mt(fd, c_next, s_next, fd->header) != 0) {
3382
9
            hts_log_error("Failure to decode slice");
3383
9
            cram_free_slice(s_next);
3384
9
            c_next->slice = NULL;
3385
9
            return NULL;
3386
9
        }
3387
3388
        // No thread pool, so don't loop again
3389
561
        if (!fd->pool) {
3390
561
            c_curr = c_next;
3391
561
            s_curr = s_next;
3392
561
            break;
3393
561
        }
3394
3395
        // With thread pool, but we have a job pending so our decode queue
3396
        // is full.
3397
0
        if (fd->job_pending)
3398
0
            break;
3399
3400
        // Otherwise we're threaded with room in the decode input queue, so
3401
        // keep reading slices for decode.
3402
        // Push it a bit far, to qsize in queue rather than pending arrival,
3403
        // as cram tends to be a bit bursty in decode timings.
3404
0
        if (hts_tpool_process_len(fd->rqueue) >
3405
0
            hts_tpool_process_qsize(fd->rqueue))
3406
0
            break;
3407
0
    } // end of for(;;)
3408
3409
3410
    // When not threaded we've already have c_curr and s_curr.
3411
    // Otherwise we need get them by pulling off the decode output queue.
3412
561
    if (fd->pool) {
3413
0
        hts_tpool_result *res;
3414
0
        cram_decode_job *j;
3415
3416
0
        if (fd->ooc && hts_tpool_process_empty(fd->rqueue)) {
3417
0
            fd->eof = 1;
3418
0
            return NULL;
3419
0
        }
3420
3421
0
        res = hts_tpool_next_result_wait(fd->rqueue);
3422
3423
0
        if (!res || !hts_tpool_result_data(res)) {
3424
0
            hts_log_error("Call to hts_tpool_next_result failed");
3425
0
            return NULL;
3426
0
        }
3427
3428
0
        j = (cram_decode_job *)hts_tpool_result_data(res);
3429
0
        c_curr = j->c;
3430
0
        s_curr = j->s;
3431
3432
0
        if (j->exit_code != 0) {
3433
0
            hts_log_error("Slice decode failure");
3434
0
            fd->eof = 0;
3435
0
            hts_tpool_delete_result(res, 1);
3436
0
            return NULL;
3437
0
        }
3438
3439
0
        hts_tpool_delete_result(res, 1);
3440
0
    }
3441
3442
561
    *cp = c_curr;
3443
3444
    // Update current slice being processed (as opposed to current
3445
    // slice in the multi-threaded reahead.
3446
561
    fd->ctr = c_curr;
3447
561
    if (c_curr) {
3448
561
        c_curr->slice = s_curr;
3449
561
        if (s_curr)
3450
561
            c_curr->curr_slice = s_curr->slice_num;
3451
561
    }
3452
561
    if (s_curr)
3453
561
        s_curr->curr_rec = 0;
3454
0
    else
3455
0
        fd->eof = 1;
3456
3457
561
    return s_curr;
3458
561
}
3459
3460
/*
3461
 * Read the next cram record and return it.
3462
 * Note that to decode cram_record the caller will need to look up some data
3463
 * in the current slice, pointed to by fd->ctr->slice. This is valid until
3464
 * the next call to cram_get_seq (which may invalidate it).
3465
 *
3466
 * Returns record pointer on success (do not free)
3467
 *        NULL on failure
3468
 */
3469
1.07k
cram_record *cram_get_seq(cram_fd *fd) {
3470
1.07k
    cram_container *c;
3471
1.07k
    cram_slice *s;
3472
3473
1.63k
    for (;;) {
3474
1.63k
        c = fd->ctr;
3475
1.63k
        if (c && c->slice && c->slice->curr_rec < c->slice->max_rec) {
3476
0
            s = c->slice;
3477
1.63k
        } else {
3478
1.63k
            if (!(s = cram_next_slice(fd, &c)))
3479
1.07k
                return NULL;
3480
561
            continue; /* In case slice contains no records */
3481
1.63k
        }
3482
3483
        // No need to lock here as get_seq is running in the main thread,
3484
        // which is also the same one that does the range modifications.
3485
0
        if (fd->range.refid != -2) {
3486
0
            if (fd->range.refid == -1 && s->crecs[s->curr_rec].ref_id != -1) {
3487
                // Special case when looking for unmapped blocks at end.
3488
                // If these are mixed in with mapped data (c->ref_id == -2)
3489
                // then we need skip until we find the unmapped data, if at all
3490
0
                s->curr_rec++;
3491
0
                continue;
3492
0
            }
3493
0
            if (s->crecs[s->curr_rec].ref_id < fd->range.refid &&
3494
0
                s->crecs[s->curr_rec].ref_id != -1) {
3495
                // Looking for a mapped read, but not there yet.  Special case
3496
                // as -1 (unmapped) shouldn't be considered < refid.
3497
0
                s->curr_rec++;
3498
0
                continue;
3499
0
            }
3500
3501
0
            if (s->crecs[s->curr_rec].ref_id != fd->range.refid) {
3502
0
                fd->eof = 1;
3503
0
                cram_free_slice(s);
3504
0
                c->slice = NULL;
3505
0
                return NULL;
3506
0
            }
3507
3508
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].apos > fd->range.end) {
3509
0
                fd->eof = 1;
3510
0
                cram_free_slice(s);
3511
0
                c->slice = NULL;
3512
0
                return NULL;
3513
0
            }
3514
3515
0
            if (fd->range.refid != -1 && s->crecs[s->curr_rec].aend < fd->range.start) {
3516
0
                s->curr_rec++;
3517
0
                continue;
3518
0
            }
3519
0
        }
3520
3521
0
        break;
3522
0
    }
3523
3524
0
    fd->ctr = c;
3525
0
    c->slice = s;
3526
0
    return &s->crecs[s->curr_rec++];
3527
1.07k
}
3528
3529
/*
3530
 * Read the next cram record and convert it to a bam_seq_t struct.
3531
 *
3532
 * Returns >= 0 success (number of bytes written to *bam)
3533
 *        -1 on EOF or failure (check fd->err)
3534
 */
3535
1.07k
int cram_get_bam_seq(cram_fd *fd, bam_seq_t **bam) {
3536
1.07k
    cram_record *cr;
3537
1.07k
    cram_container *c;
3538
1.07k
    cram_slice *s;
3539
3540
1.07k
    if (!(cr = cram_get_seq(fd)))
3541
1.07k
        return -1;
3542
3543
0
    c = fd->ctr;
3544
0
    s = c->slice;
3545
3546
0
    return cram_to_bam(fd->header, fd, s, cr, s->curr_rec-1, bam);
3547
1.07k
}
3548
3549
/*
3550
 * Drains and frees the decode read-queue for a multi-threaded reader.
3551
 */
3552
1.58k
void cram_drain_rqueue(cram_fd *fd) {
3553
1.58k
    cram_container *lc = NULL;
3554
3555
1.58k
    if (!fd->pool || !fd->rqueue)
3556
1.58k
        return;
3557
3558
    // drain queue of any in-flight decode jobs
3559
0
    while (!hts_tpool_process_empty(fd->rqueue)) {
3560
0
        hts_tpool_result *r = hts_tpool_next_result_wait(fd->rqueue);
3561
0
        if (!r)
3562
0
            break;
3563
0
        cram_decode_job *j = (cram_decode_job *)hts_tpool_result_data(r);
3564
0
        if (j->c->slice == j->s)
3565
0
            j->c->slice = NULL;
3566
0
        if (j->c != lc) {
3567
0
            if (lc) {
3568
0
                if (fd->ctr == lc)
3569
0
                    fd->ctr = NULL;
3570
0
                if (fd->ctr_mt == lc)
3571
0
                    fd->ctr_mt = NULL;
3572
0
                cram_free_container(lc);
3573
0
            }
3574
0
            lc = j->c;
3575
0
        }
3576
0
        cram_free_slice(j->s);
3577
0
        hts_tpool_delete_result(r, 1);
3578
0
    }
3579
3580
    // Also tidy up any pending decode job that we didn't submit to the workers
3581
    // due to the input queue being full.
3582
0
    if (fd->job_pending) {
3583
0
        cram_decode_job *j = (cram_decode_job *)fd->job_pending;
3584
0
        if (j->c->slice == j->s)
3585
0
            j->c->slice = NULL;
3586
0
        if (j->c != lc) {
3587
0
            if (lc) {
3588
0
                if (fd->ctr == lc)
3589
0
                    fd->ctr = NULL;
3590
0
                if (fd->ctr_mt == lc)
3591
0
                    fd->ctr_mt = NULL;
3592
0
                cram_free_container(lc);
3593
0
            }
3594
0
            lc = j->c;
3595
0
        }
3596
0
        cram_free_slice(j->s);
3597
0
        free(j);
3598
0
        fd->job_pending = NULL;
3599
0
    }
3600
3601
0
    if (lc) {
3602
0
        if (fd->ctr == lc)
3603
0
            fd->ctr = NULL;
3604
0
        if (fd->ctr_mt == lc)
3605
0
            fd->ctr_mt = NULL;
3606
0
        cram_free_container(lc);
3607
0
    }
3608
0
}