Coverage Report

Created: 2025-11-24 06:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_encode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <strings.h>
40
#include <zlib.h>
41
#include <sys/types.h>
42
#include <sys/stat.h>
43
#include <math.h>
44
#include <inttypes.h>
45
#include <ctype.h>
46
47
#include "cram.h"
48
#include "os.h"
49
#include "../sam_internal.h" // for nibble2base
50
#include "../htslib/hts.h"
51
#include "../htslib/hts_endian.h"
52
#include "../textutils_internal.h"
53
54
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
55
56
#define Z_CRAM_STRAT Z_FILTERED
57
//#define Z_CRAM_STRAT Z_RLE
58
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
59
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
60
61
static int process_one_read(cram_fd *fd, cram_container *c,
62
                            cram_slice *s, cram_record *cr,
63
                            bam_seq_t *b, int rnum, kstring_t *MD,
64
                            int embed_ref, int no_ref);
65
66
/*
67
 * Returns index of val into key.
68
 * Basically strchr(key, val)-key;
69
 */
70
553k
static int sub_idx(char *key, char val) {
71
553k
    int i;
72
73
1.38M
    for (i = 0; i < 4 && *key++ != val; i++);
74
553k
    return i;
75
553k
}
76
77
/*
78
 * Encodes a compression header block into a generic cram_block structure.
79
 *
80
 * Returns cram_block ptr on success
81
 *         NULL on failure
82
 */
83
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
84
                                           cram_block_compression_hdr *h,
85
29.8k
                                           int embed_ref) {
86
29.8k
    cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
87
29.8k
    cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
88
29.8k
    int i, mc, r = 0;
89
90
29.8k
    int no_ref = c->no_ref;
91
92
29.8k
    if (!cb || !map)
93
0
        return NULL;
94
95
    /*
96
     * This is a concatenation of several blocks of data:
97
     * header + landmarks, preservation map, read encoding map, and the tag
98
     * encoding map.
99
     * All 4 are variable sized and we need to know how large these are
100
     * before creating the compression header itself as this starts with
101
     * the total size (stored as a variable length string).
102
     */
103
104
    // Duplicated from container itself, and removed in 1.1
105
29.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
106
0
        r |= itf8_put_blk(cb, h->ref_seq_id);
107
0
        r |= itf8_put_blk(cb, h->ref_seq_start);
108
0
        r |= itf8_put_blk(cb, h->ref_seq_span);
109
0
        r |= itf8_put_blk(cb, h->num_records);
110
0
        r |= itf8_put_blk(cb, h->num_landmarks);
111
0
        for (i = 0; i < h->num_landmarks; i++) {
112
0
            r |= itf8_put_blk(cb, h->landmark[i]);
113
0
        }
114
0
    }
115
116
29.8k
    if (h->preservation_map) {
117
0
        kh_destroy(map, h->preservation_map);
118
0
        h->preservation_map = NULL;
119
0
    }
120
121
    /* Create in-memory preservation map */
122
    /* FIXME: should create this when we create the container */
123
29.8k
    if (c->num_records > 0) {
124
27.6k
        khint_t k;
125
27.6k
        int r;
126
127
27.6k
        if (!(h->preservation_map = kh_init(map)))
128
0
            return NULL;
129
130
27.6k
        k = kh_put(map, h->preservation_map, "RN", &r);
131
27.6k
        if (-1 == r) return NULL;
132
27.6k
        kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
133
134
27.6k
        if (CRAM_MAJOR_VERS(fd->version) == 1) {
135
0
            k = kh_put(map, h->preservation_map, "PI", &r);
136
0
            if (-1 == r) return NULL;
137
0
            kh_val(h->preservation_map, k).i = 0;
138
139
0
            k = kh_put(map, h->preservation_map, "UI", &r);
140
0
            if (-1 == r) return NULL;
141
0
            kh_val(h->preservation_map, k).i = 1;
142
143
0
            k = kh_put(map, h->preservation_map, "MI", &r);
144
0
            if (-1 == r) return NULL;
145
0
            kh_val(h->preservation_map, k).i = 1;
146
147
27.6k
        } else {
148
            // Technically SM was in 1.0, but wasn't in Java impl.
149
27.6k
            k = kh_put(map, h->preservation_map, "SM", &r);
150
27.6k
            if (-1 == r) return NULL;
151
27.6k
            kh_val(h->preservation_map, k).i = 0;
152
153
27.6k
            k = kh_put(map, h->preservation_map, "TD", &r);
154
27.6k
            if (-1 == r) return NULL;
155
27.6k
            kh_val(h->preservation_map, k).i = 0;
156
157
27.6k
            k = kh_put(map, h->preservation_map, "AP", &r);
158
27.6k
            if (-1 == r) return NULL;
159
27.6k
            kh_val(h->preservation_map, k).i = h->AP_delta;
160
161
27.6k
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
162
0
                k = kh_put(map, h->preservation_map, "QO", &r);
163
0
                if (-1 == r) return NULL;
164
0
                kh_val(h->preservation_map, k).i = h->qs_seq_orient;
165
0
            }
166
167
27.6k
            if (no_ref || embed_ref>0) {
168
                // Reference Required == No
169
26.6k
                k = kh_put(map, h->preservation_map, "RR", &r);
170
26.6k
                if (-1 == r) return NULL;
171
26.6k
                kh_val(h->preservation_map, k).i = 0;
172
26.6k
            }
173
27.6k
        }
174
27.6k
    }
175
176
    /* Encode preservation map; could collapse this and above into one */
177
29.8k
    mc = 0;
178
29.8k
    BLOCK_SIZE(map) = 0;
179
29.8k
    if (h->preservation_map) {
180
27.6k
        khint_t k;
181
182
27.6k
        for (k = kh_begin(h->preservation_map);
183
248k
             k != kh_end(h->preservation_map);
184
221k
             k++) {
185
221k
            const char *key;
186
221k
            khash_t(map) *pmap = h->preservation_map;
187
188
189
221k
            if (!kh_exist(pmap, k))
190
84.0k
                continue;
191
192
137k
            key = kh_key(pmap, k);
193
137k
            BLOCK_APPEND(map, key, 2);
194
195
137k
            switch(CRAM_KEY(key[0], key[1])) {
196
0
            case CRAM_KEY('M','I'):
197
0
            case CRAM_KEY('U','I'):
198
0
            case CRAM_KEY('P','I'):
199
27.6k
            case CRAM_KEY('A','P'):
200
55.3k
            case CRAM_KEY('R','N'):
201
81.9k
            case CRAM_KEY('R','R'):
202
81.9k
            case CRAM_KEY('Q','O'):
203
81.9k
                BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
204
81.9k
                break;
205
206
81.9k
            case CRAM_KEY('S','M'): {
207
27.6k
                char smat[5], *mp = smat;
208
                // Output format is for order ACGTN (minus ref base)
209
                // to store the code value 0-3 for each symbol.
210
                //
211
                // Note this is different to storing the symbols in order
212
                // that the codes occur from 0-3, which is what we used to
213
                // do.  (It didn't matter as we always had a fixed table in
214
                // the order.)
215
27.6k
                *mp++ =
216
27.6k
                    (sub_idx(h->substitution_matrix[0], 'C') << 6) |
217
27.6k
                    (sub_idx(h->substitution_matrix[0], 'G') << 4) |
218
27.6k
                    (sub_idx(h->substitution_matrix[0], 'T') << 2) |
219
27.6k
                    (sub_idx(h->substitution_matrix[0], 'N') << 0);
220
27.6k
                *mp++ =
221
27.6k
                    (sub_idx(h->substitution_matrix[1], 'A') << 6) |
222
27.6k
                    (sub_idx(h->substitution_matrix[1], 'G') << 4) |
223
27.6k
                    (sub_idx(h->substitution_matrix[1], 'T') << 2) |
224
27.6k
                    (sub_idx(h->substitution_matrix[1], 'N') << 0);
225
27.6k
                *mp++ =
226
27.6k
                    (sub_idx(h->substitution_matrix[2], 'A') << 6) |
227
27.6k
                    (sub_idx(h->substitution_matrix[2], 'C') << 4) |
228
27.6k
                    (sub_idx(h->substitution_matrix[2], 'T') << 2) |
229
27.6k
                    (sub_idx(h->substitution_matrix[2], 'N') << 0);
230
27.6k
                *mp++ =
231
27.6k
                    (sub_idx(h->substitution_matrix[3], 'A') << 6) |
232
27.6k
                    (sub_idx(h->substitution_matrix[3], 'C') << 4) |
233
27.6k
                    (sub_idx(h->substitution_matrix[3], 'G') << 2) |
234
27.6k
                    (sub_idx(h->substitution_matrix[3], 'N') << 0);
235
27.6k
                *mp++ =
236
27.6k
                    (sub_idx(h->substitution_matrix[4], 'A') << 6) |
237
27.6k
                    (sub_idx(h->substitution_matrix[4], 'C') << 4) |
238
27.6k
                    (sub_idx(h->substitution_matrix[4], 'G') << 2) |
239
27.6k
                    (sub_idx(h->substitution_matrix[4], 'T') << 0);
240
27.6k
                BLOCK_APPEND(map, smat, 5);
241
27.6k
                break;
242
27.6k
            }
243
244
27.6k
            case CRAM_KEY('T','D'): {
245
27.6k
                r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
246
27.6k
                BLOCK_APPEND(map,
247
27.6k
                             BLOCK_DATA(h->TD_blk),
248
27.6k
                             BLOCK_SIZE(h->TD_blk));
249
27.6k
                break;
250
27.6k
            }
251
252
27.6k
            default:
253
0
                hts_log_warning("Unknown preservation key '%.2s'", key);
254
0
                break;
255
137k
            }
256
257
137k
            mc++;
258
137k
        }
259
27.6k
    }
260
29.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
261
29.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
262
29.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
263
264
    /* rec encoding map */
265
29.8k
    mc = 0;
266
29.8k
    BLOCK_SIZE(map) = 0;
267
29.8k
    if (h->codecs[DS_BF]) {
268
27.6k
        if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
269
27.6k
                                          fd->version))
270
0
            return NULL;
271
27.6k
        mc++;
272
27.6k
    }
273
29.8k
    if (h->codecs[DS_CF]) {
274
27.6k
        if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
275
27.6k
                                          fd->version))
276
0
            return NULL;
277
27.6k
        mc++;
278
27.6k
    }
279
29.8k
    if (h->codecs[DS_RL]) {
280
27.6k
        if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
281
27.6k
                                          fd->version))
282
0
            return NULL;
283
27.6k
        mc++;
284
27.6k
    }
285
29.8k
    if (h->codecs[DS_AP]) {
286
27.6k
        if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
287
27.6k
                                          fd->version))
288
0
            return NULL;
289
27.6k
        mc++;
290
27.6k
    }
291
29.8k
    if (h->codecs[DS_RG]) {
292
27.6k
        if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
293
27.6k
                                          fd->version))
294
0
            return NULL;
295
27.6k
        mc++;
296
27.6k
    }
297
29.8k
    if (h->codecs[DS_MF]) {
298
27.6k
        if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
299
27.6k
                                          fd->version))
300
0
            return NULL;
301
27.6k
        mc++;
302
27.6k
    }
303
29.8k
    if (h->codecs[DS_NS]) {
304
27.6k
        if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
305
27.6k
                                          fd->version))
306
0
            return NULL;
307
27.6k
        mc++;
308
27.6k
    }
309
29.8k
    if (h->codecs[DS_NP]) {
310
27.6k
        if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
311
27.6k
                                          fd->version))
312
0
            return NULL;
313
27.6k
        mc++;
314
27.6k
    }
315
29.8k
    if (h->codecs[DS_TS]) {
316
27.6k
        if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
317
27.6k
                                          fd->version))
318
0
            return NULL;
319
27.6k
        mc++;
320
27.6k
    }
321
29.8k
    if (h->codecs[DS_NF]) {
322
0
        if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
323
0
                                          fd->version))
324
0
            return NULL;
325
0
        mc++;
326
0
    }
327
29.8k
    if (h->codecs[DS_TC]) {
328
0
        if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
329
0
                                          fd->version))
330
0
            return NULL;
331
0
        mc++;
332
0
    }
333
29.8k
    if (h->codecs[DS_TN]) {
334
0
        if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
335
0
                                          fd->version))
336
0
            return NULL;
337
0
        mc++;
338
0
    }
339
29.8k
    if (h->codecs[DS_TL]) {
340
27.6k
        if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
341
27.6k
                                          fd->version))
342
0
            return NULL;
343
27.6k
        mc++;
344
27.6k
    }
345
29.8k
    if (h->codecs[DS_FN]) {
346
13.4k
        if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
347
13.4k
                                          fd->version))
348
0
            return NULL;
349
13.4k
        mc++;
350
13.4k
    }
351
29.8k
    if (h->codecs[DS_FC]) {
352
13.4k
        if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
353
13.4k
                                          fd->version))
354
0
            return NULL;
355
13.4k
        mc++;
356
13.4k
    }
357
29.8k
    if (h->codecs[DS_FP]) {
358
13.4k
        if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
359
13.4k
                                          fd->version))
360
0
            return NULL;
361
13.4k
        mc++;
362
13.4k
    }
363
29.8k
    if (h->codecs[DS_BS]) {
364
776
        if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
365
776
                                          fd->version))
366
0
            return NULL;
367
776
        mc++;
368
776
    }
369
29.8k
    if (h->codecs[DS_IN]) {
370
27.6k
        if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
371
27.6k
                                          fd->version))
372
0
            return NULL;
373
27.6k
        mc++;
374
27.6k
    }
375
29.8k
    if (h->codecs[DS_DL]) {
376
2.75k
        if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
377
2.75k
                                          fd->version))
378
0
            return NULL;
379
2.75k
        mc++;
380
2.75k
    }
381
29.8k
    if (h->codecs[DS_BA]) {
382
2.64k
        if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
383
2.64k
                                          fd->version))
384
0
            return NULL;
385
2.64k
        mc++;
386
2.64k
    }
387
29.8k
    if (h->codecs[DS_BB]) {
388
27.6k
        if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
389
27.6k
                                          fd->version))
390
0
            return NULL;
391
27.6k
        mc++;
392
27.6k
    }
393
29.8k
    if (h->codecs[DS_MQ]) {
394
27.6k
        if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
395
27.6k
                                          fd->version))
396
0
            return NULL;
397
27.6k
        mc++;
398
27.6k
    }
399
29.8k
    if (h->codecs[DS_RN]) {
400
27.6k
        if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
401
27.6k
                                          fd->version))
402
0
            return NULL;
403
27.6k
        mc++;
404
27.6k
    }
405
29.8k
    if (h->codecs[DS_QS]) {
406
27.6k
        if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
407
27.6k
                                          fd->version))
408
0
            return NULL;
409
27.6k
        mc++;
410
27.6k
    }
411
29.8k
    if (h->codecs[DS_QQ]) {
412
0
        if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
413
0
                                          fd->version))
414
0
            return NULL;
415
0
        mc++;
416
0
    }
417
29.8k
    if (h->codecs[DS_RI]) {
418
27.6k
        if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
419
27.6k
                                          fd->version))
420
0
            return NULL;
421
27.6k
        mc++;
422
27.6k
    }
423
29.8k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
424
29.8k
        if (h->codecs[DS_SC]) {
425
27.6k
            if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
426
27.6k
                                              fd->version))
427
0
                return NULL;
428
27.6k
            mc++;
429
27.6k
        }
430
29.8k
        if (h->codecs[DS_RS]) {
431
28
            if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
432
28
                                              fd->version))
433
0
                return NULL;
434
28
            mc++;
435
28
        }
436
29.8k
        if (h->codecs[DS_PD]) {
437
4
            if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
438
4
                                              fd->version))
439
0
                return NULL;
440
4
            mc++;
441
4
        }
442
29.8k
        if (h->codecs[DS_HC]) {
443
2.58k
            if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
444
2.58k
                                              fd->version))
445
0
                return NULL;
446
2.58k
            mc++;
447
2.58k
        }
448
29.8k
    }
449
29.8k
    if (h->codecs[DS_TM]) {
450
0
        if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
451
0
                                          fd->version))
452
0
            return NULL;
453
0
        mc++;
454
0
    }
455
29.8k
    if (h->codecs[DS_TV]) {
456
0
        if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
457
0
                                          fd->version))
458
0
            return NULL;
459
0
        mc++;
460
0
    }
461
29.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
462
29.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
463
29.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
464
465
    /* tag encoding map */
466
29.8k
    mc = 0;
467
29.8k
    BLOCK_SIZE(map) = 0;
468
29.8k
    if (c->tags_used) {
469
27.6k
        khint_t k;
470
471
262k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
472
235k
            int key;
473
235k
            if (!kh_exist(c->tags_used, k))
474
119k
                continue;
475
476
115k
            key = kh_key(c->tags_used, k);
477
115k
            cram_codec *cd = kh_val(c->tags_used, k)->codec;
478
479
115k
            r |= (fd->vv.varint_put32_blk(map, key) <= 0);
480
115k
            if (-1 == cd->store(cd, map, NULL, fd->version))
481
0
                return NULL;
482
483
115k
            mc++;
484
115k
        }
485
27.6k
    }
486
487
29.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
488
29.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
489
29.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
490
491
29.8k
    hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
492
493
29.8k
    BLOCK_UPLEN(cb);
494
495
29.8k
    cram_free_block(map);
496
497
29.8k
    if (r >= 0)
498
29.8k
        return cb;
499
500
0
 block_err:
501
0
    return NULL;
502
29.8k
}
503
504
505
/*
506
 * Encodes a slice compression header.
507
 *
508
 * Returns cram_block on success
509
 *         NULL on failure
510
 */
511
27.6k
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
512
27.6k
    char *buf;
513
27.6k
    char *cp;
514
27.6k
    cram_block *b = cram_new_block(MAPPED_SLICE, 0);
515
27.6k
    int j;
516
517
27.6k
    if (!b)
518
0
        return NULL;
519
520
27.6k
    cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
521
27.6k
    if (NULL == buf) {
522
0
        cram_free_block(b);
523
0
        return NULL;
524
0
    }
525
526
27.6k
    cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
527
27.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
528
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
529
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
530
27.6k
    } else {
531
27.6k
        if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
532
14
            hts_log_error("Reference position too large for CRAM 3");
533
14
            cram_free_block(b);
534
14
            free(buf);
535
14
            return NULL;
536
14
        }
537
27.6k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
538
27.6k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
539
27.6k
    }
540
27.6k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
541
27.6k
    if (CRAM_MAJOR_VERS(fd->version) == 2)
542
0
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
543
27.6k
    else if (CRAM_MAJOR_VERS(fd->version) >= 3)
544
27.6k
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
545
27.6k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
546
27.6k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
547
220k
    for (j = 0; j < s->hdr->num_content_ids; j++) {
548
192k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
549
192k
    }
550
27.6k
    if (s->hdr->content_type == MAPPED_SLICE)
551
27.6k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
552
553
27.6k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
554
27.6k
        memcpy(cp, s->hdr->md5, 16); cp += 16;
555
27.6k
    }
556
557
27.6k
    assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
558
559
27.6k
    b->data = (unsigned char *)buf;
560
27.6k
    b->comp_size = b->uncomp_size = cp-buf;
561
562
27.6k
    return b;
563
27.6k
}
564
565
566
/*
567
 * Encodes a single read.
568
 *
569
 * Returns 0 on success
570
 *        -1 on failure
571
 */
572
static int cram_encode_slice_read(cram_fd *fd,
573
                                  cram_container *c,
574
                                  cram_block_compression_hdr *h,
575
                                  cram_slice *s,
576
                                  cram_record *cr,
577
4.63M
                                  int64_t *last_pos) {
578
4.63M
    int r = 0;
579
4.63M
    int32_t i32;
580
4.63M
    int64_t i64;
581
4.63M
    unsigned char uc;
582
583
    //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
584
585
    //printf("BF=0x%x\n", cr->flags);
586
    //      bf = cram_flag_swap[cr->flags];
587
4.63M
    i32 = fd->cram_flag_swap[cr->flags & 0xfff];
588
4.63M
    r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
589
590
4.63M
    i32 = cr->cram_flags & CRAM_FLAG_MASK;
591
4.63M
    r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
592
593
4.63M
    if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
594
10.1k
        r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
595
596
4.63M
    r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
597
598
4.63M
    if (c->pos_sorted) {
599
4.61M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
600
0
            i64 = cr->apos - *last_pos;
601
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
602
4.61M
        } else {
603
4.61M
            i32 = cr->apos - *last_pos;
604
4.61M
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
605
4.61M
        }
606
4.61M
        *last_pos = cr->apos;
607
4.61M
    } else {
608
12.8k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
609
0
            i64 = cr->apos;
610
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
611
12.8k
        } else {
612
12.8k
            i32 = cr->apos;
613
12.8k
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
614
12.8k
        }
615
12.8k
    }
616
617
4.63M
    r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
618
619
4.63M
    if (cr->cram_flags & CRAM_FLAG_DETACHED) {
620
4.63M
        i32 = cr->mate_flags;
621
4.63M
        r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
622
623
4.63M
        r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
624
4.63M
                                      (char *)&cr->mate_ref_id, 1);
625
626
4.63M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627
0
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
628
0
                                          (char *)&cr->mate_pos, 1);
629
0
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
630
0
                                          (char *)&cr->tlen, 1);
631
4.63M
        } else {
632
4.63M
            i32 = cr->mate_pos;
633
4.63M
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
634
4.63M
                                          (char *)&i32, 1);
635
4.63M
            i32 = cr->tlen;
636
4.63M
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
637
4.63M
                                          (char *)&i32, 1);
638
4.63M
        }
639
4.63M
    } else {
640
0
        if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
641
0
            r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
642
0
                                          (char *)&cr->mate_line, 1);
643
0
        }
644
0
        if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
645
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
646
0
                r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
647
0
                                              (char *)&cr->tlen, 1);
648
0
            }
649
0
        }
650
0
    }
651
652
    /* Aux tags */
653
4.63M
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
654
0
        int j;
655
0
        uc = cr->ntags;
656
0
        r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
657
658
0
        for (j = 0; j < cr->ntags; j++) {
659
0
            uint32_t i32 = s->TN[cr->TN_idx + j]; // id
660
0
            r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
661
0
        }
662
4.63M
    } else {
663
4.63M
        r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
664
4.63M
    }
665
666
    // qual
667
    // QS codec : Already stored in block[2].
668
669
    // features (diffs)
670
4.63M
    if (!(cr->flags & BAM_FUNMAP)) {
671
40.4k
        int prev_pos = 0, j;
672
673
40.4k
        r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
674
40.4k
                                      (char *)&cr->nfeature, 1);
675
129k
        for (j = 0; j < cr->nfeature; j++) {
676
89.1k
            cram_feature *f = &s->features[cr->feature + j];
677
678
89.1k
            uc = f->X.code;
679
89.1k
            r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
680
89.1k
            i32 = f->X.pos - prev_pos;
681
89.1k
            r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
682
89.1k
            prev_pos = f->X.pos;
683
684
89.1k
            switch(f->X.code) {
685
                //char *seq;
686
687
1.50k
            case 'X':
688
                //fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
689
690
1.50k
                uc = f->X.base;
691
1.50k
                r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
692
1.50k
                                              (char *)&uc, 1);
693
1.50k
                break;
694
27.6k
            case 'S':
695
                // Already done
696
                //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
697
                //                              BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
698
                //                              f->S.len);
699
700
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
701
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
702
                //                                  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
703
                //                                  f->S.len);
704
                //}
705
27.6k
                break;
706
51
            case 'I':
707
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
708
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
709
                //                           seq, f->S.len);
710
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
711
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
712
                //                                  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
713
                //                                  f->I.len);
714
                //}
715
51
                break;
716
47
            case 'i':
717
47
                uc = f->i.base;
718
47
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
719
47
                                              (char *)&uc, 1);
720
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
721
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
722
                //                           seq, 1);
723
47
                break;
724
40.2k
            case 'D':
725
40.2k
                i32 = f->D.len;
726
40.2k
                r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
727
40.2k
                                              (char *)&i32, 1);
728
40.2k
                break;
729
730
1.45k
            case 'B':
731
                //                  // Used when we try to store a non ACGTN base or an N
732
                //                  // that aligns against a non ACGTN reference
733
734
1.45k
                uc  = f->B.base;
735
1.45k
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
736
1.45k
                                              (char *)&uc, 1);
737
738
                //                  Already added
739
                //                  uc  = f->B.qual;
740
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
741
                //                                           (char *)&uc, 1);
742
1.45k
                break;
743
744
1.87k
            case 'b':
745
                // string of bases
746
1.87k
                r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
747
1.87k
                                              (char *)BLOCK_DATA(s->seqs_blk)
748
1.87k
                                                      + f->b.seq_idx,
749
1.87k
                                              f->b.len);
750
1.87k
                break;
751
752
17
            case 'Q':
753
                //                  Already added
754
                //                  uc  = f->B.qual;
755
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
756
                //                                           (char *)&uc, 1);
757
17
                break;
758
759
28
            case 'N':
760
28
                i32 = f->N.len;
761
28
                r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
762
28
                                              (char *)&i32, 1);
763
28
                break;
764
765
4
            case 'P':
766
4
                i32 = f->P.len;
767
4
                r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
768
4
                                              (char *)&i32, 1);
769
4
                break;
770
771
16.3k
            case 'H':
772
16.3k
                i32 = f->H.len;
773
16.3k
                r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
774
16.3k
                                              (char *)&i32, 1);
775
16.3k
                break;
776
777
778
0
            default:
779
0
                hts_log_error("Unhandled feature code %c", f->X.code);
780
0
                return -1;
781
89.1k
            }
782
89.1k
        }
783
784
40.4k
        r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
785
40.4k
                                      (char *)&cr->mqual, 1);
786
4.59M
    } else {
787
4.59M
        char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
788
4.59M
        if (cr->len)
789
155k
            r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
790
4.59M
    }
791
792
4.63M
    return r ? -1 : 0;
793
4.63M
}
794
795
796
/*
797
 * Applies various compression methods to specific blocks, depending on
798
 * known observations of how data series compress.
799
 *
800
 * Returns 0 on success
801
 *        -1 on failure
802
 */
803
27.6k
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
804
27.6k
    int level = fd->level, i;
805
27.6k
    int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
806
27.6k
    int v31_or_above = (fd->version >= (3<<8)+1);
807
808
    /* Compress the CORE Block too, with minimal zlib level */
809
27.6k
    if (level > 5 && s->block[0]->uncomp_size > 500)
810
0
        cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
811
812
27.6k
    if (fd->use_bz2)
813
0
        method |= 1<<BZIP2;
814
815
27.6k
    int method_rans   = (1<<RANS0) | (1<<RANS1);
816
27.6k
    int method_ranspr = method_rans;
817
818
27.6k
    if (fd->use_rans) {
819
27.6k
        method_ranspr = (1<<RANS_PR0)   | (1<<RANS_PR1);
820
27.6k
        if (level > 1)
821
27.6k
            method_ranspr |=
822
27.6k
                  (1<<RANS_PR64)  | (1<<RANS_PR9)
823
27.6k
                | (1<<RANS_PR128) | (1<<RANS_PR193);
824
27.6k
        if (level > 5)
825
0
            method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
826
27.6k
    }
827
828
27.6k
    if (fd->use_rans) {
829
27.6k
        methodF |= v31_or_above ? method_ranspr : method_rans;
830
27.6k
        method  |= v31_or_above ? method_ranspr : method_rans;
831
27.6k
    }
832
833
27.6k
    int method_arith   = 0;
834
27.6k
    if (fd->use_arith) {
835
0
        method_arith = (1<<ARITH_PR0)   | (1<<ARITH_PR1);
836
0
        if (level > 1)
837
0
            method_arith |=
838
0
                  (1<<ARITH_PR64)  | (1<<ARITH_PR9)
839
0
                | (1<<ARITH_PR128) | (1<<ARITH_PR129)
840
0
                | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
841
0
    }
842
27.6k
    if (fd->use_arith && v31_or_above) {
843
0
        methodF |= method_arith;
844
0
        method  |= method_arith;
845
0
    }
846
847
27.6k
    if (fd->use_lzma)
848
0
        method |= (1<<LZMA);
849
850
    /* Faster method for data series we only need entropy encoding on */
851
27.6k
    methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
852
27.6k
    if (level >= 5) {
853
27.6k
        method |= 1<<GZIP_1;
854
27.6k
        methodF = method;
855
27.6k
    }
856
27.6k
    if (level == 1) {
857
0
        method &= ~(1<<GZIP);
858
0
        method |=   1<<GZIP_1;
859
0
        methodF = method;
860
0
    }
861
862
27.6k
    int qmethod  = method;
863
27.6k
    int qmethodF = method;
864
27.6k
    if (v31_or_above && fd->use_fqz) {
865
0
        qmethod  |= 1<<FQZ;
866
0
        qmethodF |= 1<<FQZ;
867
0
        if (fd->level > 4) {
868
0
            qmethod  |= 1<<FQZ_b;
869
0
            qmethodF |= 1<<FQZ_b;
870
0
        }
871
0
        if (fd->level > 6) {
872
0
            qmethod  |= (1<<FQZ_c) | (1<<FQZ_d);
873
0
            qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
874
0
        }
875
0
    }
876
877
27.6k
    pthread_mutex_lock(&fd->metrics_lock);
878
1.32M
    for (i = 0; i < DS_END; i++)
879
1.30M
        if (c->stats[i] && c->stats[i]->nvals > 16)
880
183
            fd->m[i]->unpackable = 1;
881
27.6k
    pthread_mutex_unlock(&fd->metrics_lock);
882
883
    /* Specific compression methods for certain block types */
884
27.6k
    if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
885
27.6k
                             method, level))
886
0
        return -1;
887
888
27.6k
    if (fd->level == 0) {
889
        /* Do nothing */
890
27.6k
    } else if (fd->level == 1) {
891
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
892
0
                                 qmethodF, 1))
893
0
            return -1;
894
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
895
0
            if (s->block[i])
896
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
897
0
                                         method, 1))
898
0
                    return -1;
899
0
        }
900
27.6k
    } else if (fd->level < 3) {
901
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
902
0
                                 qmethod, 1))
903
0
            return -1;
904
0
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
905
0
                                 method, 1))
906
0
            return -1;
907
0
        if (s->block[DS_BB])
908
0
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
909
0
                                     method, 1))
910
0
                return -1;
911
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
912
0
            if (s->block[i])
913
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
914
0
                                         method, level))
915
0
                    return -1;
916
0
        }
917
27.6k
    } else {
918
27.6k
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
919
27.6k
                                 qmethod, level))
920
0
            return -1;
921
27.6k
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
922
27.6k
                                 method, level))
923
0
            return -1;
924
27.6k
        if (s->block[DS_BB])
925
27.6k
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
926
27.6k
                                     method, level))
927
0
                return -1;
928
276k
        for (i = DS_aux; i <= DS_aux_oz; i++) {
929
249k
            if (s->block[i])
930
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
931
0
                                         method, level))
932
0
                    return -1;
933
249k
        }
934
27.6k
    }
935
936
    // NAME: best is generally xz, bzip2, zlib then rans1
937
27.6k
    int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
938
27.6k
    if (fd->version >= (3<<8)+1 && fd->use_tok)
939
27.6k
        method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
940
27.6k
    if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
941
27.6k
                             method_rn, level))
942
0
        return -1;
943
944
    // NS shows strong local correlation as rearrangements are localised
945
27.6k
    if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
946
1.54k
        if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
947
1.54k
                                 method, level))
948
0
            return -1;
949
950
951
    /*
952
     * Compress any auxiliary tags with their own per-tag metrics
953
     */
954
27.6k
    {
955
27.6k
        int i;
956
143k
        for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
957
115k
            if (!s->block[i] || s->block[i] == s->block[0])
958
0
                continue;
959
960
115k
            if (s->block[i]->method != RAW)
961
0
                continue;
962
963
115k
            if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
964
115k
                                     method, level))
965
0
                return -1;
966
115k
        }
967
27.6k
    }
968
969
    /*
970
     * Minimal compression of any block still uncompressed, bar CORE
971
     */
972
27.6k
    {
973
27.6k
        int i;
974
1.30M
        for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
975
1.27M
            if (!s->block[i] || s->block[i] == s->block[0])
976
1.06M
                continue;
977
978
208k
            if (s->block[i]->method != RAW)
979
7.61k
                continue;
980
981
201k
            if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
982
201k
                                    methodF, level))
983
0
                return -1;
984
201k
        }
985
27.6k
    }
986
987
27.6k
    return 0;
988
27.6k
}
989
990
/*
991
 * Allocates a block associated with the cram codec associated with
992
 * data series ds_id or the internal codec_id (depending on codec
993
 * type).
994
 *
995
 * The ds_ids are what end up written to disk as an external block.
996
 * The c_ids are internal and used when daisy-chaining transforms
997
 * such as MAP and RLE.  These blocks are also allocated, but
998
 * are ephemeral in nature.  (The codecs themselves cannot allocate
999
 * these as the same codec pointer may be operating on multiple slices
1000
 * if we're using a multi-slice container.)
1001
 *
1002
 * Returns 0 on success
1003
 *        -1 on failure
1004
 */
1005
802k
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
1006
802k
    if (!codec)
1007
255k
        return 0;
1008
1009
547k
    switch(codec->codec) {
1010
    // Codecs which are hard-coded to use the CORE block
1011
0
    case E_GOLOMB:
1012
350k
    case E_HUFFMAN:
1013
351k
    case E_BETA:
1014
351k
    case E_SUBEXP:
1015
351k
    case E_GOLOMB_RICE:
1016
351k
    case E_GAMMA:
1017
351k
        codec->out = s->block[0];
1018
351k
        break;
1019
1020
    // Codecs which don't use external blocks
1021
0
    case E_CONST_BYTE:
1022
0
    case E_CONST_INT:
1023
0
       codec->out = NULL;
1024
0
       break;
1025
1026
    // Codecs that emit directly to external blocks
1027
113k
    case E_EXTERNAL:
1028
113k
    case E_VARINT_UNSIGNED:
1029
113k
    case E_VARINT_SIGNED:
1030
113k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1031
0
            return -1;
1032
113k
        codec->u.external.content_id = ds_id;
1033
113k
        codec->out = s->block[ds_id];
1034
113k
        break;
1035
1036
55.3k
    case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1037
55.3k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1038
0
            return -1;
1039
55.3k
        codec->u.byte_array_stop.content_id = ds_id;
1040
55.3k
        codec->out = s->block[ds_id];
1041
55.3k
        break;
1042
1043
1044
    // Codecs that contain sub-codecs which may in turn emit to external blocks
1045
27.6k
    case E_BYTE_ARRAY_LEN: {
1046
27.6k
        cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1047
27.6k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1048
0
            return -1;
1049
27.6k
        bal = codec->u.e_byte_array_len.val_codec;
1050
27.6k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1051
0
            return -1;
1052
1053
27.6k
        break;
1054
27.6k
    }
1055
1056
27.6k
    case E_XRLE:
1057
0
        if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1058
                                //ds_id == DS_QS ? DS_QS_len : ds_id))
1059
0
            return -1;
1060
0
        if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1061
0
            return -1;
1062
1063
0
        break;
1064
1065
0
    case E_XPACK:
1066
0
        if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1067
0
            return -1;
1068
0
        codec->out = cram_new_block(0, 0); // ephemeral
1069
0
        if (!codec->out)
1070
0
            return -1;
1071
1072
0
        break;
1073
1074
0
    case E_XDELTA:
1075
0
        if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1076
0
            return -1;
1077
0
        codec->out = cram_new_block(0, 0); // ephemeral
1078
0
        if (!codec->out)
1079
0
            return -1;
1080
1081
0
        break;
1082
1083
0
    default:
1084
0
        break;
1085
547k
    }
1086
1087
547k
    return 0;
1088
547k
}
1089
1090
/*
1091
 * Encodes a single slice from a container
1092
 *
1093
 * Returns 0 on success
1094
 *        -1 on failure
1095
 */
1096
static int cram_encode_slice(cram_fd *fd, cram_container *c,
1097
                             cram_block_compression_hdr *h, cram_slice *s,
1098
27.6k
                             int embed_ref) {
1099
27.6k
    int rec, r = 0;
1100
27.6k
    int64_t last_pos;
1101
27.6k
    enum cram_DS_ID id;
1102
1103
    /*
1104
     * Slice external blocks:
1105
     * ID 0 => base calls (insertions, soft-clip)
1106
     * ID 1 => qualities
1107
     * ID 2 => names
1108
     * ID 3 => TS (insert size), NP (next frag)
1109
     * ID 4 => tag values
1110
     * ID 6 => tag IDs (TN), if CRAM_V1.0
1111
     * ID 7 => TD tag dictionary, if !CRAM_V1.0
1112
     */
1113
1114
    /* Create cram slice header */
1115
27.6k
    s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
1116
27.6k
        ? DS_ref
1117
27.6k
        : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1118
27.6k
    s->hdr->record_counter = c->num_records + c->record_counter;
1119
27.6k
    c->num_records += s->hdr->num_records;
1120
1121
27.6k
    int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1122
27.6k
    s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1123
27.6k
    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1124
27.6k
    if (!s->block || !s->hdr->block_content_ids)
1125
0
        return -1;
1126
1127
    // Create first fixed blocks, always external.
1128
    // CORE
1129
27.6k
    if (!(s->block[0] = cram_new_block(CORE, 0)))
1130
0
        return -1;
1131
1132
    // TN block for CRAM v1
1133
27.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
1134
0
        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1135
0
            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1136
0
            h->codecs[DS_TN]->u.external.content_id = DS_TN;
1137
0
        } else {
1138
0
            s->block[DS_TN] = s->block[0];
1139
0
        }
1140
0
    }
1141
1142
    // Embedded reference
1143
27.6k
    if (embed_ref>0) {
1144
12.8k
        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1145
0
            return -1;
1146
12.8k
        s->ref_id = DS_ref; // needed?
1147
12.8k
        BLOCK_APPEND(s->block[DS_ref],
1148
12.8k
                     c->ref + s->hdr->ref_seq_start - c->ref_start,
1149
12.8k
                     s->hdr->ref_seq_span);
1150
12.8k
    }
1151
1152
    /*
1153
     * All the data-series blocks if appropriate.
1154
     */
1155
774k
    for (id = DS_QS; id < DS_TN; id++) {
1156
747k
        if (cram_allocate_block(h->codecs[id], s, id) < 0)
1157
0
            return -1;
1158
747k
    }
1159
1160
    /*
1161
     * Add in the external tag blocks too.
1162
     */
1163
27.6k
    if (c->tags_used) {
1164
27.6k
        int n;
1165
27.6k
        s->hdr->num_blocks = DS_END;
1166
143k
        for (n = 0; n < s->naux_block; n++) {
1167
115k
            s->block[s->hdr->num_blocks++] = s->aux_block[n];
1168
115k
            s->aux_block[n] = NULL;
1169
115k
        }
1170
27.6k
    }
1171
1172
    /* Encode reads */
1173
27.6k
    last_pos = s->hdr->ref_seq_start;
1174
4.65M
    for (rec = 0; rec < s->hdr->num_records; rec++) {
1175
4.63M
        cram_record *cr = &s->crecs[rec];
1176
4.63M
        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1177
0
            return -1;
1178
4.63M
    }
1179
1180
27.6k
    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1181
27.6k
    s->block[0]->comp_size = s->block[0]->uncomp_size;
1182
1183
    // Make sure the fixed blocks point to the correct sources
1184
27.6k
    if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1185
27.6k
    s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1186
27.6k
    if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1187
27.6k
    s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1188
27.6k
    if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1189
27.6k
    s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1190
27.6k
    if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1191
27.6k
    s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1192
1193
    // Finalise any data transforms.
1194
774k
    for (id = DS_QS; id < DS_TN; id++) {
1195
747k
       if (h->codecs[id] && h->codecs[id]->flush)
1196
0
           h->codecs[id]->flush(h->codecs[id]);
1197
747k
    }
1198
1199
    // Ensure block sizes are up to date.
1200
1.41M
    for (id = 1; id < s->hdr->num_blocks; id++) {
1201
1.38M
        if (!s->block[id] || s->block[id] == s->block[0])
1202
1.06M
            continue;
1203
1204
324k
        if (s->block[id]->uncomp_size == 0)
1205
324k
            BLOCK_UPLEN(s->block[id]);
1206
324k
    }
1207
1208
    // Compress it all
1209
27.6k
    if (cram_compress_slice(fd, c, s) == -1)
1210
0
        return -1;
1211
1212
    // Collapse empty blocks and create hdr_block
1213
27.6k
    {
1214
27.6k
        int i, j;
1215
1216
27.6k
        s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1217
27.6k
                                            s->hdr->num_blocks * sizeof(int32_t));
1218
27.6k
        if (!s->hdr->block_content_ids)
1219
0
            return -1;
1220
1221
1.41M
        for (i = j = 1; i < s->hdr->num_blocks; i++) {
1222
1.38M
            if (!s->block[i] || s->block[i] == s->block[0])
1223
1.06M
                continue;
1224
324k
            if (s->block[i]->uncomp_size == 0) {
1225
132k
                cram_free_block(s->block[i]);
1226
132k
                s->block[i] = NULL;
1227
132k
                continue;
1228
132k
            }
1229
192k
            s->block[j] = s->block[i];
1230
192k
            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1231
192k
            j++;
1232
192k
        }
1233
27.6k
        s->hdr->num_content_ids = j-1;
1234
27.6k
        s->hdr->num_blocks = j;
1235
1236
27.6k
        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1237
14
            return -1;
1238
27.6k
    }
1239
1240
27.6k
    return r ? -1 : 0;
1241
1242
0
 block_err:
1243
0
    return -1;
1244
27.6k
}
1245
1246
4.63M
static inline const char *bam_data_end(bam1_t *b) {
1247
4.63M
    return (const char *)b->data + b->l_data;
1248
4.63M
}
1249
1250
/*
1251
 * A bounds checking version of bam_aux2i.
1252
 */
1253
0
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
1254
0
    int type = *aux++;
1255
0
    switch (type) {
1256
0
        case 'c':
1257
0
            if (aux_end - aux < 1) {
1258
0
                errno = EINVAL;
1259
0
                return 0;
1260
0
            }
1261
0
            return *(int8_t *)aux;
1262
0
        case 'C':
1263
0
            if (aux_end - aux < 1) {
1264
0
                errno = EINVAL;
1265
0
                return 0;
1266
0
            }
1267
0
            return *aux;
1268
0
        case 's':
1269
0
            if (aux_end - aux < 2) {
1270
0
                errno = EINVAL;
1271
0
                return 0;
1272
0
            }
1273
0
            return le_to_i16(aux);
1274
0
        case 'S':
1275
0
            if (aux_end - aux < 2) {
1276
0
                errno = EINVAL;
1277
0
                return 0;
1278
0
            }
1279
0
            return le_to_u16(aux);
1280
0
        case 'i':
1281
0
            if (aux_end - aux < 4) {
1282
0
                errno = EINVAL;
1283
0
                return 0;
1284
0
            }
1285
0
            return le_to_i32(aux);
1286
0
        case 'I':
1287
0
            if (aux_end - aux < 4) {
1288
0
                errno = EINVAL;
1289
0
                return 0;
1290
0
            }
1291
0
            return le_to_u32(aux);
1292
0
        default:
1293
0
            errno = EINVAL;
1294
0
    }
1295
0
    return 0;
1296
0
}
1297
1298
/*
1299
 * Returns the number of expected read names for this record.
1300
 */
1301
0
static int expected_template_count(bam_seq_t *b) {
1302
0
    int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1303
1304
0
    uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1305
0
    if (TC) {
1306
0
        int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
1307
0
        if (expected < n)
1308
0
            expected = n;
1309
0
    }
1310
1311
0
    if (!TC && bam_aux_get(b, "SA")) {
1312
        // We could count the semicolons, but we'd have to do this for
1313
        // read1, read2 and read(not-1-or-2) combining the results
1314
        // together.  This is a cheap and safe alternative for now.
1315
0
        expected = INT_MAX;
1316
0
    }
1317
1318
0
    return expected;
1319
0
}
1320
1321
/*
1322
 * Lossily reject read names.
1323
 *
1324
 * The rule here is that if all reads for this template reside in the
1325
 * same slice then we can lose the name.  Otherwise we keep them as we
1326
 * do not know when (or if) the other reads will turn up.
1327
 *
1328
 * Note there may be only 1 read (non-paired library) or more than 2
1329
 * reads (paired library with supplementary reads), or other weird
1330
 * setups.  We need to know how many are expected.  Ways to guess:
1331
 *
1332
 * - Flags (0x1 - has > 1 read)
1333
 * - TC aux field (not mandatory)
1334
 * - SA tags (count semicolons, NB per fragment so sum - hard)
1335
 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1336
 *
1337
 * Returns 0 on success
1338
 *        -1 on failure
1339
 */
1340
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1341
28.1k
                            int bam_start) {
1342
28.1k
    int r1, r2, ret = -1;
1343
1344
    // Initialise cram_flags
1345
4.66M
    for (r2 = 0; r2 < s->hdr->num_records; r2++)
1346
4.63M
        s->crecs[r2].cram_flags = 0;
1347
1348
28.1k
    if (!fd->lossy_read_names)
1349
28.1k
        return 0;
1350
1351
0
    khash_t(m_s2u64) *names = kh_init(m_s2u64);
1352
0
    if (!names)
1353
0
        goto fail;
1354
1355
    // 1: Iterate through names to count frequency
1356
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1357
        //cram_record *cr = &s->crecs[r2];
1358
0
        bam_seq_t *b = c->bams[r1];
1359
0
        khint_t k;
1360
0
        int n;
1361
0
        uint64_t e;
1362
0
        union {
1363
0
            uint64_t i64;
1364
0
            struct {
1365
0
                int32_t e,c; // expected & observed counts.
1366
0
            } counts;
1367
0
        } u;
1368
1369
0
        e = expected_template_count(b);
1370
0
        u.counts.e = e; u.counts.c = 1;
1371
1372
0
        k = kh_put(m_s2u64, names, bam_name(b), &n);
1373
0
        if (n == -1)
1374
0
            goto fail;
1375
1376
0
        if (n == 0) {
1377
            // not a new name
1378
0
            u.i64 = kh_val(names, k);
1379
0
            if (u.counts.e != e) {
1380
                // different expectation or already hit the max
1381
                //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1382
0
                kh_val(names, k) = 0;
1383
0
            } else {
1384
0
                u.counts.c++;
1385
0
                if (u.counts.e == u.counts.c) {
1386
                    // Reached expected count.
1387
0
                    kh_val(names, k) = -1;
1388
0
                } else {
1389
0
                    kh_val(names, k) = u.i64;
1390
0
                }
1391
0
            }
1392
0
        } else {
1393
            // new name
1394
0
            kh_val(names, k) = u.i64;
1395
0
        }
1396
0
    }
1397
1398
    // 2: Remove names if all present (hd.i == -1)
1399
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1400
0
        cram_record *cr = &s->crecs[r2];
1401
0
        bam_seq_t *b = c->bams[r1];
1402
0
        khint_t k;
1403
1404
0
        k = kh_get(m_s2u64, names, bam_name(b));
1405
1406
0
        if (k == kh_end(names))
1407
0
            goto fail;
1408
1409
0
        if (kh_val(names, k) == -1)
1410
0
            cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1411
0
    }
1412
1413
0
    ret = 0;
1414
0
 fail: // ret==-1
1415
1416
0
    if (names)
1417
0
        kh_destroy(m_s2u64, names);
1418
1419
0
    return ret;
1420
0
}
1421
1422
/*
1423
 * Adds the reading names.  We do this here as a separate pass rather
1424
 * than per record in the process_one_read calls as that function can
1425
 * go back and change the CRAM_FLAG_DETACHED status of a previously
1426
 * processed read if it subsequently determines the TLEN field is
1427
 * incorrect.  Given DETACHED reads always try to decode read names,
1428
 * we need to know their status before generating the read-name block.
1429
 *
1430
 * Output is an update s->name_blk, and cr->name / cr->name_len
1431
 * fields.
1432
 */
1433
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1434
27.6k
                          int bam_start) {
1435
27.6k
    int r1, r2;
1436
27.6k
    int keep_names = !fd->lossy_read_names;
1437
1438
27.6k
    for (r1 = bam_start, r2 = 0;
1439
4.65M
         r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1440
4.63M
         r1++, r2++) {
1441
4.63M
        cram_record *cr = &s->crecs[r2];
1442
4.63M
        bam_seq_t *b = c->bams[r1];
1443
1444
4.63M
        cr->name        = BLOCK_SIZE(s->name_blk);
1445
4.63M
        if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1446
4.63M
            if (CRAM_MAJOR_VERS(fd->version) >= 4
1447
0
                && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1448
0
                && cr->mate_line) {
1449
                // Dedup read names in V4
1450
0
                BLOCK_APPEND(s->name_blk, "\0", 1);
1451
0
                cr->name_len    = 1;
1452
4.63M
            } else {
1453
4.63M
                BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1454
4.63M
                cr->name_len    = bam_name_len(b);
1455
4.63M
            }
1456
4.63M
        } else {
1457
            // Can only discard duplicate names if not detached
1458
0
            cr->name_len = 0;
1459
0
        }
1460
1461
4.63M
        if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1462
0
            goto block_err;
1463
4.63M
    }
1464
1465
27.6k
    return 0;
1466
1467
0
 block_err:
1468
0
    return -1;
1469
27.6k
}
1470
1471
// CRAM version >= 3.1
1472
41.7k
#define CRAM_ge31(v) ((v) >= 0x301)
1473
1474
// Returns the next cigar op code: one of the BAM_C* codes,
1475
// or -1 if no more are present.
1476
static inline
1477
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
1478
2.85k
                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
1479
4.75k
    for(;;) {
1480
11.7k
        while (*cig_len == 0) {
1481
7.03k
            if (*cig_ind < ncigar) {
1482
7.03k
                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
1483
7.03k
                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
1484
7.03k
                (*cig_ind)++;
1485
7.03k
            } else {
1486
0
                return -1;
1487
0
            }
1488
7.03k
        }
1489
1490
4.75k
        if (skip[*cig_op]) {
1491
1.89k
            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
1492
1.89k
            *cig_len = 0;
1493
1.89k
            continue;
1494
1.89k
        }
1495
1496
2.85k
        (*cig_len)--;
1497
2.85k
        break;
1498
4.75k
    }
1499
1500
2.85k
    return *cig_op;
1501
2.85k
}
1502
1503
// Ensure ref and hist are large enough.
1504
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
1505
808k
                             hts_pos_t ref_start, hts_pos_t *ref_end) {
1506
808k
    if (pos < ref_start)
1507
115
        return -1;
1508
808k
    if (pos < *ref_end)
1509
792k
        return 0;
1510
1511
    // realloc
1512
15.8k
    if (pos - ref_start > UINT_MAX)
1513
7
        return -2; // protect overflow in new_end calculation
1514
1515
15.8k
    hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
1516
15.8k
    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
1517
1518
    // Refuse to work on excessively large blocks.
1519
    // We'll just switch to referenceless encoding, which is probably better
1520
    // here as this must be very sparse data anyway.
1521
15.8k
    if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
1522
408
        return -2;
1523
1524
15.4k
    char *tmp = realloc(*ref, new_end-ref_start+1);
1525
15.4k
    if (!tmp)
1526
0
        return -1;
1527
15.4k
    *ref = tmp;
1528
1529
15.4k
    uint32_t (*tmp5)[5] = realloc(**hist,
1530
15.4k
                                  (new_end - ref_start)*sizeof(**hist));
1531
15.4k
    if (!tmp5)
1532
0
        return -1;
1533
15.4k
    *hist = tmp5;
1534
15.4k
    *ref_end = new_end;
1535
1536
    // initialise
1537
15.4k
    old_end -= ref_start;
1538
15.4k
    new_end -= ref_start;
1539
15.4k
    memset(&(*ref)[old_end],  0,  new_end-old_end);
1540
15.4k
    memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
1541
1542
15.4k
    return 0;
1543
15.4k
}
1544
1545
// Walk through MD + seq to generate ref
1546
// Returns 1 on success, <0 on failure
1547
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
1548
                              hts_pos_t ref_start, hts_pos_t *ref_end,
1549
9.19k
                              const uint8_t *MD) {
1550
9.19k
    uint8_t *seq = bam_get_seq(b);
1551
9.19k
    uint32_t *cigar = bam_get_cigar(b);
1552
9.19k
    uint32_t ncigar = b->core.n_cigar;
1553
9.19k
    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
1554
1555
    // End position of the sequence on the reference.
1556
9.19k
    hts_pos_t rlen = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1557
9.19k
    hts_pos_t rseq_end = b->core.pos + (rlen ? rlen : b->core.l_qseq);
1558
1559
    // No sequence means extend based on CIGAR instead
1560
9.19k
    if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end,
1561
5.93k
                                      ref_start, ref_end) < 0)
1562
0
        return -1;
1563
1564
9.19k
    int iseq = 0, next_op;
1565
9.19k
    hts_pos_t iref = b->core.pos - ref_start;
1566
1567
    // Skip INS, REF_SKIP, *CLIP, PAD. and BACK.
1568
9.19k
    static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
1569
10.0k
    while (iseq < b->core.l_qseq && *MD) {
1570
3.40k
        if (isdigit(*MD)) {
1571
            // match
1572
2.52k
            int overflow = 0;
1573
2.52k
            int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
1574
2.52k
            if (overflow ||
1575
1.99k
                extend_ref(ref, hist, iref+ref_start + len,
1576
1.99k
                           ref_start, ref_end) < 0)
1577
674
                return -1;
1578
1.85k
            while (iseq < b->core.l_qseq && len) {
1579
                // rewrite to have internal loops?
1580
1.71k
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1581
1.71k
                                             &iseq, &cig_ind, &cig_op,
1582
1.71k
                                             &cig_len)) < 0)
1583
0
                    return -1;
1584
1585
1.71k
                if (next_op != BAM_CMATCH &&
1586
1.71k
                    next_op != BAM_CEQUAL) {
1587
1.71k
                    hts_log_info("MD:Z and CIGAR are incompatible for "
1588
1.71k
                                 "record %s", bam_get_qname(b));
1589
1.71k
                    return -1;
1590
1.71k
                }
1591
1592
                // Short-cut loop over same cigar op for efficiency
1593
0
                cig_len++;
1594
0
                do {
1595
0
                    cig_len--;
1596
0
                    (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
1597
0
                    iseq++;
1598
0
                    len--;
1599
0
                } while (cig_len && iseq < b->core.l_qseq && len);
1600
0
            }
1601
141
            if (len > 0)
1602
0
                return -1; // MD is longer than seq
1603
878
        } else if (*MD == '^') {
1604
            // deletion
1605
20
            MD++;
1606
292
            while (isalpha(*MD)) {
1607
292
                if (extend_ref(ref, hist, iref+ref_start, ref_start,
1608
292
                               ref_end) < 0)
1609
0
                    return -1;
1610
292
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1611
292
                                             &iseq, &cig_ind, &cig_op,
1612
292
                                             &cig_len)) < 0)
1613
0
                    return -1;
1614
1615
292
                if (next_op != BAM_CDEL) {
1616
0
                    hts_log_info("MD:Z and CIGAR are incompatible");
1617
0
                    return -1;
1618
0
                }
1619
1620
292
                (*ref)[iref++] = *MD++ & ~0x20;
1621
292
            }
1622
858
        } else {
1623
            // substitution
1624
858
            if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
1625
4
                return -1;
1626
854
            if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1627
854
                                         &iseq, &cig_ind, &cig_op,
1628
854
                                         &cig_len)) < 0)
1629
0
                return -1;
1630
1631
854
            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
1632
162
                hts_log_info("MD:Z and CIGAR are incompatible");
1633
162
                return -1;
1634
162
            }
1635
1636
692
            (*ref)[iref++] = *MD++ & ~0x20;
1637
692
            iseq++;
1638
692
        }
1639
3.40k
    }
1640
1641
6.64k
    return 1;
1642
9.19k
}
1643
1644
// Append a sequence to a ref/consensus structure.
1645
// We maintain both an absolute refefence (ACGTN where MD:Z is
1646
// present) and a 5-way frequency array for when no MD:Z is known.
1647
// We then subsequently convert the 5-way frequencies to a consensus
1648
// ref in a second pass.
1649
//
1650
// Returns >=0 on success,
1651
//         -1 on failure (eg inconsistent data)
1652
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
1653
35.6k
                           hts_pos_t ref_start, hts_pos_t *ref_end) {
1654
35.6k
    const uint8_t *MD = bam_aux_get(b, "MD");
1655
35.6k
    int ret = 0;
1656
35.6k
    if (MD && *MD == 'Z') {
1657
        // We can use MD to directly compute the reference
1658
9.19k
        int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
1659
1660
9.19k
        if (ret > 0)
1661
6.64k
            return ret;
1662
9.19k
    }
1663
1664
    // Otherwise we just use SEQ+CIGAR and build a consensus which we later
1665
    // turn into a fake reference
1666
29.0k
    uint32_t *cigar = bam_get_cigar(b);
1667
29.0k
    uint32_t ncigar = b->core.n_cigar;
1668
29.0k
    uint32_t i, j;
1669
29.0k
    hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
1670
29.0k
    uint8_t *seq = bam_get_seq(b);
1671
924k
    for (i = 0; i < ncigar; i++) {
1672
896k
        switch (bam_cigar_op(cigar[i])) {
1673
31.6k
        case BAM_CSOFT_CLIP:
1674
43.8k
        case BAM_CINS:
1675
43.8k
            iseq += bam_cigar_oplen(cigar[i]);
1676
43.8k
            break;
1677
1678
775k
        case BAM_CMATCH:
1679
785k
        case BAM_CEQUAL:
1680
785k
        case BAM_CDIFF: {
1681
785k
            int len = bam_cigar_oplen(cigar[i]);
1682
            // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
1683
785k
            static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
1684
1685
785k
            if (extend_ref(ref, hist, iref+ref_start + len,
1686
785k
                           ref_start, ref_end) < 0)
1687
249
                return -1;
1688
785k
            if (iseq + len <= b->core.l_qseq) {
1689
                // Nullify failed MD:Z if appropriate
1690
18.9k
                if (ret < 0)
1691
0
                    memset(&(*ref)[iref], 0, len);
1692
1693
27.8k
                for (j = 0; j < len; j++, iref++, iseq++)
1694
8.93k
                    (*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
1695
766k
            } else {
1696
                // Probably a 2ndary read with seq "*"
1697
766k
                iseq += len;
1698
766k
                iref += len;
1699
766k
            }
1700
785k
            break;
1701
785k
        }
1702
1703
29.7k
        case BAM_CDEL:
1704
30.5k
        case BAM_CREF_SKIP:
1705
30.5k
            iref += bam_cigar_oplen(cigar[i]);
1706
896k
        }
1707
896k
    }
1708
1709
28.7k
    return 1;
1710
29.0k
}
1711
1712
// Automatically generates the reference and stashed it in c->ref, also
1713
// setting c->ref_start and c->ref_end.
1714
//
1715
// If we have MD:Z tags then we use them to directly infer the reference,
1716
// along with SEQ + CIGAR.  Otherwise we use SEQ/CIGAR only to build up
1717
// a consensus and then assume the reference as the majority rule.
1718
//
1719
// In this latter scenario we need to be wary of auto-generating MD and NM
1720
// during decode, but that's handled elsewhere via an additional aux tag.
1721
//
1722
// Returns 0 on success,
1723
//        -1 on failure
1724
13.2k
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
1725
    // TODO: if we can find an external reference then use it, even if the
1726
    // user told us to do embed_ref=2.
1727
13.2k
    char *ref = NULL;
1728
13.2k
    uint32_t (*hist)[5] = NULL;
1729
13.2k
    hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
1730
13.2k
    if (ref_start < 0)
1731
1
        return -1; // cannot build consensus from unmapped data
1732
1733
    // initial allocation
1734
13.2k
    if (extend_ref(&ref, &hist,
1735
13.2k
                   c->bams[r1 + s->hdr->num_records-1]->core.pos +
1736
13.2k
                   c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
1737
13.2k
                   ref_start, &ref_end) < 0)
1738
136
        return -1;
1739
1740
    // Add each bam file to the reference/consensus arrays
1741
13.1k
    int r2;
1742
13.1k
    hts_pos_t last_pos = -1;
1743
48.5k
    for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1744
35.7k
        if (c->bams[r1]->core.pos < last_pos) {
1745
30
            hts_log_error("Cannot build reference with unsorted data");
1746
30
            goto err;
1747
30
        }
1748
35.6k
        last_pos = c->bams[r1]->core.pos;
1749
35.6k
        if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
1750
249
            goto err;
1751
35.6k
    }
1752
1753
    // Compute the consensus
1754
12.8k
    hts_pos_t i;
1755
1.08G
    for (i = 0; i < ref_end-ref_start; i++) {
1756
1.08G
        if (!ref[i]) {
1757
1.08G
            int max_v = 0, max_j = 4, j;
1758
5.42G
            for (j = 0; j < 4; j++)
1759
                // don't call N (j==4) unless no coverage
1760
4.34G
                if (max_v < hist[i][j])
1761
1.66k
                    max_v = hist[i][j], max_j = j;
1762
1.08G
            ref[i] = "ACGTN"[max_j];
1763
1.08G
        }
1764
1.08G
    }
1765
12.8k
    free(hist);
1766
1767
    // Put the reference in place so it appears to be an external
1768
    // ref file.
1769
12.8k
    c->ref       = ref;
1770
12.8k
    c->ref_start = ref_start+1;
1771
12.8k
    c->ref_end   = ref_end+1;
1772
12.8k
    c->ref_free  = 1;
1773
12.8k
    return 0;
1774
1775
279
 err:
1776
279
    free(ref);
1777
279
    free(hist);
1778
279
    return -1;
1779
13.1k
}
1780
1781
// Check if the SQ M5 tag matches the reference we've loaded.
1782
0
static int validate_md5(cram_fd *fd, int ref_id) {
1783
0
    if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
1784
0
        return 0;
1785
1786
    // Have we already checked this ref?
1787
0
    if (fd->refs->ref_id[ref_id]->validated_md5)
1788
0
        return 0;
1789
1790
    // Check if we have the MD5 known.
1791
    // We should, but maybe we're using embedded references?
1792
0
    sam_hrecs_t *hrecs = fd->header->hrecs;
1793
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
1794
0
                                                 hrecs->ref[ref_id].name);
1795
0
    if (!ty)
1796
0
        return 0;
1797
1798
0
    sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
1799
0
    if (!m5tag)
1800
0
        return 0;
1801
1802
    // It's known, so compute md5 on the loaded reference sequence.
1803
0
    char *ref = fd->refs->ref_id[ref_id]->seq;
1804
0
    int64_t len = fd->refs->ref_id[ref_id]->length;
1805
0
    hts_md5_context *md5;
1806
0
    char unsigned buf[16];
1807
0
    char buf2[33];
1808
1809
0
    if (!(md5 = hts_md5_init()))
1810
0
        return -1;
1811
0
    hts_md5_update(md5, ref, len);
1812
0
    hts_md5_final(buf, md5);
1813
0
    hts_md5_destroy(md5);
1814
0
    hts_md5_hex(buf2, buf);
1815
1816
    // Compare it to header @SQ M5 tag
1817
0
    if (strcmp(m5tag->str+3, buf2)) {
1818
0
        hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
1819
0
                      hrecs->ref[ref_id].name);
1820
0
        hts_log_error("Please use the correct reference, or "
1821
0
                      "consider using embed_ref=2");
1822
0
        return -1;
1823
0
    }
1824
0
    fd->refs->ref_id[ref_id]->validated_md5 = 1;
1825
1826
0
    return 0;
1827
0
}
1828
1829
/*
1830
 * Encodes all slices in a container into blocks.
1831
 * Returns 0 on success
1832
 *        -1 on failure
1833
 */
1834
27.7k
int cram_encode_container(cram_fd *fd, cram_container *c) {
1835
27.7k
    int i, j, slice_offset;
1836
27.7k
    cram_block_compression_hdr *h = c->comp_hdr;
1837
27.7k
    cram_block *c_hdr;
1838
27.7k
    int multi_ref = 0;
1839
27.7k
    int r1, r2, sn, nref, embed_ref, no_ref;
1840
27.7k
    spare_bams *spares;
1841
1842
27.7k
    if (!c->bams)
1843
0
        goto err;
1844
1845
27.7k
    if (CRAM_MAJOR_VERS(fd->version) == 1)
1846
0
        goto err;
1847
1848
//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1849
27.7k
#define goto_err goto err
1850
1851
    // Don't try embed ref if we repeatedly fail
1852
27.7k
    pthread_mutex_lock(&fd->ref_lock);
1853
27.7k
    int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
1854
27.7k
    if (!failed_embed && c->embed_ref == -2 && c->ref_id >= 0) {
1855
316
        hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
1856
316
        fd->no_ref = c->no_ref = 0;
1857
316
        fd->embed_ref = c->embed_ref = 2;
1858
27.4k
    } else if (failed_embed && c->embed_ref == -2) {
1859
        // We've tried several times, so this time give up for good
1860
15
        hts_log_warning("Keeping non-ref mode from now on");
1861
15
        fd->embed_ref = c->embed_ref = 0;
1862
15
    }
1863
27.7k
    pthread_mutex_unlock(&fd->ref_lock);
1864
1865
28.2k
 restart:
1866
    /* Cache references up-front if we have unsorted access patterns */
1867
28.2k
    pthread_mutex_lock(&fd->ref_lock);
1868
28.2k
    nref = fd->refs->nref;
1869
28.2k
    pthread_mutex_unlock(&fd->ref_lock);
1870
28.2k
    embed_ref = c->embed_ref;
1871
28.2k
    no_ref = c->no_ref;
1872
1873
    /* To create M5 strings */
1874
    /* Fetch reference sequence */
1875
28.2k
    if (!no_ref) {
1876
27.2k
        if (!c->bams || !c->curr_c_rec || !c->bams[0])
1877
13
            goto_err;
1878
27.2k
        bam_seq_t *b = c->bams[0];
1879
1880
27.2k
        if (embed_ref <= 1) {
1881
1.08k
            char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1882
1.08k
            if (!ref && bam_ref(b) >= 0) {
1883
3
                if (!c->pos_sorted) {
1884
                    // TODO: maybe also check fd->no_ref?
1885
0
                    hts_log_warning("Failed to load reference #%d",
1886
0
                                    bam_ref(b));
1887
0
                    hts_log_warning("Switching to non-ref mode");
1888
1889
0
                    pthread_mutex_lock(&fd->ref_lock);
1890
0
                    c->embed_ref = fd->embed_ref = 0;
1891
0
                    c->no_ref = fd->no_ref = 1;
1892
0
                    pthread_mutex_unlock(&fd->ref_lock);
1893
0
                    goto restart;
1894
0
                }
1895
1896
3
                if (c->multi_seq || embed_ref == 0) {
1897
0
                    hts_log_error("Failed to load reference #%d", bam_ref(b));
1898
0
                    return -1;
1899
0
                }
1900
3
                hts_log_warning("Failed to load reference #%d", bam_ref(b));
1901
3
                hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
1902
3
                                " reference");
1903
3
                if (embed_ref <= 0)
1904
3
                    hts_log_warning("NOTE: the CRAM file will be bigger than"
1905
3
                                    " using an external reference");
1906
3
                pthread_mutex_lock(&fd->ref_lock);
1907
3
                embed_ref = c->embed_ref = fd->embed_ref = 2;
1908
3
                pthread_mutex_unlock(&fd->ref_lock);
1909
3
                goto auto_ref;
1910
1.08k
            } else if (ref) {
1911
0
                if (validate_md5(fd, c->ref_seq_id) < 0)
1912
0
                    goto_err;
1913
0
            }
1914
1.08k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1915
0
                c->ref_seq_id = c->ref_id;
1916
0
                c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1917
0
                c->ref_start = 1;
1918
0
                c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1919
0
            }
1920
26.1k
        } else {
1921
26.1k
        auto_ref:
1922
            // Auto-embed ref.
1923
            // This starts as 'N' and is amended on-the-fly as we go
1924
            // based on MD:Z tags.
1925
26.1k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1926
13.2k
                c->ref = NULL;
1927
                // c->ref_free is boolean; whether to free c->ref.  In this
1928
                // case c->ref will be our auto-embedded sequence instead of
1929
                // a "global" portion of reference from fd->refs.
1930
                // Do not confuse with fd->ref_free which is a pointer to a
1931
                // reference string to free.
1932
13.2k
                c->ref_free = 1;
1933
13.2k
            } else {
1934
                // Double check for broken input.  We shouldn't have
1935
                // embedded references enabled for unmapped data, but our
1936
                // data could be broken.
1937
12.8k
                embed_ref = 0;
1938
12.8k
                no_ref = c->no_ref = 1;
1939
12.8k
            }
1940
26.1k
        }
1941
27.2k
        c->ref_seq_id = c->ref_id;
1942
27.2k
    } else {
1943
968
        c->ref_id = bam_ref(c->bams[0]);
1944
968
        cram_ref_incr(fd->refs, c->ref_id);
1945
968
        c->ref_seq_id = c->ref_id;
1946
968
    }
1947
1948
28.1k
    if (!no_ref && c->refs_used) {
1949
383
        for (i = 0; i < nref; i++) {
1950
236
            if (c->refs_used[i]) {
1951
142
                if (cram_get_ref(fd, i, 1, 0)) {
1952
0
                    if (validate_md5(fd, i) < 0)
1953
0
                        goto_err;
1954
142
                } else {
1955
142
                    hts_log_warning("Failed to find reference, "
1956
142
                                    "switching to non-ref mode");
1957
142
                    no_ref = c->no_ref = 1;
1958
142
                }
1959
142
            }
1960
236
        }
1961
147
    }
1962
1963
    /* Turn bams into cram_records and gather basic stats */
1964
55.8k
    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1965
28.1k
        cram_slice *s = c->slices[sn];
1966
28.1k
        int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1967
1968
28.1k
        int r1_start = r1;
1969
1970
28.1k
        assert(sn < c->curr_slice);
1971
1972
        // Discover which read names *may* be safely removed.
1973
        // Ie which ones have all their records in this slice.
1974
28.1k
        if (lossy_read_names(fd, c, s, r1_start) != 0)
1975
0
            return -1;
1976
1977
        // Tracking of MD tags so we can spot when the auto-generated values
1978
        // will differ from the current stored ones.  The kstring here is
1979
        // simply to avoid excessive malloc and free calls.  All initialisation
1980
        // is done within process_one_read().
1981
28.1k
        kstring_t MD = {0};
1982
1983
        // Embed consensus / MD-generated ref
1984
28.1k
        if (embed_ref == 2) {
1985
13.2k
            if (c->ref_id < 0 || cram_generate_reference(c, s, r1) < 0) {
1986
                // Should this be a permanent thing via fd->no_ref?
1987
                // Doing so means we cannot easily switch back again should
1988
                // things fix themselves later on.  This is likely not a
1989
                // concern though as failure to generate a reference implies
1990
                // unsorted data which is rarely recovered from.
1991
1992
                // Only if sn == 0.  We're hosed if we're on the 2nd slice and
1993
                // the first worked, as no-ref is a container global param.
1994
416
                if (sn > 0) {
1995
0
                    hts_log_error("Failed to build reference, "
1996
0
                                  "switching to non-ref mode");
1997
0
                    return -1;
1998
416
                } else {
1999
416
                    hts_log_warning("Failed to build reference, "
2000
416
                                    "switching to non-ref mode");
2001
416
                }
2002
416
                pthread_mutex_lock(&fd->ref_lock);
2003
416
                c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
2004
416
                c->no_ref = fd->no_ref = 1;
2005
416
                fd->no_ref_counter++; // more likely to keep permanent action
2006
416
                pthread_mutex_unlock(&fd->ref_lock);
2007
416
                failed_embed = 1;
2008
416
                goto restart;
2009
12.8k
            } else {
2010
12.8k
                pthread_mutex_lock(&fd->ref_lock);
2011
12.8k
                fd->no_ref_counter -= (fd->no_ref_counter > 0);
2012
12.8k
                pthread_mutex_unlock(&fd->ref_lock);
2013
12.8k
            }
2014
2015
12.8k
            if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
2016
11.6k
                c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
2017
12.8k
        }
2018
2019
        // Iterate through records creating the cram blocks for some
2020
        // fields and just gathering stats for others.
2021
4.66M
        for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
2022
4.63M
            cram_record *cr = &s->crecs[r2];
2023
4.63M
            bam_seq_t *b = c->bams[r1];
2024
2025
            /* If multi-ref we need to cope with changing reference per seq */
2026
4.63M
            if (c->multi_seq && !no_ref) {
2027
411
                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
2028
0
                    if (c->ref_seq_id >= 0)
2029
0
                        cram_ref_decr(fd->refs, c->ref_seq_id);
2030
2031
0
                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
2032
0
                        hts_log_error("Failed to load reference #%d", bam_ref(b));
2033
0
                        free(MD.s);
2034
0
                        return -1;
2035
0
                    }
2036
0
                    if (validate_md5(fd, bam_ref(b)) < 0)
2037
0
                        return -1;
2038
2039
0
                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
2040
0
                    if (!fd->refs->ref_id[c->ref_seq_id]->seq)
2041
0
                        return -1;
2042
0
                    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
2043
0
                    c->ref_start = 1;
2044
0
                    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
2045
0
                }
2046
411
            }
2047
2048
4.63M
            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
2049
4.63M
                                 no_ref) != 0) {
2050
94
                free(MD.s);
2051
94
                return -1;
2052
94
            }
2053
2054
4.63M
            if (first_base > cr->apos)
2055
28.0k
                first_base = cr->apos;
2056
2057
4.63M
            if (last_base < cr->aend)
2058
27.9k
                last_base = cr->aend;
2059
4.63M
        }
2060
2061
27.6k
        free(MD.s);
2062
2063
        // Process_one_read doesn't add read names as it can change
2064
        // its mind during the loop on the CRAM_FLAG_DETACHED setting
2065
        // of earlier records (if it detects the auto-generation of
2066
        // TLEN is incorrect).  This affects which read-names can be
2067
        // lossily compressed, so we do these in another pass.
2068
27.6k
        if (add_read_names(fd, c, s, r1_start) < 0)
2069
0
            return -1;
2070
2071
27.6k
        if (c->multi_seq) {
2072
539
            s->hdr->ref_seq_id    = -2;
2073
539
            s->hdr->ref_seq_start = 0;
2074
539
            s->hdr->ref_seq_span  = 0;
2075
27.1k
        } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
2076
            // Spec states span=0, but it broke our range queries.
2077
            // See commit message for this and prior.
2078
13.9k
            s->hdr->ref_seq_id    = -1;
2079
13.9k
            s->hdr->ref_seq_start = 0;
2080
13.9k
            s->hdr->ref_seq_span  = 0;
2081
13.9k
        } else {
2082
13.2k
            s->hdr->ref_seq_id    = c->ref_id;
2083
13.2k
            s->hdr->ref_seq_start = first_base;
2084
13.2k
            s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
2085
13.2k
        }
2086
27.6k
        s->hdr->num_records = r2;
2087
2088
        // Processed a slice, now stash the aux blocks so the next
2089
        // slice can start aggregating them from the start again.
2090
27.6k
        if (c->tags_used->n_occupied) {
2091
23.9k
            int ntags = c->tags_used->n_occupied;
2092
23.9k
            s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
2093
23.9k
            if (!s->aux_block)
2094
0
                return -1;
2095
2096
23.9k
            khint_t k;
2097
2098
23.9k
            s->naux_block = 0;
2099
259k
            for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2100
235k
                if (!kh_exist(c->tags_used, k))
2101
119k
                    continue;
2102
2103
115k
                cram_tag_map *tm = kh_val(c->tags_used, k);
2104
115k
                if (!tm) goto_err;
2105
115k
                if (!tm->blk) continue;
2106
115k
                s->aux_block[s->naux_block++] = tm->blk;
2107
115k
                tm->blk = NULL;
2108
115k
                if (!tm->blk2) continue;
2109
0
                s->aux_block[s->naux_block++] = tm->blk2;
2110
0
                tm->blk2 = NULL;
2111
0
            }
2112
23.9k
            assert(s->naux_block <= 2*c->tags_used->n_occupied);
2113
23.9k
        }
2114
27.6k
    }
2115
2116
27.6k
    if (c->multi_seq && !no_ref) {
2117
5
        if (c->ref_seq_id >= 0)
2118
0
            cram_ref_decr(fd->refs, c->ref_seq_id);
2119
5
    }
2120
2121
    /* Link our bams[] array onto the spare bam list for reuse */
2122
27.6k
    spares = malloc(sizeof(*spares));
2123
27.6k
    if (!spares) goto_err;
2124
27.6k
    pthread_mutex_lock(&fd->bam_list_lock);
2125
27.6k
    spares->bams = c->bams;
2126
27.6k
    spares->next = fd->bl;
2127
27.6k
    fd->bl = spares;
2128
27.6k
    pthread_mutex_unlock(&fd->bam_list_lock);
2129
27.6k
    c->bams = NULL;
2130
2131
    /* Detect if a multi-seq container */
2132
27.6k
    cram_stats_encoding(fd, c->stats[DS_RI]);
2133
27.6k
    multi_ref = c->stats[DS_RI]->nvals > 1;
2134
27.6k
    pthread_mutex_lock(&fd->metrics_lock);
2135
27.6k
    fd->last_RI_count = c->stats[DS_RI]->nvals;
2136
27.6k
    pthread_mutex_unlock(&fd->metrics_lock);
2137
2138
2139
27.6k
    if (multi_ref) {
2140
238
        hts_log_info("Multi-ref container");
2141
238
        c->ref_seq_id = -2;
2142
238
        c->ref_seq_start = 0;
2143
238
        c->ref_seq_span = 0;
2144
238
    }
2145
2146
2147
    /* Compute MD5s */
2148
27.6k
    no_ref = c->no_ref;
2149
27.6k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
2150
2151
55.3k
    for (i = 0; i < c->curr_slice; i++) {
2152
27.6k
        cram_slice *s = c->slices[i];
2153
2154
27.6k
        if (CRAM_MAJOR_VERS(fd->version) != 1) {
2155
27.6k
            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
2156
12.8k
                hts_md5_context *md5 = hts_md5_init();
2157
12.8k
                if (!md5)
2158
0
                    return -1;
2159
12.8k
                hts_md5_update(md5,
2160
12.8k
                               c->ref + s->hdr->ref_seq_start - c->ref_start,
2161
12.8k
                               s->hdr->ref_seq_span);
2162
12.8k
                hts_md5_final(s->hdr->md5, md5);
2163
12.8k
                hts_md5_destroy(md5);
2164
14.8k
            } else {
2165
14.8k
                memset(s->hdr->md5, 0, 16);
2166
14.8k
            }
2167
27.6k
        }
2168
27.6k
    }
2169
2170
27.6k
    c->num_records = 0;
2171
27.6k
    c->num_blocks = 1; // cram_block_compression_hdr
2172
27.6k
    c->length = 0;
2173
2174
    //fprintf(stderr, "=== BF ===\n");
2175
27.6k
    h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
2176
27.6k
                                         c->stats[DS_BF], E_INT, NULL,
2177
27.6k
                                         fd->version, &fd->vv);
2178
27.6k
    if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
2179
2180
    //fprintf(stderr, "=== CF ===\n");
2181
27.6k
    h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
2182
27.6k
                                         c->stats[DS_CF], E_INT, NULL,
2183
27.6k
                                         fd->version, &fd->vv);
2184
27.6k
    if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
2185
2186
    //fprintf(stderr, "=== RN ===\n");
2187
    //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
2188
    //                                     c->stats[DS_RN], E_BYTE_ARRAY, NULL,
2189
    //                                     fd->version);
2190
2191
    //fprintf(stderr, "=== AP ===\n");
2192
27.6k
    if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
2193
26.9k
        if (c->pos_sorted)
2194
26.9k
            h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
2195
26.9k
                                                 c->stats[DS_AP],
2196
26.9k
                                                 is_v4 ? E_LONG : E_INT,
2197
26.9k
                                                 NULL, fd->version, &fd->vv);
2198
0
        else
2199
            // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
2200
0
            h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
2201
0
                                                       : E_EXTERNAL,
2202
0
                                                 NULL,
2203
0
                                                 is_v4 ? E_LONG : E_INT,
2204
0
                                                 NULL, fd->version, &fd->vv);
2205
26.9k
    } else {
2206
        // Removed BETA in v4.0.
2207
        // Should we consider dropping use of it for 3.0 too?
2208
699
        hts_pos_t p[2] = {0, c->max_apos};
2209
699
        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
2210
699
                                             is_v4 ? E_LONG : E_INT,
2211
699
                                             p, fd->version, &fd->vv);
2212
//      cram_xdelta_encoder e;
2213
//      e.word_size = is_v4 ? 8 : 4;
2214
//      e.sub_encoding = E_EXTERNAL;
2215
//      e.sub_codec_dat = (void *)DS_AP;
2216
//
2217
//      h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
2218
//                                           is_v4 ? E_LONG : E_INT,
2219
//                                           &e, fd->version, &fd->vv);
2220
699
    }
2221
27.6k
    if (!h->codecs[DS_AP]) goto_err;
2222
2223
    //fprintf(stderr, "=== RG ===\n");
2224
27.6k
    h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
2225
27.6k
                                         c->stats[DS_RG],
2226
27.6k
                                         E_INT,
2227
27.6k
                                         NULL,
2228
27.6k
                                         fd->version, &fd->vv);
2229
27.6k
    if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
2230
2231
    //fprintf(stderr, "=== MQ ===\n");
2232
27.6k
    h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
2233
27.6k
                                         c->stats[DS_MQ], E_INT, NULL,
2234
27.6k
                                         fd->version, &fd->vv);
2235
27.6k
    if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
2236
2237
    //fprintf(stderr, "=== NS ===\n");
2238
27.6k
    h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
2239
27.6k
                                         c->stats[DS_NS], E_INT, NULL,
2240
27.6k
                                         fd->version, &fd->vv);
2241
27.6k
    if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
2242
2243
    //fprintf(stderr, "=== MF ===\n");
2244
27.6k
    h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
2245
27.6k
                                         c->stats[DS_MF], E_INT, NULL,
2246
27.6k
                                         fd->version, &fd->vv);
2247
27.6k
    if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
2248
2249
    //fprintf(stderr, "=== TS ===\n");
2250
27.6k
    h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
2251
27.6k
                                         c->stats[DS_TS],
2252
27.6k
                                         is_v4 ? E_LONG : E_INT,
2253
27.6k
                                         NULL, fd->version, &fd->vv);
2254
27.6k
    if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
2255
2256
    //fprintf(stderr, "=== NP ===\n");
2257
27.6k
    h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
2258
27.6k
                                         c->stats[DS_NP],
2259
27.6k
                                         is_v4 ? E_LONG : E_INT,
2260
27.6k
                                         NULL, fd->version, &fd->vv);
2261
27.6k
    if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
2262
2263
    //fprintf(stderr, "=== NF ===\n");
2264
27.6k
    h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
2265
27.6k
                                         c->stats[DS_NF], E_INT, NULL,
2266
27.6k
                                         fd->version, &fd->vv);
2267
27.6k
    if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
2268
2269
    //fprintf(stderr, "=== RL ===\n");
2270
27.6k
    h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
2271
27.6k
                                         c->stats[DS_RL], E_INT, NULL,
2272
27.6k
                                         fd->version, &fd->vv);
2273
27.6k
    if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
2274
2275
    //fprintf(stderr, "=== FN ===\n");
2276
27.6k
    h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
2277
27.6k
                                         c->stats[DS_FN], E_INT, NULL,
2278
27.6k
                                         fd->version, &fd->vv);
2279
27.6k
    if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
2280
2281
    //fprintf(stderr, "=== FC ===\n");
2282
27.6k
    h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
2283
27.6k
                                         c->stats[DS_FC], E_BYTE, NULL,
2284
27.6k
                                         fd->version, &fd->vv);
2285
27.6k
    if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
2286
2287
    //fprintf(stderr, "=== FP ===\n");
2288
27.6k
    h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
2289
27.6k
                                         c->stats[DS_FP], E_INT, NULL,
2290
27.6k
                                         fd->version, &fd->vv);
2291
27.6k
    if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
2292
2293
    //fprintf(stderr, "=== DL ===\n");
2294
27.6k
    h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
2295
27.6k
                                         c->stats[DS_DL], E_INT, NULL,
2296
27.6k
                                         fd->version, &fd->vv);
2297
27.6k
    if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
2298
2299
    //fprintf(stderr, "=== BA ===\n");
2300
27.6k
    h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
2301
27.6k
                                         c->stats[DS_BA], E_BYTE, NULL,
2302
27.6k
                                         fd->version, &fd->vv);
2303
27.6k
    if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
2304
2305
27.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2306
27.6k
        cram_byte_array_len_encoder e;
2307
2308
27.6k
        e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2309
27.6k
            ? E_VARINT_UNSIGNED
2310
27.6k
            : E_EXTERNAL;
2311
27.6k
        e.len_dat = (void *)DS_BB_len;
2312
        //e.len_dat = (void *)DS_BB;
2313
2314
27.6k
        e.val_encoding = E_EXTERNAL;
2315
27.6k
        e.val_dat = (void *)DS_BB;
2316
2317
27.6k
        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2318
27.6k
                                             E_BYTE_ARRAY, (void *)&e,
2319
27.6k
                                             fd->version, &fd->vv);
2320
27.6k
        if (!h->codecs[DS_BB]) goto_err;
2321
27.6k
    } else {
2322
0
        h->codecs[DS_BB] = NULL;
2323
0
    }
2324
2325
    //fprintf(stderr, "=== BS ===\n");
2326
27.6k
    h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
2327
27.6k
                                         c->stats[DS_BS], E_BYTE, NULL,
2328
27.6k
                                         fd->version, &fd->vv);
2329
27.6k
    if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
2330
2331
27.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
2332
0
        h->codecs[DS_TL] = NULL;
2333
0
        h->codecs[DS_RI] = NULL;
2334
0
        h->codecs[DS_RS] = NULL;
2335
0
        h->codecs[DS_PD] = NULL;
2336
0
        h->codecs[DS_HC] = NULL;
2337
0
        h->codecs[DS_SC] = NULL;
2338
2339
        //fprintf(stderr, "=== TC ===\n");
2340
0
        h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
2341
0
                                             c->stats[DS_TC], E_BYTE, NULL,
2342
0
                                             fd->version, &fd->vv);
2343
0
        if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
2344
2345
        //fprintf(stderr, "=== TN ===\n");
2346
0
        h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
2347
0
                                             c->stats[DS_TN], E_INT, NULL,
2348
0
                                             fd->version, &fd->vv);
2349
0
        if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
2350
27.6k
    } else {
2351
27.6k
        h->codecs[DS_TC] = NULL;
2352
27.6k
        h->codecs[DS_TN] = NULL;
2353
2354
        //fprintf(stderr, "=== TL ===\n");
2355
27.6k
        h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
2356
27.6k
                                             c->stats[DS_TL], E_INT, NULL,
2357
27.6k
                                             fd->version, &fd->vv);
2358
27.6k
        if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
2359
2360
2361
        //fprintf(stderr, "=== RI ===\n");
2362
27.6k
        h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
2363
27.6k
                                             c->stats[DS_RI], E_INT, NULL,
2364
27.6k
                                             fd->version, &fd->vv);
2365
27.6k
        if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
2366
2367
        //fprintf(stderr, "=== RS ===\n");
2368
27.6k
        h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
2369
27.6k
                                             c->stats[DS_RS], E_INT, NULL,
2370
27.6k
                                             fd->version, &fd->vv);
2371
27.6k
        if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
2372
2373
        //fprintf(stderr, "=== PD ===\n");
2374
27.6k
        h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
2375
27.6k
                                             c->stats[DS_PD], E_INT, NULL,
2376
27.6k
                                             fd->version, &fd->vv);
2377
27.6k
        if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
2378
2379
        //fprintf(stderr, "=== HC ===\n");
2380
27.6k
        h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
2381
27.6k
                                             c->stats[DS_HC], E_INT, NULL,
2382
27.6k
                                             fd->version, &fd->vv);
2383
27.6k
        if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
2384
2385
        //fprintf(stderr, "=== SC ===\n");
2386
27.6k
        if (1) {
2387
27.6k
            int i2[2] = {0, DS_SC};
2388
2389
27.6k
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2390
27.6k
                                                 E_BYTE_ARRAY, (void *)i2,
2391
27.6k
                                                 fd->version, &fd->vv);
2392
27.6k
        } else {
2393
            // Appears to be no practical benefit to using this method,
2394
            // but it may work better if we start mixing SC, IN and BB
2395
            // elements into the same external block.
2396
0
            cram_byte_array_len_encoder e;
2397
2398
0
            e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2399
0
                ? E_VARINT_UNSIGNED
2400
0
                : E_EXTERNAL;
2401
0
            e.len_dat = (void *)DS_SC_len;
2402
2403
0
            e.val_encoding = E_EXTERNAL;
2404
0
            e.val_dat = (void *)DS_SC;
2405
2406
0
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2407
0
                                                 E_BYTE_ARRAY, (void *)&e,
2408
0
                                                 fd->version, &fd->vv);
2409
0
        }
2410
27.6k
        if (!h->codecs[DS_SC]) goto_err;
2411
27.6k
    }
2412
2413
    //fprintf(stderr, "=== IN ===\n");
2414
27.6k
    {
2415
27.6k
        int i2[2] = {0, DS_IN};
2416
27.6k
        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2417
27.6k
                                             E_BYTE_ARRAY, (void *)i2,
2418
27.6k
                                             fd->version, &fd->vv);
2419
27.6k
        if (!h->codecs[DS_IN]) goto_err;
2420
27.6k
    }
2421
2422
27.6k
    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
2423
27.6k
                                         (void *)DS_QS,
2424
27.6k
                                         fd->version, &fd->vv);
2425
27.6k
    if (!h->codecs[DS_QS]) goto_err;
2426
27.6k
    {
2427
27.6k
        int i2[2] = {0, DS_RN};
2428
27.6k
        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2429
27.6k
                                             E_BYTE_ARRAY, (void *)i2,
2430
27.6k
                                             fd->version, &fd->vv);
2431
27.6k
        if (!h->codecs[DS_RN]) goto_err;
2432
27.6k
    }
2433
2434
2435
    /* Encode slices */
2436
55.3k
    for (i = 0; i < c->curr_slice; i++) {
2437
27.6k
        hts_log_info("Encode slice %d", i);
2438
2439
27.6k
        int local_embed_ref =
2440
27.6k
            embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
2441
27.6k
        if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
2442
14
            return -1;
2443
27.6k
    }
2444
2445
    /* Create compression header */
2446
27.6k
    {
2447
27.6k
        h->ref_seq_id    = c->ref_seq_id;
2448
27.6k
        h->ref_seq_start = c->ref_seq_start;
2449
27.6k
        h->ref_seq_span  = c->ref_seq_span;
2450
27.6k
        h->num_records   = c->num_records;
2451
27.6k
        h->qs_seq_orient = c->qs_seq_orient;
2452
        // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
2453
27.6k
        h->AP_delta      = c->pos_sorted;
2454
27.6k
        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
2455
2456
27.6k
        if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
2457
0
            return -1;
2458
27.6k
    }
2459
2460
    /* Compute landmarks */
2461
    /* Fill out slice landmarks */
2462
27.6k
    c->num_landmarks = c->curr_slice;
2463
27.6k
    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
2464
27.6k
    if (!c->landmark)
2465
0
        return -1;
2466
2467
    /*
2468
     * Slice offset starts after the first block, so we need to simulate
2469
     * writing it to work out the correct offset
2470
     */
2471
27.6k
    {
2472
27.6k
        slice_offset = c_hdr->method == RAW
2473
27.6k
            ? c_hdr->uncomp_size
2474
27.6k
            : c_hdr->comp_size;
2475
27.6k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2476
27.6k
            fd->vv.varint_size(c_hdr->content_id) +
2477
27.6k
            fd->vv.varint_size(c_hdr->comp_size) +
2478
27.6k
            fd->vv.varint_size(c_hdr->uncomp_size);
2479
27.6k
    }
2480
2481
27.6k
    c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
2482
27.6k
    if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
2483
        // Spec states span=0, but it broke our range queries.
2484
        // See commit message for this and prior.
2485
13.9k
        c->ref_seq_start = 0;
2486
13.9k
        c->ref_seq_span  = 0;
2487
13.9k
    } else {
2488
13.7k
        c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
2489
13.7k
        c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
2490
13.7k
    }
2491
55.3k
    for (i = 0; i < c->curr_slice; i++) {
2492
27.6k
        cram_slice *s = c->slices[i];
2493
2494
27.6k
        c->num_blocks += s->hdr->num_blocks + 1; // slice header
2495
27.6k
        c->landmark[i] = slice_offset;
2496
2497
27.6k
        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
2498
27.6k
            c->ref_seq_start + c->ref_seq_span) {
2499
0
            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
2500
0
                - c->ref_seq_start;
2501
0
        }
2502
2503
27.6k
        slice_offset += s->hdr_block->method == RAW
2504
27.6k
            ? s->hdr_block->uncomp_size
2505
27.6k
            : s->hdr_block->comp_size;
2506
2507
27.6k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2508
27.6k
            fd->vv.varint_size(s->hdr_block->content_id) +
2509
27.6k
            fd->vv.varint_size(s->hdr_block->comp_size) +
2510
27.6k
            fd->vv.varint_size(s->hdr_block->uncomp_size);
2511
2512
247k
        for (j = 0; j < s->hdr->num_blocks; j++) {
2513
220k
            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2514
220k
                fd->vv.varint_size(s->block[j]->content_id) +
2515
220k
                fd->vv.varint_size(s->block[j]->comp_size) +
2516
220k
                fd->vv.varint_size(s->block[j]->uncomp_size);
2517
2518
220k
            slice_offset += s->block[j]->method == RAW
2519
220k
                ? s->block[j]->uncomp_size
2520
220k
                : s->block[j]->comp_size;
2521
220k
        }
2522
27.6k
    }
2523
27.6k
    c->length += slice_offset; // just past the final slice
2524
2525
27.6k
    c->comp_hdr_block = c_hdr;
2526
2527
27.6k
    if (c->ref_seq_id >= 0) {
2528
13.2k
        if (c->ref_free) {
2529
13.0k
            free(c->ref);
2530
13.0k
            c->ref = NULL;
2531
13.0k
        } else {
2532
150
            cram_ref_decr(fd->refs, c->ref_seq_id);
2533
150
        }
2534
13.2k
    }
2535
2536
    /* Cache references up-front if we have unsorted access patterns */
2537
27.6k
    if (!no_ref && c->refs_used) {
2538
5
        for (i = 0; i < fd->refs->nref; i++) {
2539
0
            if (c->refs_used[i])
2540
0
                cram_ref_decr(fd->refs, i);
2541
0
        }
2542
5
    }
2543
2544
27.6k
    return 0;
2545
2546
28
 err:
2547
28
    return -1;
2548
27.6k
}
2549
2550
2551
/*
2552
 * Adds a feature code to a read within a slice. For purposes of minimising
2553
 * memory allocations and fragmentation we have one array of features for all
2554
 * reads within the slice. We return the index into this array for this new
2555
 * feature.
2556
 *
2557
 * Returns feature index on success
2558
 *         -1 on failure.
2559
 */
2560
static int cram_add_feature(cram_container *c, cram_slice *s,
2561
89.2k
                            cram_record *r, cram_feature *f) {
2562
89.2k
    if (s->nfeatures >= s->afeatures) {
2563
13.4k
        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
2564
13.4k
        s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
2565
13.4k
        if (!s->features)
2566
0
            return -1;
2567
13.4k
    }
2568
2569
89.2k
    if (!r->nfeature++) {
2570
40.5k
        r->feature = s->nfeatures;
2571
40.5k
        if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2572
0
            return -1;
2573
48.7k
    } else {
2574
48.7k
        if (cram_stats_add(c->stats[DS_FP],
2575
48.7k
                           f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2576
0
            return -1;
2577
2578
48.7k
    }
2579
89.2k
    if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2580
0
        return -1;
2581
2582
89.2k
    s->features[s->nfeatures++] = *f;
2583
2584
89.2k
    return 0;
2585
89.2k
}
2586
2587
static int cram_add_substitution(cram_fd *fd, cram_container *c,
2588
                                 cram_slice *s, cram_record *r,
2589
1.54k
                                 int pos, char base, char qual, char ref) {
2590
1.54k
    cram_feature f;
2591
2592
    // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2593
1.54k
    if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2594
1.50k
        f.X.pos = pos+1;
2595
1.50k
        f.X.code = 'X';
2596
1.50k
        f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2597
1.50k
        if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2598
0
            return -1;
2599
1.50k
    } else {
2600
40
        f.B.pos = pos+1;
2601
40
        f.B.code = 'B';
2602
40
        f.B.base = base;
2603
40
        f.B.qual = qual;
2604
40
        if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2605
40
        if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2606
40
        BLOCK_APPEND_CHAR(s->qual_blk, qual);
2607
40
    }
2608
1.54k
    return cram_add_feature(c, s, r, &f);
2609
2610
0
 block_err:
2611
0
    return -1;
2612
1.54k
}
2613
2614
static int cram_add_bases(cram_fd *fd, cram_container *c,
2615
                          cram_slice *s, cram_record *r,
2616
1.87k
                          int pos, int len, char *base) {
2617
1.87k
    cram_feature f;
2618
2619
1.87k
    f.b.pos = pos+1;
2620
1.87k
    f.b.code = 'b';
2621
1.87k
    f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2622
1.87k
    f.b.len = len;
2623
2624
1.87k
    return cram_add_feature(c, s, r, &f);
2625
1.87k
}
2626
2627
static int cram_add_base(cram_fd *fd, cram_container *c,
2628
                         cram_slice *s, cram_record *r,
2629
1.41k
                         int pos, char base, char qual) {
2630
1.41k
    cram_feature f;
2631
1.41k
    f.B.pos = pos+1;
2632
1.41k
    f.B.code = 'B';
2633
1.41k
    f.B.base = base;
2634
1.41k
    f.B.qual = qual;
2635
1.41k
    if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2636
1.41k
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2637
1.41k
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2638
1.41k
    return cram_add_feature(c, s, r, &f);
2639
2640
0
 block_err:
2641
0
    return -1;
2642
1.41k
}
2643
2644
static int cram_add_quality(cram_fd *fd, cram_container *c,
2645
                            cram_slice *s, cram_record *r,
2646
17
                            int pos, char qual) {
2647
17
    cram_feature f;
2648
17
    f.Q.pos = pos+1;
2649
17
    f.Q.code = 'Q';
2650
17
    f.Q.qual = qual;
2651
17
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2652
17
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2653
17
    return cram_add_feature(c, s, r, &f);
2654
2655
0
 block_err:
2656
0
    return -1;
2657
17
}
2658
2659
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2660
40.2k
                             int pos, int len, char *base) {
2661
40.2k
    cram_feature f;
2662
40.2k
    f.D.pos = pos+1;
2663
40.2k
    f.D.code = 'D';
2664
40.2k
    f.D.len = len;
2665
40.2k
    if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2666
40.2k
    return cram_add_feature(c, s, r, &f);
2667
40.2k
}
2668
2669
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2670
27.6k
                             int pos, int len, char *base, int version) {
2671
27.6k
    cram_feature f;
2672
27.6k
    f.S.pos = pos+1;
2673
27.6k
    f.S.code = 'S';
2674
27.6k
    f.S.len = len;
2675
27.6k
    switch (CRAM_MAJOR_VERS(version)) {
2676
0
    case 1:
2677
0
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2678
0
        BLOCK_APPEND(s->base_blk, base, len);
2679
0
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2680
0
        break;
2681
2682
0
    case 2:
2683
27.6k
    default:
2684
27.6k
        f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2685
27.6k
        if (base) {
2686
2.70k
            BLOCK_APPEND(s->soft_blk, base, len);
2687
24.9k
        } else {
2688
24.9k
            int i;
2689
83.7k
            for (i = 0; i < len; i++)
2690
58.7k
                BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2691
24.9k
        }
2692
27.6k
        BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2693
27.6k
        break;
2694
2695
        //default:
2696
        //    // v3.0 onwards uses BB data-series
2697
        //    f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2698
27.6k
    }
2699
27.6k
    return cram_add_feature(c, s, r, &f);
2700
2701
0
 block_err:
2702
0
    return -1;
2703
27.6k
}
2704
2705
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2706
16.3k
                             int pos, int len, char *base) {
2707
16.3k
    cram_feature f;
2708
16.3k
    f.S.pos = pos+1;
2709
16.3k
    f.S.code = 'H';
2710
16.3k
    f.S.len = len;
2711
16.3k
    if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2712
16.3k
    return cram_add_feature(c, s, r, &f);
2713
16.3k
}
2714
2715
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2716
31
                         int pos, int len, char *base) {
2717
31
    cram_feature f;
2718
31
    f.S.pos = pos+1;
2719
31
    f.S.code = 'N';
2720
31
    f.S.len = len;
2721
31
    if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2722
31
    return cram_add_feature(c, s, r, &f);
2723
31
}
2724
2725
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2726
6
                        int pos, int len, char *base) {
2727
6
    cram_feature f;
2728
6
    f.S.pos = pos+1;
2729
6
    f.S.code = 'P';
2730
6
    f.S.len = len;
2731
6
    if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2732
6
    return cram_add_feature(c, s, r, &f);
2733
6
}
2734
2735
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2736
107
                              int pos, int len, char *base) {
2737
107
    cram_feature f;
2738
107
    f.I.pos = pos+1;
2739
107
    if (len == 1) {
2740
47
        char b = base ? *base : 'N';
2741
47
        f.i.code = 'i';
2742
47
        f.i.base = b;
2743
47
        if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2744
60
    } else {
2745
60
        f.I.code = 'I';
2746
60
        f.I.len = len;
2747
60
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2748
60
        if (base) {
2749
3
            BLOCK_APPEND(s->base_blk, base, len);
2750
57
        } else {
2751
57
            int i;
2752
13.4M
            for (i = 0; i < len; i++)
2753
13.4M
                BLOCK_APPEND_CHAR(s->base_blk, 'N');
2754
57
        }
2755
60
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2756
60
    }
2757
107
    return cram_add_feature(c, s, r, &f);
2758
2759
0
 block_err:
2760
0
    return -1;
2761
107
}
2762
2763
/*
2764
 * Encodes auxiliary data. Largely duplicated from above, but done so to
2765
 * keep it simple and avoid a myriad of version ifs.
2766
 *
2767
 * Returns the RG header line pointed to by the BAM aux fields on success,
2768
 *         NULL on failure or no rg present, also sets "*err" to non-zero
2769
 */
2770
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
2771
                                      cram_container *c,
2772
                                      cram_slice *s, cram_record *cr,
2773
                                      int verbatim_NM, int verbatim_MD,
2774
                                      int NM, kstring_t *MD, int cf_tag,
2775
4.63M
                                      int no_ref, int *err) {
2776
4.63M
    char *aux, *orig;
2777
4.63M
    sam_hrec_rg_t *brg = NULL;
2778
4.63M
    int aux_size = bam_get_l_aux(b);
2779
4.63M
    const char *aux_end = bam_data_end(b);
2780
4.63M
    cram_block *td_b = c->comp_hdr->TD_blk;
2781
4.63M
    int TD_blk_size = BLOCK_SIZE(td_b), new;
2782
4.63M
    char *key;
2783
4.63M
    khint_t k;
2784
2785
4.63M
    if (err) *err = 1;
2786
2787
4.63M
    orig = aux = (char *)bam_aux(b);
2788
2789
2790
    // cF:i  => Extra CRAM bit flags.
2791
    // 1:  Don't auto-decode MD (may be invalid)
2792
    // 2:  Don't auto-decode NM (may be invalid)
2793
4.63M
    if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
2794
        // Temporary copy of aux so we can ammend it.
2795
34.9k
        aux = malloc(aux_size+4);
2796
34.9k
        if (!aux)
2797
0
            return NULL;
2798
2799
34.9k
        memcpy(aux, orig, aux_size);
2800
34.9k
        aux[aux_size++] = 'c';
2801
34.9k
        aux[aux_size++] = 'F';
2802
34.9k
        aux[aux_size++] = 'C';
2803
34.9k
        aux[aux_size++] = cf_tag;
2804
34.9k
        orig = aux;
2805
34.9k
        aux_end = aux + aux_size;
2806
34.9k
    }
2807
2808
    // Copy aux keys to td_b and aux values to slice aux blocks
2809
5.19M
    while (aux_end - aux >= 1 && aux[0] != 0) {
2810
560k
        int r;
2811
2812
        // Room for code + type + at least 1 byte of data
2813
560k
        if (aux - orig >= aux_size - 3)
2814
0
            goto err;
2815
2816
        // RG:Z
2817
560k
        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2818
463
            char *rg = &aux[3];
2819
463
            aux = rg;
2820
53.5k
            while (aux < aux_end && *aux++);
2821
463
            if (aux == aux_end && aux[-1] != '\0') {
2822
0
                hts_log_error("Unterminated RG:Z tag for read \"%s\"",
2823
0
                              bam_get_qname(b));
2824
0
                goto err;
2825
0
            }
2826
463
            brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
2827
463
            if (brg) {
2828
182
                if (CRAM_MAJOR_VERS(fd->version) >= 4)
2829
0
                    BLOCK_APPEND(td_b, "RG*", 3);
2830
182
                continue;
2831
281
            } else {
2832
                // RG:Z tag will be stored verbatim
2833
281
                hts_log_warning("Missing @RG header for RG \"%s\"", rg);
2834
281
                aux = rg - 3;
2835
281
            }
2836
463
        }
2837
2838
        // MD:Z
2839
560k
        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2840
17.0k
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2841
2.88k
                if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2842
0
                    while (aux < aux_end && *aux++);
2843
0
                    if (aux == aux_end && aux[-1] != '\0') {
2844
0
                        hts_log_error("Unterminated MD:Z tag for read \"%s\"",
2845
0
                                      bam_get_qname(b));
2846
0
                        goto err;
2847
0
                    }
2848
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2849
0
                        BLOCK_APPEND(td_b, "MD*", 3);
2850
0
                    continue;
2851
0
                }
2852
2.88k
            }
2853
17.0k
        }
2854
2855
        // NM:i
2856
560k
        if (aux[0] == 'N' && aux[1] == 'M') {
2857
157
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2858
0
                int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
2859
0
                if (NM_ == NM) {
2860
0
                    switch(aux[2]) {
2861
0
                    case 'A': case 'C': case 'c': aux+=4; break;
2862
0
                    case 'S': case 's':           aux+=5; break;
2863
0
                    case 'I': case 'i': case 'f': aux+=7; break;
2864
0
                    default:
2865
0
                        hts_log_error("Unhandled type code for NM tag");
2866
0
                        goto err;
2867
0
                    }
2868
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2869
0
                        BLOCK_APPEND(td_b, "NM*", 3);
2870
0
                    continue;
2871
0
                }
2872
0
            }
2873
157
        }
2874
2875
560k
        BLOCK_APPEND(td_b, aux, 3);
2876
2877
        // Container level tags_used, for TD series
2878
        // Maps integer key ('X0i') to cram_tag_map struct.
2879
560k
        int key = (((unsigned char *) aux)[0]<<16 |
2880
560k
                   ((unsigned char *) aux)[1]<<8  |
2881
560k
                   ((unsigned char *) aux)[2]);
2882
560k
        k = kh_put(m_tagmap, c->tags_used, key, &r);
2883
560k
        if (-1 == r)
2884
0
            goto err;
2885
560k
        else if (r != 0)
2886
116k
            kh_val(c->tags_used, k) = NULL;
2887
2888
560k
        if (r == 1) {
2889
116k
            khint_t k_global;
2890
2891
            // Global tags_used for cram_metrics support
2892
116k
            pthread_mutex_lock(&fd->metrics_lock);
2893
116k
            k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2894
116k
            if (-1 == r) {
2895
0
                pthread_mutex_unlock(&fd->metrics_lock);
2896
0
                goto err;
2897
0
            }
2898
116k
            if (r >= 1) {
2899
7.12k
                kh_val(fd->tags_used, k_global) = cram_new_metrics();
2900
7.12k
                if (!kh_val(fd->tags_used, k_global)) {
2901
0
                    kh_del(m_metrics, fd->tags_used, k_global);
2902
0
                    pthread_mutex_unlock(&fd->metrics_lock);
2903
0
                    goto err;
2904
0
                }
2905
7.12k
            }
2906
2907
116k
            pthread_mutex_unlock(&fd->metrics_lock);
2908
2909
116k
            int i2[2] = {'\t',key};
2910
116k
            size_t sk = key;
2911
116k
            cram_tag_map *m = calloc(1, sizeof(*m));
2912
116k
            if (!m)
2913
0
                goto_err;
2914
116k
            kh_val(c->tags_used, k) = m;
2915
2916
116k
            cram_codec *c;
2917
2918
            // Use a block content id based on the tag id.
2919
            // Codec type depends on tag data type.
2920
116k
            switch(aux[2]) {
2921
56.1k
            case 'Z': case 'H':
2922
                // string as byte_array_stop
2923
56.1k
                c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2924
56.1k
                                      E_BYTE_ARRAY, (void *)i2,
2925
56.1k
                                      fd->version, &fd->vv);
2926
56.1k
                break;
2927
2928
48.4k
            case 'A': case 'c': case 'C': {
2929
                // byte array len, 1 byte
2930
48.4k
                cram_byte_array_len_encoder e;
2931
48.4k
                cram_stats st;
2932
2933
48.4k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2934
48.4k
                    e.len_encoding = E_HUFFMAN;
2935
48.4k
                    e.len_dat = NULL; // will get codes from st
2936
48.4k
                } else {
2937
0
                    e.len_encoding = E_CONST_INT;
2938
0
                    e.len_dat = NULL; // will get codes from st
2939
0
                }
2940
48.4k
                memset(&st, 0, sizeof(st));
2941
48.4k
                if (cram_stats_add(&st, 1) < 0) goto block_err;
2942
48.4k
                cram_stats_encoding(fd, &st);
2943
2944
48.4k
                e.val_encoding = E_EXTERNAL;
2945
48.4k
                e.val_dat = (void *)sk;
2946
2947
48.4k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2948
48.4k
                                      E_BYTE_ARRAY, (void *)&e,
2949
48.4k
                                      fd->version, &fd->vv);
2950
48.4k
                break;
2951
48.4k
            }
2952
2953
2.95k
            case 's': case 'S': {
2954
                // byte array len, 2 byte
2955
2.95k
                cram_byte_array_len_encoder e;
2956
2.95k
                cram_stats st;
2957
2958
2.95k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2959
2.95k
                    e.len_encoding = E_HUFFMAN;
2960
2.95k
                    e.len_dat = NULL; // will get codes from st
2961
2.95k
                } else {
2962
0
                    e.len_encoding = E_CONST_INT;
2963
0
                    e.len_dat = NULL; // will get codes from st
2964
0
                }
2965
2.95k
                memset(&st, 0, sizeof(st));
2966
2.95k
                if (cram_stats_add(&st, 2) < 0) goto block_err;
2967
2.95k
                cram_stats_encoding(fd, &st);
2968
2969
2.95k
                e.val_encoding = E_EXTERNAL;
2970
2.95k
                e.val_dat = (void *)sk;
2971
2972
2.95k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2973
2.95k
                                      E_BYTE_ARRAY, (void *)&e,
2974
2.95k
                                      fd->version, &fd->vv);
2975
2.95k
                break;
2976
2.95k
            }
2977
8.45k
            case 'i': case 'I': case 'f': {
2978
                // byte array len, 4 byte
2979
8.45k
                cram_byte_array_len_encoder e;
2980
8.45k
                cram_stats st;
2981
2982
8.45k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2983
8.45k
                    e.len_encoding = E_HUFFMAN;
2984
8.45k
                    e.len_dat = NULL; // will get codes from st
2985
8.45k
                } else {
2986
0
                    e.len_encoding = E_CONST_INT;
2987
0
                    e.len_dat = NULL; // will get codes from st
2988
0
                }
2989
8.45k
                memset(&st, 0, sizeof(st));
2990
8.45k
                if (cram_stats_add(&st, 4) < 0) goto block_err;
2991
8.45k
                cram_stats_encoding(fd, &st);
2992
2993
8.45k
                e.val_encoding = E_EXTERNAL;
2994
8.45k
                e.val_dat = (void *)sk;
2995
2996
8.45k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2997
8.45k
                                      E_BYTE_ARRAY, (void *)&e,
2998
8.45k
                                      fd->version, &fd->vv);
2999
8.45k
                break;
3000
8.45k
            }
3001
3002
331
            case 'B': {
3003
                // Byte array of variable size, but we generate our tag
3004
                // byte stream at the wrong stage (during reading and not
3005
                // after slice header construction). So we use
3006
                // BYTE_ARRAY_LEN with the length codec being external
3007
                // too.
3008
331
                cram_byte_array_len_encoder e;
3009
3010
331
                e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
3011
331
                    ? E_VARINT_UNSIGNED
3012
331
                    : E_EXTERNAL;
3013
331
                e.len_dat = (void *)sk; // or key+128 for len?
3014
3015
331
                e.val_encoding = E_EXTERNAL;
3016
331
                e.val_dat = (void *)sk;
3017
3018
331
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
3019
331
                                      E_BYTE_ARRAY, (void *)&e,
3020
331
                                      fd->version, &fd->vv);
3021
331
                break;
3022
8.45k
            }
3023
3024
50
            default:
3025
50
                hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
3026
50
                c = NULL;
3027
116k
            }
3028
3029
116k
            if (!c)
3030
50
                goto_err;
3031
3032
116k
            m->codec = c;
3033
3034
            // Link to fd-global tag metrics
3035
116k
            pthread_mutex_lock(&fd->metrics_lock);
3036
116k
            m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
3037
116k
            pthread_mutex_unlock(&fd->metrics_lock);
3038
116k
        }
3039
3040
560k
        cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3041
560k
        if (!tm) goto_err;
3042
560k
        cram_codec *codec = tm->codec;
3043
560k
        if (!tm->codec) goto_err;
3044
3045
560k
        switch(aux[2]) {
3046
286k
        case 'A': case 'C': case 'c':
3047
286k
            if (aux_end - aux < 3+1)
3048
0
                goto err;
3049
3050
286k
            if (!tm->blk) {
3051
48.4k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3052
0
                    goto err;
3053
48.4k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3054
48.4k
            }
3055
3056
286k
            aux+=3;
3057
            //codec->encode(s, codec, aux, 1);
3058
            // Functionally equivalent, but less code.
3059
286k
            BLOCK_APPEND_CHAR(tm->blk, *aux);
3060
286k
            aux++;
3061
286k
            break;
3062
3063
21.1k
        case 'S': case 's':
3064
21.1k
            if (aux_end - aux < 3+2)
3065
0
                goto err;
3066
3067
21.1k
            if (!tm->blk) {
3068
2.95k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3069
0
                    goto err;
3070
2.95k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3071
2.95k
            }
3072
3073
21.1k
            aux+=3;
3074
            //codec->encode(s, codec, aux, 2);
3075
21.1k
            BLOCK_APPEND(tm->blk, aux, 2);
3076
21.1k
            aux+=2;
3077
21.1k
            break;
3078
3079
88.0k
        case 'I': case 'i': case 'f':
3080
88.0k
            if (aux_end - aux < 3+4)
3081
0
                goto err;
3082
3083
88.0k
            if (!tm->blk) {
3084
8.45k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3085
0
                    goto err;
3086
8.45k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3087
8.45k
            }
3088
3089
88.0k
            aux+=3;
3090
            //codec->encode(s, codec, aux, 4);
3091
88.0k
            BLOCK_APPEND(tm->blk, aux, 4);
3092
88.0k
            aux+=4;
3093
88.0k
            break;
3094
3095
0
        case 'd':
3096
0
            if (aux_end - aux < 3+8)
3097
0
                goto err;
3098
3099
0
            if (!tm->blk) {
3100
0
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3101
0
                    goto err;
3102
0
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3103
0
            }
3104
3105
0
            aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
3106
            //codec->encode(s, codec, aux, 8);
3107
0
            BLOCK_APPEND(tm->blk, aux, 8);
3108
0
            aux+=8;
3109
0
            break;
3110
3111
146k
        case 'Z': case 'H': {
3112
146k
            if (aux_end - aux < 3)
3113
0
                goto err;
3114
3115
146k
            if (!tm->blk) {
3116
56.1k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3117
0
                    goto err;
3118
56.1k
                codec->out = tm->blk;
3119
56.1k
            }
3120
3121
146k
            char *aux_s;
3122
146k
            aux += 3;
3123
146k
            aux_s = aux;
3124
8.04M
            while (aux < aux_end && *aux++);
3125
146k
            if (aux == aux_end && aux[-1] != '\0') {
3126
0
                hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
3127
0
                              aux_s[-3], aux_s[-2], aux_s[-1],
3128
0
                              bam_get_qname(b));
3129
0
                goto err;
3130
0
            }
3131
146k
            if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
3132
0
                goto err;
3133
146k
            break;
3134
146k
        }
3135
3136
146k
        case 'B': {
3137
18.1k
            if (aux_end - aux < 4+4)
3138
0
                goto err;
3139
3140
18.1k
            int type = aux[3];
3141
18.1k
            uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
3142
18.1k
                              ((uint64_t)((unsigned char *)aux)[5]) << 8 |
3143
18.1k
                              ((uint64_t)((unsigned char *)aux)[6]) <<16 |
3144
18.1k
                              ((uint64_t)((unsigned char *)aux)[7]) <<24);
3145
18.1k
            uint64_t blen;
3146
18.1k
            if (!tm->blk) {
3147
331
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3148
0
                    goto err;
3149
331
                if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
3150
0
                    if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
3151
0
                        goto err;
3152
0
                    codec->u.e_byte_array_len.len_codec->out = tm->blk2;
3153
0
                    codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
3154
331
                } else {
3155
331
                    codec->u.e_byte_array_len.len_codec->out = tm->blk;
3156
331
                    codec->u.e_byte_array_len.val_codec->out = tm->blk;
3157
331
                }
3158
331
            }
3159
3160
            // skip TN field
3161
18.1k
            aux+=3;
3162
3163
            // We use BYTE_ARRAY_LEN with external length, so store that first
3164
18.1k
            switch (type) {
3165
12.8k
            case 'c': case 'C':
3166
12.8k
                blen = count;
3167
12.8k
                break;
3168
1.98k
            case 's': case 'S':
3169
1.98k
                blen = 2*count;
3170
1.98k
                break;
3171
3.36k
            case 'i': case 'I': case 'f':
3172
3.36k
                blen = 4*count;
3173
3.36k
                break;
3174
3
            default:
3175
3
                hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
3176
3
                goto err;
3177
18.1k
            }
3178
3179
18.1k
            blen += 5; // sub-type & length
3180
18.1k
            if (aux_end - aux < blen || blen > INT_MAX)
3181
0
                goto err;
3182
3183
18.1k
            if (codec->encode(s, codec, aux, (int) blen) < 0)
3184
0
                goto err;
3185
18.1k
            aux += blen;
3186
18.1k
            break;
3187
18.1k
        }
3188
0
        default:
3189
0
            hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
3190
0
            goto err;
3191
560k
        }
3192
560k
        tm->blk->m = tm->m;
3193
560k
    }
3194
3195
    // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
3196
3197
    // And and increment TD hash entry
3198
4.63M
    BLOCK_APPEND_CHAR(td_b, 0);
3199
3200
    // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
3201
4.63M
    key = string_ndup(c->comp_hdr->TD_keys,
3202
4.63M
                      (char *)BLOCK_DATA(td_b) + TD_blk_size,
3203
4.63M
                      BLOCK_SIZE(td_b) - TD_blk_size);
3204
4.63M
    if (!key)
3205
0
        goto block_err;
3206
4.63M
    k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
3207
4.63M
    if (new < 0) {
3208
0
        goto err;
3209
4.63M
    } else if (new == 0) {
3210
4.59M
        BLOCK_SIZE(td_b) = TD_blk_size;
3211
4.59M
    } else {
3212
40.8k
        kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
3213
40.8k
        c->comp_hdr->nTL++;
3214
40.8k
    }
3215
3216
4.63M
    cr->TL = kh_val(c->comp_hdr->TD_hash, k);
3217
4.63M
    if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
3218
0
        goto block_err;
3219
3220
4.63M
    if (orig != (char *)bam_aux(b))
3221
34.9k
        free(orig);
3222
3223
4.63M
    if (err) *err = 0;
3224
3225
4.63M
    return brg;
3226
3227
53
 err:
3228
53
 block_err:
3229
53
    if (orig != (char *)bam_aux(b))
3230
8
        free(orig);
3231
53
    return NULL;
3232
53
}
3233
3234
/*
3235
 * During cram_next_container or before the final flush at end of
3236
 * file, we update the current slice headers and increment the slice
3237
 * number to the next slice.
3238
 *
3239
 * See cram_next_container() and cram_close().
3240
 */
3241
27.7k
void cram_update_curr_slice(cram_container *c, int version) {
3242
27.7k
    cram_slice *s = c->slice;
3243
27.7k
    if (c->multi_seq) {
3244
539
        s->hdr->ref_seq_id    = -2;
3245
539
        s->hdr->ref_seq_start = 0;
3246
539
        s->hdr->ref_seq_span  = 0;
3247
27.2k
    } else if (c->curr_ref == -1 && CRAM_ge31(version)) {
3248
        // Spec states span=0, but it broke our range queries.
3249
        // See commit message for this and prior.
3250
13.9k
        s->hdr->ref_seq_id    = -1;
3251
13.9k
        s->hdr->ref_seq_start = 0;
3252
13.9k
        s->hdr->ref_seq_span  = 0;
3253
13.9k
    } else {
3254
13.2k
        s->hdr->ref_seq_id    = c->curr_ref;
3255
13.2k
        s->hdr->ref_seq_start = c->first_base;
3256
13.2k
        s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
3257
13.2k
    }
3258
27.7k
    s->hdr->num_records   = c->curr_rec;
3259
3260
27.7k
    if (c->curr_slice == 0) {
3261
27.7k
        if (c->ref_seq_id != s->hdr->ref_seq_id)
3262
14.4k
            c->ref_seq_id  = s->hdr->ref_seq_id;
3263
27.7k
        c->ref_seq_start = c->first_base;
3264
27.7k
    }
3265
3266
27.7k
    c->curr_slice++;
3267
27.7k
}
3268
3269
/*
3270
 * Handles creation of a new container or new slice, flushing any
3271
 * existing containers when appropriate.
3272
 *
3273
 * Really this is next slice, which may or may not lead to a new container.
3274
 *
3275
 * Returns cram_container pointer on success
3276
 *         NULL on failure.
3277
 */
3278
27.8k
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
3279
27.8k
    cram_container *c = fd->ctr;
3280
27.8k
    int i;
3281
3282
    /* First occurrence */
3283
27.8k
    if (c->curr_ref == -2)
3284
1.42k
        c->curr_ref = bam_ref(b);
3285
3286
27.8k
    if (c->slice)
3287
26.3k
        cram_update_curr_slice(c, fd->version);
3288
3289
    /* Flush container */
3290
27.8k
    if (c->curr_slice == c->max_slice ||
3291
26.3k
        (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
3292
26.3k
        c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
3293
26.3k
        hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
3294
26.3k
                     c->ref_seq_id, c->ref_seq_start,
3295
26.3k
                     c->ref_seq_start + c->ref_seq_span -1);
3296
3297
        /* Encode slices */
3298
26.3k
        if (-1 == cram_flush_container_mt(fd, c))
3299
11
            return NULL;
3300
26.3k
        if (!fd->pool) {
3301
            // Move to sep func, as we need cram_flush_container for
3302
            // the closing phase to flush the partial container.
3303
52.7k
            for (i = 0; i < c->max_slice; i++) {
3304
26.3k
                cram_free_slice(c->slices[i]);
3305
26.3k
                c->slices[i] = NULL;
3306
26.3k
            }
3307
3308
26.3k
            c->slice = NULL;
3309
26.3k
            c->curr_slice = 0;
3310
3311
            /* Easy approach for purposes of freeing stats */
3312
26.3k
            cram_free_container(c);
3313
26.3k
        }
3314
3315
26.3k
        c = fd->ctr = cram_new_container(fd->seqs_per_slice,
3316
26.3k
                                         fd->slices_per_container);
3317
26.3k
        if (!c)
3318
0
            return NULL;
3319
3320
26.3k
        pthread_mutex_lock(&fd->ref_lock);
3321
26.3k
        c->no_ref = fd->no_ref;
3322
26.3k
        c->embed_ref = fd->embed_ref;
3323
26.3k
        c->record_counter = fd->record_counter;
3324
26.3k
        pthread_mutex_unlock(&fd->ref_lock);
3325
26.3k
        c->curr_ref = bam_ref(b);
3326
26.3k
    }
3327
3328
27.7k
    c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
3329
3330
    /* New slice */
3331
27.7k
    c->slice = c->slices[c->curr_slice] =
3332
27.7k
        cram_new_slice(MAPPED_SLICE, c->max_rec);
3333
27.7k
    if (!c->slice)
3334
0
        return NULL;
3335
3336
27.7k
    if (c->multi_seq) {
3337
0
        c->slice->hdr->ref_seq_id = -2;
3338
0
        c->slice->hdr->ref_seq_start = 0;
3339
0
        c->slice->last_apos = 1;
3340
27.7k
    } else {
3341
27.7k
        c->slice->hdr->ref_seq_id = bam_ref(b);
3342
        // wrong for unsorted data, will fix during encoding.
3343
27.7k
        c->slice->hdr->ref_seq_start = bam_pos(b)+1;
3344
27.7k
        c->slice->last_apos = bam_pos(b)+1;
3345
27.7k
    }
3346
3347
27.7k
    c->curr_rec = 0;
3348
27.7k
    c->s_num_bases = 0;
3349
27.7k
    c->n_mapped = 0;
3350
3351
    // QO field: 0 implies original orientation, 1 implies sequence orientation
3352
    // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
3353
    // Conversely other data sets it's often better than 1% saving for 0.
3354
    // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
3355
27.7k
    c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
3356
3357
27.7k
    return c;
3358
27.7k
}
3359
3360
3361
/*
3362
 * Converts a single bam record into a cram record.
3363
 * Possibly used within a thread.
3364
 *
3365
 * Returns 0 on success;
3366
 *        -1 on failure
3367
 */
3368
static int process_one_read(cram_fd *fd, cram_container *c,
3369
                            cram_slice *s, cram_record *cr,
3370
                            bam_seq_t *b, int rnum, kstring_t *MD,
3371
4.63M
                            int embed_ref, int no_ref) {
3372
4.63M
    int i, fake_qual = -1, NM = 0;
3373
4.63M
    char *cp;
3374
4.63M
    char *ref, *seq, *qual;
3375
3376
    // Any places with N in seq and/or reference can lead to ambiguous
3377
    // interpretation of the SAM NM:i tag.  So we store these verbatim
3378
    // to ensure valid data round-trips the same regardless of who
3379
    // defines it as valid.
3380
    // Similarly when alignments go beyond end of the reference.
3381
4.63M
    int verbatim_NM = fd->store_nm;
3382
4.63M
    int verbatim_MD = fd->store_md;
3383
3384
    // FIXME: multi-ref containers
3385
3386
4.63M
    cr->flags       = bam_flag(b);
3387
4.63M
    cr->len         = bam_seq_len(b);
3388
4.63M
    uint8_t *md;
3389
4.63M
    if (!(md = bam_aux_get(b, "MD")))
3390
4.61M
        MD = NULL;
3391
18.0k
    else
3392
18.0k
        MD->l = 0;
3393
3394
4.63M
    int cf_tag = 0;
3395
3396
4.63M
    if (embed_ref == 2) {
3397
35.0k
        cf_tag  = MD ? 0 : 1;                   // No MD
3398
35.0k
        cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
3399
35.0k
    }
3400
3401
    //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
3402
3403
4.63M
    ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
3404
4.63M
    cr->ref_id      = bam_ref(b);
3405
4.63M
    if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
3406
0
        goto block_err;
3407
4.63M
    if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
3408
0
        goto block_err;
3409
3410
    // Non reference based encoding means storing the bases verbatim as features, which in
3411
    // turn means every base also has a quality already stored.
3412
4.63M
    if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
3413
4.63M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3414
3415
4.63M
    if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
3416
4.46M
        cr->cram_flags |= CRAM_FLAG_NO_SEQ;
3417
    //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3418
3419
4.63M
    c->num_bases   += cr->len;
3420
4.63M
    cr->apos        = bam_pos(b)+1;
3421
4.63M
    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
3422
1
        goto err;
3423
4.63M
    if (c->pos_sorted) {
3424
4.62M
        if (cr->apos < s->last_apos && !fd->ap_delta) {
3425
161
            c->pos_sorted = 0;
3426
4.61M
        } else {
3427
4.61M
            if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
3428
0
                goto block_err;
3429
4.61M
            s->last_apos = cr->apos;
3430
4.61M
        }
3431
4.62M
    } else {
3432
        //cram_stats_add(c->stats[DS_AP], cr->apos);
3433
12.3k
    }
3434
4.63M
    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
3435
3436
    /*
3437
     * This seqs_ds is largely pointless and it could reuse the same memory
3438
     * over and over.
3439
     * s->base_blk is what we need for encoding.
3440
     */
3441
4.63M
    cr->seq         = BLOCK_SIZE(s->seqs_blk);
3442
4.63M
    cr->qual        = BLOCK_SIZE(s->qual_blk);
3443
4.63M
    BLOCK_GROW(s->seqs_blk, cr->len+1);
3444
4.63M
    BLOCK_GROW(s->qual_blk, cr->len);
3445
3446
    // Convert BAM nibble encoded sequence to string of base pairs
3447
4.63M
    seq = cp = (char *)BLOCK_END(s->seqs_blk);
3448
4.63M
    *seq = 0;
3449
4.63M
    nibble2base(bam_seq(b), cp, cr->len);
3450
4.63M
    BLOCK_SIZE(s->seqs_blk) += cr->len;
3451
3452
4.63M
    qual = cp = (char *)bam_qual(b);
3453
3454
3455
    /* Copy and parse */
3456
4.63M
    if (!(cr->flags & BAM_FUNMAP)) {
3457
40.5k
        uint32_t *cig_to, *cig_from;
3458
40.5k
        int64_t apos = cr->apos-1, spos = 0;
3459
40.5k
        int64_t MD_last = apos; // last position of edit in MD tag
3460
3461
40.5k
        if (apos < 0) {
3462
0
            hts_log_error("Mapped read with position <= 0 is disallowed");
3463
0
            return -1;
3464
0
        }
3465
3466
40.5k
        cr->cigar       = s->ncigar;
3467
40.5k
        cr->ncigar      = bam_cigar_len(b);
3468
40.6k
        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
3469
96
            s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
3470
96
            s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
3471
96
            if (!s->cigar)
3472
0
                return -1;
3473
96
        }
3474
3475
40.5k
        cig_to = (uint32_t *)s->cigar;
3476
40.5k
        cig_from = (uint32_t *)bam_cigar(b);
3477
3478
40.5k
        cr->feature = 0;
3479
40.5k
        cr->nfeature = 0;
3480
148k
        for (i = 0; i < cr->ncigar; i++) {
3481
107k
            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
3482
107k
            uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
3483
107k
            cig_to[i] = cig_from[i];
3484
3485
            /* Can also generate events from here for CRAM diffs */
3486
3487
107k
            switch (cig_op) {
3488
0
                int l;
3489
3490
                // Don't trust = and X ops to be correct.
3491
22.7k
            case BAM_CMATCH:
3492
22.8k
            case BAM_CBASE_MATCH:
3493
23.3k
            case BAM_CBASE_MISMATCH:
3494
                //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
3495
                //      cig_len, &ref[apos], cig_len, &seq[spos]);
3496
23.3k
                l = 0;
3497
23.3k
                if (!no_ref && cr->len) {
3498
16.4k
                    int end = cig_len+apos < c->ref_end
3499
16.4k
                        ? cig_len : c->ref_end - apos;
3500
16.4k
                    char *sp = &seq[spos];
3501
16.4k
                    char *rp = &ref[apos];
3502
16.4k
                    char *qp = &qual[spos];
3503
16.4k
                    if (end > cr->len) {
3504
0
                        hts_log_error("CIGAR and query sequence are of different length");
3505
0
                        return -1;
3506
0
                    }
3507
24.3k
                    for (l = 0; l < end; l++) {
3508
                        // This case is just too disputed and different tools
3509
                        // interpret these in different ways.  We give up and
3510
                        // store verbatim.
3511
7.89k
                        if (rp[l] == 'N' && sp[l] == 'N')
3512
35
                            verbatim_NM = verbatim_MD = 1;
3513
7.89k
                        if (rp[l] != sp[l]) {
3514
                            // Build our own MD tag if one is on the sequence, so
3515
                            // we can ensure it matches and thus can be discarded.
3516
1.54k
                            if (MD && ref) {
3517
376
                                if (kputuw(apos+l - MD_last, MD) < 0) goto err;
3518
376
                                if (kputc(rp[l], MD) < 0) goto err;
3519
376
                                MD_last = apos+l+1;
3520
376
                            }
3521
1.54k
                            NM++;
3522
1.54k
                            if (!sp[l])
3523
0
                                break;
3524
1.54k
                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
3525
#if 0
3526
                                // Disabled for the time being as it doesn't
3527
                                // seem to gain us much.
3528
                                int ol=l;
3529
                                while (l<end && rp[l] != sp[l])
3530
                                    l++;
3531
                                if (l-ol > 1) {
3532
                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
3533
                                                       l-ol, &seq[spos+ol]))
3534
                                        return -1;
3535
                                    l--;
3536
                                } else {
3537
                                    l = ol;
3538
                                    if (cram_add_substitution(fd, c, s, cr,
3539
                                                              spos+l, sp[l],
3540
                                                              qp[l], rp[l]))
3541
                                        return -1;
3542
                                }
3543
#else
3544
                                // With urmap pushed to the limit and lots
3545
                                // of unaligned data (should be soft-clipped)
3546
                                // this saves ~2-7%. Worth it?
3547
0
                                int nl = l;
3548
0
                                int max_end = nl, max_score = 0, score = 0;
3549
0
                                while (nl < end) {
3550
0
                                    if (rp[nl] != sp[nl]) {
3551
0
                                        score += 3;
3552
0
                                        if (max_score < score) {
3553
0
                                            max_score = score;
3554
0
                                            max_end = nl;
3555
0
                                        }
3556
0
                                    } else {
3557
0
                                        score--;
3558
0
                                        if (score < -2 ||
3559
0
                                            max_score - score > 7)
3560
0
                                            break;
3561
0
                                    }
3562
0
                                    nl++;
3563
0
                                }
3564
0
                                if (max_score > 20) {
3565
0
                                    cram_add_bases(fd, c, s, cr, spos+l,
3566
0
                                                   max_end-l, &seq[spos+l]);
3567
0
                                    l = max_end-1;
3568
0
                                } else {
3569
0
                                    while (l < nl) {
3570
0
                                        if (rp[l] != sp[l])
3571
0
                                            cram_add_substitution(fd, c, s,
3572
0
                                                                  cr, spos+l,
3573
0
                                                                  sp[l], qp[l],
3574
0
                                                                  rp[l]);
3575
0
                                        l++;
3576
0
                                    }
3577
0
                                    l--;
3578
0
                                }
3579
0
#endif
3580
1.54k
                            } else {
3581
1.54k
                                if (cram_add_substitution(fd, c, s, cr, spos+l,
3582
1.54k
                                                          sp[l], qp[l], rp[l]))
3583
0
                                    return -1;
3584
1.54k
                            }
3585
1.54k
                        }
3586
7.89k
                    }
3587
16.4k
                    spos += l;
3588
16.4k
                    apos += l;
3589
16.4k
                }
3590
3591
23.3k
                if (l < cig_len && cr->len) {
3592
3.28k
                    if (no_ref) {
3593
1.87k
                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
3594
1.87k
                            if (cram_add_bases(fd, c, s, cr, spos,
3595
1.87k
                                               cig_len-l, &seq[spos]))
3596
0
                                return -1;
3597
1.87k
                            spos += cig_len-l;
3598
1.87k
                        } else {
3599
0
                            for (; l < cig_len && seq[spos]; l++, spos++) {
3600
0
                                if (cram_add_base(fd, c, s, cr, spos,
3601
0
                                                  seq[spos], qual[spos]))
3602
0
                                    return -1;
3603
0
                            }
3604
0
                        }
3605
1.87k
                    } else {
3606
                        /* off end of sequence or non-ref based output */
3607
1.41k
                        verbatim_NM = verbatim_MD = 1;
3608
2.82k
                        for (; l < cig_len && seq[spos]; l++, spos++) {
3609
1.41k
                            if (cram_add_base(fd, c, s, cr, spos,
3610
1.41k
                                              seq[spos], qual[spos]))
3611
0
                                return -1;
3612
1.41k
                        }
3613
1.41k
                    }
3614
3.28k
                    apos += cig_len;
3615
20.0k
                } else if (!cr->len) {
3616
                    /* Seq "*" */
3617
3.47k
                    verbatim_NM = verbatim_MD = 1;
3618
3.47k
                    apos += cig_len;
3619
3.47k
                    spos += cig_len;
3620
3.47k
                }
3621
23.3k
                break;
3622
3623
40.2k
            case BAM_CDEL:
3624
40.2k
                if (MD && ref) {
3625
4.27k
                    if (kputuw(apos - MD_last, MD) < 0) goto err;
3626
4.27k
                    if (apos < c->ref_end) {
3627
2.30k
                        if (kputc_('^', MD) < 0) goto err;
3628
2.30k
                        if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
3629
0
                            goto err;
3630
2.30k
                    }
3631
4.27k
                }
3632
40.2k
                NM += cig_len;
3633
3634
40.2k
                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
3635
0
                    return -1;
3636
40.2k
                apos += cig_len;
3637
40.2k
                MD_last = apos;
3638
40.2k
                break;
3639
3640
31
            case BAM_CREF_SKIP:
3641
31
                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
3642
0
                    return -1;
3643
31
                apos += cig_len;
3644
31
                MD_last += cig_len;
3645
31
                break;
3646
3647
107
            case BAM_CINS:
3648
107
                if (cram_add_insertion(c, s, cr, spos, cig_len,
3649
107
                                       cr->len ? &seq[spos] : NULL))
3650
0
                    return -1;
3651
107
                if (no_ref && cr->len) {
3652
34
                    for (l = 0; l < cig_len; l++, spos++) {
3653
17
                        cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3654
17
                    }
3655
90
                } else {
3656
90
                    spos += cig_len;
3657
90
                }
3658
107
                NM += cig_len;
3659
107
                break;
3660
3661
27.6k
            case BAM_CSOFT_CLIP:
3662
27.6k
                if (cram_add_softclip(c, s, cr, spos, cig_len,
3663
27.6k
                                      cr->len ? &seq[spos] : NULL,
3664
27.6k
                                      fd->version))
3665
0
                    return -1;
3666
3667
27.6k
                if (no_ref &&
3668
3.83k
                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
3669
0
                    if (cr->len) {
3670
0
                        for (l = 0; l < cig_len; l++, spos++) {
3671
0
                            cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3672
0
                        }
3673
0
                    } else {
3674
0
                        for (l = 0; l < cig_len; l++, spos++) {
3675
0
                            cram_add_quality(fd, c, s, cr, spos, -1);
3676
0
                        }
3677
0
                    }
3678
27.6k
                } else {
3679
27.6k
                    spos += cig_len;
3680
27.6k
                }
3681
27.6k
                break;
3682
3683
16.3k
            case BAM_CHARD_CLIP:
3684
16.3k
                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3685
0
                    return -1;
3686
16.3k
                break;
3687
3688
16.3k
            case BAM_CPAD:
3689
6
                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3690
0
                    return -1;
3691
6
                break;
3692
3693
40
            default:
3694
40
                hts_log_error("Unknown CIGAR op code %d", cig_op);
3695
40
                return -1;
3696
107k
            }
3697
107k
        }
3698
40.5k
        if (cr->len && spos != cr->len) {
3699
0
            hts_log_error("CIGAR and query sequence are of different length");
3700
0
            return -1;
3701
0
        }
3702
40.5k
        fake_qual = spos;
3703
        // Protect against negative length refs (fuzz 382922241)
3704
40.5k
        cr->aend = no_ref ? apos : MIN(apos, MAX(0, c->ref_end));
3705
40.5k
        if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3706
0
            goto block_err;
3707
3708
40.5k
        if (MD && ref)
3709
9.60k
            if (kputuw(apos - MD_last, MD) < 0) goto err;
3710
4.59M
    } else {
3711
        // Unmapped
3712
4.59M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3713
4.59M
        cr->cigar  = 0;
3714
4.59M
        cr->ncigar = 0;
3715
4.59M
        cr->nfeature = 0;
3716
4.59M
        cr->aend = MIN(cr->apos, c->ref_end);
3717
408M
        for (i = 0; i < cr->len; i++)
3718
404M
            if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3719
0
                goto block_err;
3720
4.59M
        fake_qual = 0;
3721
4.59M
    }
3722
3723
4.63M
    cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3724
4.63M
    int err = 0;
3725
4.63M
    sam_hrec_rg_t *brg =
3726
4.63M
        cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
3727
4.63M
                        cf_tag, no_ref, &err);
3728
4.63M
    if (err)
3729
53
        goto block_err;
3730
3731
    /* Read group, identified earlier */
3732
4.63M
    if (brg) {
3733
182
        cr->rg = brg->id;
3734
4.63M
    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3735
0
        sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3736
0
        if (!brg) goto block_err;
3737
0
        cr->rg = brg->id;
3738
4.63M
    } else {
3739
4.63M
        cr->rg = -1;
3740
4.63M
    }
3741
4.63M
    if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3742
0
        goto block_err;
3743
3744
    /*
3745
     * Append to the qual block now. We do this here as
3746
     * cram_add_substitution() can generate BA/QS events which need to
3747
     * be in the qual block before we append the rest of the data.
3748
     */
3749
4.63M
    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3750
        /* Special case of seq "*" */
3751
4.63M
        if (cr->len == 0) {
3752
4.46M
            cr->len = fake_qual;
3753
4.46M
            BLOCK_GROW(s->qual_blk, cr->len);
3754
4.46M
            cp = (char *)BLOCK_END(s->qual_blk);
3755
4.46M
            memset(cp, 255, cr->len);
3756
4.46M
        } else {
3757
169k
            BLOCK_GROW(s->qual_blk, cr->len);
3758
169k
            cp = (char *)BLOCK_END(s->qual_blk);
3759
169k
            char *from = (char *)&bam_qual(b)[0];
3760
169k
            char *to = &cp[0];
3761
169k
            memcpy(to, from, cr->len);
3762
3763
            // Store quality in original orientation for better compression.
3764
169k
            if (!c->qs_seq_orient) {
3765
0
                if (cr->flags & BAM_FREVERSE) {
3766
0
                    int i, j;
3767
0
                    for (i = 0, j = cr->len-1; i < j; i++, j--) {
3768
0
                        unsigned char c;
3769
0
                        c = to[i];
3770
0
                        to[i] = to[j];
3771
0
                        to[j] = c;
3772
0
                    }
3773
0
                }
3774
0
            }
3775
169k
        }
3776
4.63M
        BLOCK_SIZE(s->qual_blk) += cr->len;
3777
4.63M
    } else {
3778
0
        if (cr->len == 0)
3779
0
            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3780
0
    }
3781
3782
4.63M
    if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3783
0
        goto block_err;
3784
3785
    /* Now we know apos and aend both, update mate-pair information */
3786
4.63M
    {
3787
4.63M
        int new;
3788
4.63M
        khint_t k;
3789
4.63M
        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3790
3791
        //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3792
        //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3793
4.63M
        if (cr->flags & BAM_FPAIRED) {
3794
12.0k
            char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3795
12.0k
            if (!key)
3796
0
                return -1;
3797
3798
12.0k
            k = kh_put(m_s2i, s->pair[sec], key, &new);
3799
12.0k
            if (-1 == new)
3800
0
                return -1;
3801
12.0k
            else if (new > 0)
3802
4.49k
                kh_val(s->pair[sec], k) = rnum
3803
4.49k
                    | ((unsigned)((cr->flags & BAM_FREAD1)!=0)<<30)
3804
4.49k
                    | ((unsigned)((cr->flags & BAM_FREAD2)!=0)<<31);
3805
4.62M
        } else {
3806
4.62M
            new = 1;
3807
4.62M
            k = 0; // Prevents false-positive warning from gcc -Og
3808
4.62M
        }
3809
3810
4.63M
        if (new == 0) {
3811
7.51k
            cram_record *p = &s->crecs[kh_val(s->pair[sec], k) & ((1<<30)-1)];
3812
7.51k
            int64_t aleft, aright;
3813
7.51k
            int sign;
3814
3815
7.51k
            aleft = MIN(cr->apos, p->apos);
3816
7.51k
            aright = MAX(cr->aend, p->aend);
3817
7.51k
            if (cr->apos < p->apos) {
3818
76
                sign = 1;
3819
7.43k
            } else if (cr->apos > p->apos) {
3820
158
                sign = -1;
3821
7.27k
            } else if (cr->flags & BAM_FREAD1) {
3822
4.87k
                sign = 1;
3823
4.87k
            } else {
3824
2.39k
                sign = -1;
3825
2.39k
            }
3826
3827
            // Multiple sets of secondary reads means we cannot tell which is
3828
            // which, so we store TLEN etc verbatim.
3829
7.51k
            int has_r1 = kh_val(s->pair[sec], k) & (1<<30);
3830
7.51k
            unsigned int has_r2 = kh_val(s->pair[sec], k) & (1u<<31);
3831
7.51k
            if ((has_r1 && (cr->flags & BAM_FREAD1)) ||
3832
2.77k
                (has_r2 && (cr->flags & BAM_FREAD2)))
3833
5.10k
                goto detached;
3834
3835
            // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3836
            // a small amount, if appropriate options set.
3837
2.40k
            if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3838
1.24k
                !(fd->tlen_zero && bam_mate_pos(b) == 0))
3839
1.24k
                goto detached;
3840
3841
1.15k
            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3842
1.15k
                ((p->flags & BAM_FUNMAP) != 0))
3843
979
                goto detached;
3844
3845
178
            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3846
178
                ((p->flags & BAM_FREVERSE) != 0))
3847
0
                goto detached;
3848
3849
3850
            // p vs this: tlen, matepos, flags
3851
178
            if (p->ref_id != cr->ref_id &&
3852
96
                !(fd->tlen_zero && p->ref_id == -1))
3853
96
                goto detached;
3854
3855
82
            if (p->mate_pos != cr->apos &&
3856
0
                !(fd->tlen_zero && p->mate_pos == 0))
3857
0
                goto detached;
3858
3859
82
            if (((p->flags & BAM_FMUNMAP) != 0) !=
3860
82
                ((p->mate_flags & CRAM_M_UNMAP) != 0))
3861
0
                goto detached;
3862
3863
82
            if (((p->flags & BAM_FMREVERSE) != 0) !=
3864
82
                ((p->mate_flags & CRAM_M_REVERSE) != 0))
3865
0
                goto detached;
3866
3867
            // Supplementary reads are just too ill defined
3868
82
            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3869
82
                (p->flags & BAM_FSUPPLEMENTARY))
3870
0
                goto detached;
3871
3872
            // When in lossy name mode, if a read isn't detached we
3873
            // cannot store the name.  The corollary is that when we
3874
            // must store the name, it must be detached (inefficient).
3875
82
            if (fd->lossy_read_names &&
3876
0
                (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3877
0
                 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3878
0
                goto detached;
3879
3880
            // Now check TLEN.  We do this last as sometimes it's the
3881
            // only thing that differs.  In CRAM4 we have a better way
3882
            // of handling this that doesn't break detached status
3883
82
            int explicit_tlen = 0;
3884
82
            int tflag1 = ((bam_ins_size(b) &&
3885
0
                           llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3886
0
                           > fd->tlen_approx)
3887
82
                          || (!bam_ins_size(b) && !fd->tlen_zero));
3888
3889
82
            int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3890
0
                           > fd->tlen_approx)
3891
82
                          || (!p->tlen && !fd->tlen_zero));
3892
3893
82
            if (tflag1 || tflag2) {
3894
82
                if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3895
0
                    explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3896
82
                } else {
3897
                    // Stil do detached for unmapped data in CRAM4 as this
3898
                    // also impacts RNEXT calculation.
3899
82
                    goto detached;
3900
82
                }
3901
82
            }
3902
3903
            /*
3904
             * The fields below are unused when encoding this read as it is
3905
             * no longer detached.  In theory they may get referred to when
3906
             * processing a 3rd or 4th read in this template?, so we set them
3907
             * here just to be sure.
3908
             *
3909
             * They do not need cram_stats_add() calls those as they are
3910
             * not emitted.
3911
             */
3912
0
            cr->mate_pos = p->apos;
3913
0
            cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3914
0
            cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3915
0
            cram_stats_add(c->stats[DS_TS], cr->tlen);
3916
0
            cr->mate_flags =
3917
0
                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
3918
0
                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3919
3920
            // Decrement statistics aggregated earlier
3921
0
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3922
0
                cram_stats_del(c->stats[DS_NP], p->mate_pos);
3923
0
                cram_stats_del(c->stats[DS_MF], p->mate_flags);
3924
0
                if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3925
0
                    cram_stats_del(c->stats[DS_TS], p->tlen);
3926
0
                cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3927
0
            }
3928
3929
            /* Similarly we could correct the p-> values too, but these will no
3930
             * longer have any code that refers back to them as the new 'p'
3931
             * for this template is our current 'cr'.
3932
             */
3933
            //p->mate_pos = cr->apos;
3934
            //p->mate_flags =
3935
            //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
3936
            //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3937
            //p->tlen = p->apos - cr->aend;
3938
3939
            // Clear detached from cr flags
3940
0
            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3941
0
            cr->cram_flags |= explicit_tlen;
3942
0
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3943
0
                goto block_err;
3944
3945
            // Clear detached from p flags and set downstream
3946
0
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3947
0
                cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3948
0
                p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3949
0
            }
3950
3951
0
            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3952
0
            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3953
0
            if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3954
0
                goto block_err;
3955
3956
0
            p->mate_line = rnum - ((kh_val(s->pair[sec], k) & ((1<<30)-1)) + 1);
3957
0
            if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3958
0
                goto block_err;
3959
3960
0
            int r12_flags = kh_val(s->pair[sec], k) & (3u<<30);
3961
0
            kh_val(s->pair[sec], k) = (unsigned int)rnum | r12_flags
3962
0
                | (((cr->flags & BAM_FREAD1)!=0)<<30)
3963
0
                | ((unsigned)((cr->flags & BAM_FREAD2)!=0)<<31);
3964
4.62M
        } else {
3965
4.63M
        detached:
3966
            //fprintf(stderr, "unpaired\n");
3967
3968
            /* Derive mate flags from this flag */
3969
4.63M
            cr->mate_flags = 0;
3970
4.63M
            if (bam_flag(b) & BAM_FMUNMAP)
3971
9.21k
                cr->mate_flags |= CRAM_M_UNMAP;
3972
4.63M
            if (bam_flag(b) & BAM_FMREVERSE)
3973
18
                cr->mate_flags |= CRAM_M_REVERSE;
3974
3975
4.63M
            if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3976
0
                goto block_err;
3977
3978
4.63M
            cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3979
4.63M
            if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3980
0
                goto block_err;
3981
3982
4.63M
            cr->tlen        = bam_ins_size(b);
3983
4.63M
            if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3984
0
                goto block_err;
3985
3986
4.63M
            cr->cram_flags |= CRAM_FLAG_DETACHED;
3987
4.63M
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3988
0
                goto block_err;
3989
4.63M
            if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3990
0
                goto block_err;
3991
3992
4.63M
            cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3993
4.63M
        }
3994
4.63M
    }
3995
3996
4.63M
    cr->mqual       = bam_map_qual(b);
3997
4.63M
    if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3998
0
        goto block_err;
3999
4000
4.63M
    cr->mate_ref_id = bam_mate_ref(b);
4001
4002
4.63M
    if (!(bam_flag(b) & BAM_FUNMAP)) {
4003
40.5k
        if (c->first_base > cr->apos)
4004
70
            c->first_base = cr->apos;
4005
4006
40.5k
        if (c->last_base < cr->aend)
4007
2.16k
            c->last_base = cr->aend;
4008
40.5k
    }
4009
4010
4.63M
    return 0;
4011
4012
53
 block_err:
4013
54
 err:
4014
54
    return -1;
4015
53
}
4016
4017
/*
4018
 * Write iterator: put BAM format sequences into a CRAM file.
4019
 * We buffer up a containers worth of data at a time.
4020
 *
4021
 * Returns 0 on success
4022
 *        -1 on failure
4023
 */
4024
4.63M
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
4025
4.63M
    cram_container *c;
4026
4027
4.63M
    if (!fd->ctr) {
4028
1.42k
        fd->ctr = cram_new_container(fd->seqs_per_slice,
4029
1.42k
                                     fd->slices_per_container);
4030
1.42k
        if (!fd->ctr)
4031
0
            return -1;
4032
1.42k
        fd->ctr->record_counter = fd->record_counter;
4033
4034
1.42k
        pthread_mutex_lock(&fd->ref_lock);
4035
1.42k
        fd->ctr->no_ref = fd->no_ref;
4036
1.42k
        fd->ctr->embed_ref = fd->embed_ref;
4037
1.42k
        pthread_mutex_unlock(&fd->ref_lock);
4038
1.42k
    }
4039
4.63M
    c = fd->ctr;
4040
4041
4.63M
    int embed_ref = c->embed_ref;
4042
4043
4.63M
    if (!c->slice || c->curr_rec == c->max_rec ||
4044
4.63M
        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
4045
4.60M
        (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
4046
31.7k
        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
4047
31.7k
        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
4048
4049
        /*
4050
         * Start packing slices when we routinely have under 1/4tr full.
4051
         *
4052
         * This option isn't available if we choose to embed references
4053
         * since we can only have one per slice.
4054
         *
4055
         * The multi_seq var here refers to our intention for the next slice.
4056
         * This slice has already been encoded so we output as-is.
4057
         */
4058
31.7k
        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
4059
27.1k
            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
4060
25.1k
            embed_ref<=0) {
4061
470
            if (!c->multi_seq)
4062
331
                hts_log_info("Multi-ref enabled for next container");
4063
470
            multi_seq = 1;
4064
31.2k
        } else if (fd->multi_seq == 1) {
4065
4.20k
            pthread_mutex_lock(&fd->metrics_lock);
4066
4.20k
            if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
4067
348
                multi_seq = 0;
4068
348
                hts_log_info("Multi-ref disabled for next container");
4069
348
            }
4070
4.20k
            pthread_mutex_unlock(&fd->metrics_lock);
4071
4.20k
        }
4072
4073
31.7k
        slice_rec = c->slice_rec;
4074
31.7k
        curr_rec  = c->curr_rec;
4075
4076
31.7k
        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
4077
31.7k
            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
4078
27.8k
            c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
4079
27.8k
            if (NULL == (c = cram_next_container(fd, b))) {
4080
11
                if (fd->ctr) {
4081
                    // prevent cram_close attempting to flush
4082
11
                    fd->ctr_mt = fd->ctr; // delay free when threading
4083
11
                    fd->ctr = NULL;
4084
11
                }
4085
11
                return -1;
4086
11
            }
4087
27.8k
        }
4088
4089
        /*
4090
         * Due to our processing order, some things we've already done we
4091
         * cannot easily undo. So when we first notice we should be packing
4092
         * multiple sequences per container we emit the small partial
4093
         * container as-is and then start a fresh one in a different mode.
4094
         */
4095
31.7k
        if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
4096
            // User selected auto-mode, we're currently using multi-seq, but
4097
            // have detected we don't need to.  Switch back to auto.
4098
348
            fd->multi_seq = -1;
4099
31.3k
        } else if (multi_seq) {
4100
            // We detected we need multi-seq
4101
4.32k
            fd->multi_seq = 1;
4102
4.32k
            c->multi_seq = 1;
4103
4.32k
            c->pos_sorted = 0;
4104
4105
            // Cram_next_container may end up flushing an existing one and
4106
            // triggering fd->embed_ref=2 if no reference is found.
4107
            // Embedded refs are incompatible with multi-seq, so we bail
4108
            // out and switch to no_ref in this scenario.  We do this
4109
            // within the container only, as multi_seq may be temporary
4110
            // and we switch back away from it again.
4111
4.32k
            pthread_mutex_lock(&fd->ref_lock);
4112
4.32k
            if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
4113
70
                hts_log_warning("Changing from embed_ref to no_ref mode");
4114
                // Should we update fd->embed_ref and no_ref here too?
4115
                // Doing so means if we go into multi-seq and back out
4116
                // again, eg due a cluster of tiny refs in the middle of
4117
                // much larger ones, then we bake in no-ref mode.
4118
                //
4119
                // However for unsorted data we're realistically not
4120
                // going to switch back.
4121
70
                c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
4122
70
                c->no_ref = fd->no_ref = 1;
4123
70
            }
4124
4.32k
            pthread_mutex_unlock(&fd->ref_lock);
4125
4126
4.32k
            if (!c->refs_used) {
4127
539
                pthread_mutex_lock(&fd->ref_lock);
4128
539
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4129
539
                pthread_mutex_unlock(&fd->ref_lock);
4130
539
                if (!c->refs_used)
4131
0
                    return -1;
4132
539
            }
4133
4.32k
        }
4134
4135
31.7k
        fd->last_slice = curr_rec - slice_rec;
4136
31.7k
        c->slice_rec = c->curr_rec;
4137
4138
        // Have we seen this reference before?
4139
31.7k
        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
4140
2
            embed_ref<=0 && !fd->unsorted && multi_seq) {
4141
4142
0
            if (!c->refs_used) {
4143
0
                pthread_mutex_lock(&fd->ref_lock);
4144
0
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4145
0
                pthread_mutex_unlock(&fd->ref_lock);
4146
0
                if (!c->refs_used)
4147
0
                    return -1;
4148
0
            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
4149
0
                pthread_mutex_lock(&fd->ref_lock);
4150
0
                fd->unsorted = 1;
4151
0
                fd->multi_seq = 1;
4152
0
                pthread_mutex_unlock(&fd->ref_lock);
4153
0
            }
4154
0
        }
4155
4156
31.7k
        c->curr_ref = bam_ref(b);
4157
31.7k
        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
4158
31.7k
    }
4159
4160
4.63M
    if (!c->bams) {
4161
        /* First time through, allocate a set of bam pointers */
4162
27.7k
        pthread_mutex_lock(&fd->bam_list_lock);
4163
27.7k
        if (fd->bl) {
4164
26.3k
            spare_bams *spare = fd->bl;
4165
26.3k
            c->bams = spare->bams;
4166
26.3k
            fd->bl = spare->next;
4167
26.3k
            free(spare);
4168
26.3k
        } else {
4169
1.42k
            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
4170
1.42k
            if (!c->bams) {
4171
0
                pthread_mutex_unlock(&fd->bam_list_lock);
4172
0
                return -1;
4173
0
            }
4174
1.42k
        }
4175
27.7k
        pthread_mutex_unlock(&fd->bam_list_lock);
4176
27.7k
    }
4177
4178
    /* Copy or alloc+copy the bam record, for later encoding */
4179
4.63M
    if (c->bams[c->curr_c_rec]) {
4180
3.62M
        if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
4181
0
            return -1;
4182
3.62M
    } else {
4183
1.00M
        c->bams[c->curr_c_rec] = bam_dup1(b);
4184
1.00M
        if (c->bams[c->curr_c_rec] == NULL)
4185
0
            return -1;
4186
1.00M
    }
4187
4.63M
    if (bam_seq_len(b)) {
4188
169k
        c->s_num_bases += bam_seq_len(b);
4189
4.46M
    } else {
4190
        // No sequence in BAM record.  CRAM doesn't directly support this
4191
        // case, it ends up being stored as a string of N's for each query
4192
        // consuming CIGAR operation.  As this can become very inefficient
4193
        // in time and memory, data where the query length is excessively
4194
        // long are rejected.
4195
4.46M
        hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
4196
4.46M
        if (qlen > 100000000) {
4197
22
            hts_log_error("CIGAR query length %"PRIhts_pos
4198
22
                          " for read \"%s\" is too long",
4199
22
                          qlen, bam_get_qname(b));
4200
22
            return -1;
4201
22
        }
4202
4.46M
        c->s_num_bases += qlen;
4203
4.46M
    }
4204
4.63M
    c->curr_rec++;
4205
4.63M
    c->curr_c_rec++;
4206
4.63M
    c->s_aux_bytes += bam_get_l_aux(b);
4207
4.63M
    c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
4208
4.63M
    fd->record_counter++;
4209
4210
4.63M
    return 0;
4211
4.63M
}