Coverage Report

Created: 2025-12-14 06:43

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_encode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2025 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <strings.h>
40
#include <zlib.h>
41
#include <sys/types.h>
42
#include <sys/stat.h>
43
#include <math.h>
44
#include <inttypes.h>
45
#include <ctype.h>
46
47
#include "cram.h"
48
#include "os.h"
49
#include "../sam_internal.h" // for nibble2base
50
#include "../htslib/hts.h"
51
#include "../htslib/hts_endian.h"
52
#include "../textutils_internal.h"
53
54
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
55
56
#define Z_CRAM_STRAT Z_FILTERED
57
//#define Z_CRAM_STRAT Z_RLE
58
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
59
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
60
61
static int process_one_read(cram_fd *fd, cram_container *c,
62
                            cram_slice *s, cram_record *cr,
63
                            bam_seq_t *b, int rnum, kstring_t *MD,
64
                            int embed_ref, int no_ref);
65
66
/*
67
 * Returns index of val into key.
68
 * Basically strchr(key, val)-key;
69
 */
70
309k
static int sub_idx(char *key, char val) {
71
309k
    int i;
72
73
774k
    for (i = 0; i < 4 && *key++ != val; i++);
74
309k
    return i;
75
309k
}
76
77
/*
78
 * Encodes a compression header block into a generic cram_block structure.
79
 *
80
 * Returns cram_block ptr on success
81
 *         NULL on failure
82
 */
83
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
84
                                           cram_block_compression_hdr *h,
85
20.8k
                                           int embed_ref) {
86
20.8k
    cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
87
20.8k
    cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
88
20.8k
    int i, mc, r = 0;
89
90
20.8k
    int no_ref = c->no_ref;
91
92
20.8k
    if (!cb || !map)
93
0
        return NULL;
94
95
    /*
96
     * This is a concatenation of several blocks of data:
97
     * header + landmarks, preservation map, read encoding map, and the tag
98
     * encoding map.
99
     * All 4 are variable sized and we need to know how large these are
100
     * before creating the compression header itself as this starts with
101
     * the total size (stored as a variable length string).
102
     */
103
104
    // Duplicated from container itself, and removed in 1.1
105
20.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
106
0
        r |= itf8_put_blk(cb, h->ref_seq_id);
107
0
        r |= itf8_put_blk(cb, h->ref_seq_start);
108
0
        r |= itf8_put_blk(cb, h->ref_seq_span);
109
0
        r |= itf8_put_blk(cb, h->num_records);
110
0
        r |= itf8_put_blk(cb, h->num_landmarks);
111
0
        for (i = 0; i < h->num_landmarks; i++) {
112
0
            r |= itf8_put_blk(cb, h->landmark[i]);
113
0
        }
114
0
    }
115
116
20.8k
    if (h->preservation_map) {
117
0
        kh_destroy(map, h->preservation_map);
118
0
        h->preservation_map = NULL;
119
0
    }
120
121
    /* Create in-memory preservation map */
122
    /* FIXME: should create this when we create the container */
123
20.8k
    if (c->num_records > 0) {
124
15.4k
        khint_t k;
125
15.4k
        int r;
126
127
15.4k
        if (!(h->preservation_map = kh_init(map)))
128
0
            return NULL;
129
130
15.4k
        k = kh_put(map, h->preservation_map, "RN", &r);
131
15.4k
        if (-1 == r) return NULL;
132
15.4k
        kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
133
134
15.4k
        if (CRAM_MAJOR_VERS(fd->version) == 1) {
135
0
            k = kh_put(map, h->preservation_map, "PI", &r);
136
0
            if (-1 == r) return NULL;
137
0
            kh_val(h->preservation_map, k).i = 0;
138
139
0
            k = kh_put(map, h->preservation_map, "UI", &r);
140
0
            if (-1 == r) return NULL;
141
0
            kh_val(h->preservation_map, k).i = 1;
142
143
0
            k = kh_put(map, h->preservation_map, "MI", &r);
144
0
            if (-1 == r) return NULL;
145
0
            kh_val(h->preservation_map, k).i = 1;
146
147
15.4k
        } else {
148
            // Technically SM was in 1.0, but wasn't in Java impl.
149
15.4k
            k = kh_put(map, h->preservation_map, "SM", &r);
150
15.4k
            if (-1 == r) return NULL;
151
15.4k
            kh_val(h->preservation_map, k).i = 0;
152
153
15.4k
            k = kh_put(map, h->preservation_map, "TD", &r);
154
15.4k
            if (-1 == r) return NULL;
155
15.4k
            kh_val(h->preservation_map, k).i = 0;
156
157
15.4k
            k = kh_put(map, h->preservation_map, "AP", &r);
158
15.4k
            if (-1 == r) return NULL;
159
15.4k
            kh_val(h->preservation_map, k).i = h->AP_delta;
160
161
15.4k
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
162
0
                k = kh_put(map, h->preservation_map, "QO", &r);
163
0
                if (-1 == r) return NULL;
164
0
                kh_val(h->preservation_map, k).i = h->qs_seq_orient;
165
0
            }
166
167
15.4k
            if (no_ref || embed_ref>0) {
168
                // Reference Required == No
169
13.7k
                k = kh_put(map, h->preservation_map, "RR", &r);
170
13.7k
                if (-1 == r) return NULL;
171
13.7k
                kh_val(h->preservation_map, k).i = 0;
172
13.7k
            }
173
15.4k
        }
174
15.4k
    }
175
176
    /* Encode preservation map; could collapse this and above into one */
177
20.8k
    mc = 0;
178
20.8k
    BLOCK_SIZE(map) = 0;
179
20.8k
    if (h->preservation_map) {
180
15.4k
        khint_t k;
181
182
15.4k
        for (k = kh_begin(h->preservation_map);
183
139k
             k != kh_end(h->preservation_map);
184
123k
             k++) {
185
123k
            const char *key;
186
123k
            khash_t(map) *pmap = h->preservation_map;
187
188
189
123k
            if (!kh_exist(pmap, k))
190
48.2k
                continue;
191
192
75.6k
            key = kh_key(pmap, k);
193
75.6k
            BLOCK_APPEND(map, key, 2);
194
195
75.6k
            switch(CRAM_KEY(key[0], key[1])) {
196
0
            case CRAM_KEY('M','I'):
197
0
            case CRAM_KEY('U','I'):
198
0
            case CRAM_KEY('P','I'):
199
15.4k
            case CRAM_KEY('A','P'):
200
30.9k
            case CRAM_KEY('R','N'):
201
44.6k
            case CRAM_KEY('R','R'):
202
44.6k
            case CRAM_KEY('Q','O'):
203
44.6k
                BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
204
44.6k
                break;
205
206
44.6k
            case CRAM_KEY('S','M'): {
207
15.4k
                char smat[5], *mp = smat;
208
                // Output format is for order ACGTN (minus ref base)
209
                // to store the code value 0-3 for each symbol.
210
                //
211
                // Note this is different to storing the symbols in order
212
                // that the codes occur from 0-3, which is what we used to
213
                // do.  (It didn't matter as we always had a fixed table in
214
                // the order.)
215
15.4k
                *mp++ =
216
15.4k
                    (sub_idx(h->substitution_matrix[0], 'C') << 6) |
217
15.4k
                    (sub_idx(h->substitution_matrix[0], 'G') << 4) |
218
15.4k
                    (sub_idx(h->substitution_matrix[0], 'T') << 2) |
219
15.4k
                    (sub_idx(h->substitution_matrix[0], 'N') << 0);
220
15.4k
                *mp++ =
221
15.4k
                    (sub_idx(h->substitution_matrix[1], 'A') << 6) |
222
15.4k
                    (sub_idx(h->substitution_matrix[1], 'G') << 4) |
223
15.4k
                    (sub_idx(h->substitution_matrix[1], 'T') << 2) |
224
15.4k
                    (sub_idx(h->substitution_matrix[1], 'N') << 0);
225
15.4k
                *mp++ =
226
15.4k
                    (sub_idx(h->substitution_matrix[2], 'A') << 6) |
227
15.4k
                    (sub_idx(h->substitution_matrix[2], 'C') << 4) |
228
15.4k
                    (sub_idx(h->substitution_matrix[2], 'T') << 2) |
229
15.4k
                    (sub_idx(h->substitution_matrix[2], 'N') << 0);
230
15.4k
                *mp++ =
231
15.4k
                    (sub_idx(h->substitution_matrix[3], 'A') << 6) |
232
15.4k
                    (sub_idx(h->substitution_matrix[3], 'C') << 4) |
233
15.4k
                    (sub_idx(h->substitution_matrix[3], 'G') << 2) |
234
15.4k
                    (sub_idx(h->substitution_matrix[3], 'N') << 0);
235
15.4k
                *mp++ =
236
15.4k
                    (sub_idx(h->substitution_matrix[4], 'A') << 6) |
237
15.4k
                    (sub_idx(h->substitution_matrix[4], 'C') << 4) |
238
15.4k
                    (sub_idx(h->substitution_matrix[4], 'G') << 2) |
239
15.4k
                    (sub_idx(h->substitution_matrix[4], 'T') << 0);
240
15.4k
                BLOCK_APPEND(map, smat, 5);
241
15.4k
                break;
242
15.4k
            }
243
244
15.4k
            case CRAM_KEY('T','D'): {
245
15.4k
                r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
246
15.4k
                BLOCK_APPEND(map,
247
15.4k
                             BLOCK_DATA(h->TD_blk),
248
15.4k
                             BLOCK_SIZE(h->TD_blk));
249
15.4k
                break;
250
15.4k
            }
251
252
15.4k
            default:
253
0
                hts_log_warning("Unknown preservation key '%.2s'", key);
254
0
                break;
255
75.6k
            }
256
257
75.6k
            mc++;
258
75.6k
        }
259
15.4k
    }
260
20.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
261
20.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
262
20.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
263
264
    /* rec encoding map */
265
20.8k
    mc = 0;
266
20.8k
    BLOCK_SIZE(map) = 0;
267
20.8k
    if (h->codecs[DS_BF]) {
268
15.4k
        if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
269
15.4k
                                          fd->version))
270
0
            return NULL;
271
15.4k
        mc++;
272
15.4k
    }
273
20.8k
    if (h->codecs[DS_CF]) {
274
15.4k
        if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
275
15.4k
                                          fd->version))
276
0
            return NULL;
277
15.4k
        mc++;
278
15.4k
    }
279
20.8k
    if (h->codecs[DS_RL]) {
280
15.4k
        if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
281
15.4k
                                          fd->version))
282
0
            return NULL;
283
15.4k
        mc++;
284
15.4k
    }
285
20.8k
    if (h->codecs[DS_AP]) {
286
15.4k
        if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
287
15.4k
                                          fd->version))
288
0
            return NULL;
289
15.4k
        mc++;
290
15.4k
    }
291
20.8k
    if (h->codecs[DS_RG]) {
292
15.4k
        if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
293
15.4k
                                          fd->version))
294
0
            return NULL;
295
15.4k
        mc++;
296
15.4k
    }
297
20.8k
    if (h->codecs[DS_MF]) {
298
15.4k
        if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
299
15.4k
                                          fd->version))
300
0
            return NULL;
301
15.4k
        mc++;
302
15.4k
    }
303
20.8k
    if (h->codecs[DS_NS]) {
304
15.4k
        if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
305
15.4k
                                          fd->version))
306
0
            return NULL;
307
15.4k
        mc++;
308
15.4k
    }
309
20.8k
    if (h->codecs[DS_NP]) {
310
15.4k
        if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
311
15.4k
                                          fd->version))
312
0
            return NULL;
313
15.4k
        mc++;
314
15.4k
    }
315
20.8k
    if (h->codecs[DS_TS]) {
316
15.4k
        if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
317
15.4k
                                          fd->version))
318
0
            return NULL;
319
15.4k
        mc++;
320
15.4k
    }
321
20.8k
    if (h->codecs[DS_NF]) {
322
8
        if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
323
8
                                          fd->version))
324
0
            return NULL;
325
8
        mc++;
326
8
    }
327
20.8k
    if (h->codecs[DS_TC]) {
328
0
        if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
329
0
                                          fd->version))
330
0
            return NULL;
331
0
        mc++;
332
0
    }
333
20.8k
    if (h->codecs[DS_TN]) {
334
0
        if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
335
0
                                          fd->version))
336
0
            return NULL;
337
0
        mc++;
338
0
    }
339
20.8k
    if (h->codecs[DS_TL]) {
340
15.4k
        if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
341
15.4k
                                          fd->version))
342
0
            return NULL;
343
15.4k
        mc++;
344
15.4k
    }
345
20.8k
    if (h->codecs[DS_FN]) {
346
6.26k
        if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
347
6.26k
                                          fd->version))
348
0
            return NULL;
349
6.26k
        mc++;
350
6.26k
    }
351
20.8k
    if (h->codecs[DS_FC]) {
352
6.22k
        if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
353
6.22k
                                          fd->version))
354
0
            return NULL;
355
6.22k
        mc++;
356
6.22k
    }
357
20.8k
    if (h->codecs[DS_FP]) {
358
6.22k
        if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
359
6.22k
                                          fd->version))
360
0
            return NULL;
361
6.22k
        mc++;
362
6.22k
    }
363
20.8k
    if (h->codecs[DS_BS]) {
364
248
        if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
365
248
                                          fd->version))
366
0
            return NULL;
367
248
        mc++;
368
248
    }
369
20.8k
    if (h->codecs[DS_IN]) {
370
15.4k
        if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
371
15.4k
                                          fd->version))
372
0
            return NULL;
373
15.4k
        mc++;
374
15.4k
    }
375
20.8k
    if (h->codecs[DS_DL]) {
376
1.51k
        if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
377
1.51k
                                          fd->version))
378
0
            return NULL;
379
1.51k
        mc++;
380
1.51k
    }
381
20.8k
    if (h->codecs[DS_BA]) {
382
1.67k
        if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
383
1.67k
                                          fd->version))
384
0
            return NULL;
385
1.67k
        mc++;
386
1.67k
    }
387
20.8k
    if (h->codecs[DS_BB]) {
388
15.4k
        if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
389
15.4k
                                          fd->version))
390
0
            return NULL;
391
15.4k
        mc++;
392
15.4k
    }
393
20.8k
    if (h->codecs[DS_MQ]) {
394
15.4k
        if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
395
15.4k
                                          fd->version))
396
0
            return NULL;
397
15.4k
        mc++;
398
15.4k
    }
399
20.8k
    if (h->codecs[DS_RN]) {
400
15.4k
        if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
401
15.4k
                                          fd->version))
402
0
            return NULL;
403
15.4k
        mc++;
404
15.4k
    }
405
20.8k
    if (h->codecs[DS_QS]) {
406
15.4k
        if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
407
15.4k
                                          fd->version))
408
0
            return NULL;
409
15.4k
        mc++;
410
15.4k
    }
411
20.8k
    if (h->codecs[DS_QQ]) {
412
0
        if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
413
0
                                          fd->version))
414
0
            return NULL;
415
0
        mc++;
416
0
    }
417
20.8k
    if (h->codecs[DS_RI]) {
418
15.4k
        if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
419
15.4k
                                          fd->version))
420
0
            return NULL;
421
15.4k
        mc++;
422
15.4k
    }
423
20.8k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
424
20.8k
        if (h->codecs[DS_SC]) {
425
15.4k
            if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
426
15.4k
                                              fd->version))
427
0
                return NULL;
428
15.4k
            mc++;
429
15.4k
        }
430
20.8k
        if (h->codecs[DS_RS]) {
431
50
            if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
432
50
                                              fd->version))
433
0
                return NULL;
434
50
            mc++;
435
50
        }
436
20.8k
        if (h->codecs[DS_PD]) {
437
36
            if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
438
36
                                              fd->version))
439
0
                return NULL;
440
36
            mc++;
441
36
        }
442
20.8k
        if (h->codecs[DS_HC]) {
443
1.31k
            if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
444
1.31k
                                              fd->version))
445
0
                return NULL;
446
1.31k
            mc++;
447
1.31k
        }
448
20.8k
    }
449
20.8k
    if (h->codecs[DS_TM]) {
450
0
        if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
451
0
                                          fd->version))
452
0
            return NULL;
453
0
        mc++;
454
0
    }
455
20.8k
    if (h->codecs[DS_TV]) {
456
0
        if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
457
0
                                          fd->version))
458
0
            return NULL;
459
0
        mc++;
460
0
    }
461
20.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
462
20.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
463
20.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
464
465
    /* tag encoding map */
466
20.8k
    mc = 0;
467
20.8k
    BLOCK_SIZE(map) = 0;
468
20.8k
    if (c->tags_used) {
469
15.4k
        khint_t k;
470
471
136k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
472
120k
            int key;
473
120k
            if (!kh_exist(c->tags_used, k))
474
62.1k
                continue;
475
476
58.7k
            key = kh_key(c->tags_used, k);
477
58.7k
            cram_codec *cd = kh_val(c->tags_used, k)->codec;
478
479
58.7k
            r |= (fd->vv.varint_put32_blk(map, key) <= 0);
480
58.7k
            if (-1 == cd->store(cd, map, NULL, fd->version))
481
0
                return NULL;
482
483
58.7k
            mc++;
484
58.7k
        }
485
15.4k
    }
486
487
20.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
488
20.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
489
20.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
490
491
20.8k
    hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
492
493
20.8k
    BLOCK_UPLEN(cb);
494
495
20.8k
    cram_free_block(map);
496
497
20.8k
    if (r >= 0)
498
20.8k
        return cb;
499
500
0
 block_err:
501
0
    return NULL;
502
20.8k
}
503
504
505
/*
506
 * Encodes a slice compression header.
507
 *
508
 * Returns cram_block on success
509
 *         NULL on failure
510
 */
511
15.6k
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
512
15.6k
    char *buf;
513
15.6k
    char *cp;
514
15.6k
    cram_block *b = cram_new_block(MAPPED_SLICE, 0);
515
15.6k
    int j;
516
517
15.6k
    if (!b)
518
0
        return NULL;
519
520
15.6k
    cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
521
15.6k
    if (NULL == buf) {
522
0
        cram_free_block(b);
523
0
        return NULL;
524
0
    }
525
526
15.6k
    cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
527
15.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
528
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
529
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
530
15.6k
    } else {
531
15.6k
        if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
532
153
            hts_log_error("Reference position too large for CRAM 3");
533
153
            cram_free_block(b);
534
153
            free(buf);
535
153
            return NULL;
536
153
        }
537
15.4k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
538
15.4k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
539
15.4k
    }
540
15.4k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
541
15.4k
    if (CRAM_MAJOR_VERS(fd->version) == 2)
542
0
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
543
15.4k
    else if (CRAM_MAJOR_VERS(fd->version) >= 3)
544
15.4k
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
545
15.4k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
546
15.4k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
547
113k
    for (j = 0; j < s->hdr->num_content_ids; j++) {
548
97.8k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
549
97.8k
    }
550
15.4k
    if (s->hdr->content_type == MAPPED_SLICE)
551
15.4k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
552
553
15.4k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
554
15.4k
        memcpy(cp, s->hdr->md5, 16); cp += 16;
555
15.4k
    }
556
557
15.4k
    assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
558
559
15.4k
    b->data = (unsigned char *)buf;
560
15.4k
    b->comp_size = b->uncomp_size = cp-buf;
561
562
15.4k
    return b;
563
15.4k
}
564
565
566
/*
567
 * Encodes a single read.
568
 *
569
 * Returns 0 on success
570
 *        -1 on failure
571
 */
572
static int cram_encode_slice_read(cram_fd *fd,
573
                                  cram_container *c,
574
                                  cram_block_compression_hdr *h,
575
                                  cram_slice *s,
576
                                  cram_record *cr,
577
2.01M
                                  int64_t *last_pos) {
578
2.01M
    int r = 0;
579
2.01M
    int32_t i32;
580
2.01M
    int64_t i64;
581
2.01M
    unsigned char uc;
582
583
    //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
584
585
    //printf("BF=0x%x\n", cr->flags);
586
    //      bf = cram_flag_swap[cr->flags];
587
2.01M
    i32 = fd->cram_flag_swap[cr->flags & 0xfff];
588
2.01M
    r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
589
590
2.01M
    i32 = cr->cram_flags & CRAM_FLAG_MASK;
591
2.01M
    r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
592
593
2.01M
    if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
594
4.88k
        r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
595
596
2.01M
    r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
597
598
2.01M
    if (c->pos_sorted) {
599
2.01M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
600
0
            i64 = cr->apos - *last_pos;
601
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
602
2.01M
        } else {
603
2.01M
            i32 = cr->apos - *last_pos;
604
2.01M
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
605
2.01M
        }
606
2.01M
        *last_pos = cr->apos;
607
2.01M
    } else {
608
7.47k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
609
0
            i64 = cr->apos;
610
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
611
7.47k
        } else {
612
7.47k
            i32 = cr->apos;
613
7.47k
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
614
7.47k
        }
615
7.47k
    }
616
617
2.01M
    r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
618
619
2.01M
    if (cr->cram_flags & CRAM_FLAG_DETACHED) {
620
2.01M
        i32 = cr->mate_flags;
621
2.01M
        r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
622
623
2.01M
        r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
624
2.01M
                                      (char *)&cr->mate_ref_id, 1);
625
626
2.01M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627
0
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
628
0
                                          (char *)&cr->mate_pos, 1);
629
0
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
630
0
                                          (char *)&cr->tlen, 1);
631
2.01M
        } else {
632
2.01M
            i32 = cr->mate_pos;
633
2.01M
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
634
2.01M
                                          (char *)&i32, 1);
635
2.01M
            i32 = cr->tlen;
636
2.01M
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
637
2.01M
                                          (char *)&i32, 1);
638
2.01M
        }
639
2.01M
    } else {
640
16
        if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
641
8
            r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
642
8
                                          (char *)&cr->mate_line, 1);
643
8
        }
644
16
        if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
645
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
646
0
                r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
647
0
                                              (char *)&cr->tlen, 1);
648
0
            }
649
0
        }
650
16
    }
651
652
    /* Aux tags */
653
2.01M
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
654
0
        int j;
655
0
        uc = cr->ntags;
656
0
        r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
657
658
0
        for (j = 0; j < cr->ntags; j++) {
659
0
            uint32_t i32 = s->TN[cr->TN_idx + j]; // id
660
0
            r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
661
0
        }
662
2.01M
    } else {
663
2.01M
        r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
664
2.01M
    }
665
666
    // qual
667
    // QS codec : Already stored in block[2].
668
669
    // features (diffs)
670
2.01M
    if (!(cr->flags & BAM_FUNMAP)) {
671
18.4k
        int prev_pos = 0, j;
672
673
18.4k
        r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
674
18.4k
                                      (char *)&cr->nfeature, 1);
675
55.2k
        for (j = 0; j < cr->nfeature; j++) {
676
36.8k
            cram_feature *f = &s->features[cr->feature + j];
677
678
36.8k
            uc = f->X.code;
679
36.8k
            r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
680
36.8k
            i32 = f->X.pos - prev_pos;
681
36.8k
            r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
682
36.8k
            prev_pos = f->X.pos;
683
684
36.8k
            switch(f->X.code) {
685
                //char *seq;
686
687
511
            case 'X':
688
                //fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
689
690
511
                uc = f->X.base;
691
511
                r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
692
511
                                              (char *)&uc, 1);
693
511
                break;
694
13.5k
            case 'S':
695
                // Already done
696
                //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
697
                //                              BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
698
                //                              f->S.len);
699
700
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
701
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
702
                //                                  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
703
                //                                  f->S.len);
704
                //}
705
13.5k
                break;
706
181
            case 'I':
707
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
708
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
709
                //                           seq, f->S.len);
710
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
711
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
712
                //                                  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
713
                //                                  f->I.len);
714
                //}
715
181
                break;
716
56
            case 'i':
717
56
                uc = f->i.base;
718
56
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
719
56
                                              (char *)&uc, 1);
720
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
721
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
722
                //                           seq, 1);
723
56
                break;
724
14.7k
            case 'D':
725
14.7k
                i32 = f->D.len;
726
14.7k
                r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
727
14.7k
                                              (char *)&i32, 1);
728
14.7k
                break;
729
730
920
            case 'B':
731
                //                  // Used when we try to store a non ACGTN base or an N
732
                //                  // that aligns against a non ACGTN reference
733
734
920
                uc  = f->B.base;
735
920
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
736
920
                                              (char *)&uc, 1);
737
738
                //                  Already added
739
                //                  uc  = f->B.qual;
740
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
741
                //                                           (char *)&uc, 1);
742
920
                break;
743
744
261
            case 'b':
745
                // string of bases
746
261
                r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
747
261
                                              (char *)BLOCK_DATA(s->seqs_blk)
748
261
                                                      + f->b.seq_idx,
749
261
                                              f->b.len);
750
261
                break;
751
752
27
            case 'Q':
753
                //                  Already added
754
                //                  uc  = f->B.qual;
755
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
756
                //                                           (char *)&uc, 1);
757
27
                break;
758
759
121
            case 'N':
760
121
                i32 = f->N.len;
761
121
                r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
762
121
                                              (char *)&i32, 1);
763
121
                break;
764
765
161
            case 'P':
766
161
                i32 = f->P.len;
767
161
                r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
768
161
                                              (char *)&i32, 1);
769
161
                break;
770
771
6.35k
            case 'H':
772
6.35k
                i32 = f->H.len;
773
6.35k
                r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
774
6.35k
                                              (char *)&i32, 1);
775
6.35k
                break;
776
777
778
0
            default:
779
0
                hts_log_error("Unhandled feature code %c", f->X.code);
780
0
                return -1;
781
36.8k
            }
782
36.8k
        }
783
784
18.4k
        r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
785
18.4k
                                      (char *)&cr->mqual, 1);
786
2.00M
    } else {
787
2.00M
        char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
788
2.00M
        if (cr->len)
789
82.1k
            r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
790
2.00M
    }
791
792
2.01M
    return r ? -1 : 0;
793
2.01M
}
794
795
796
/*
797
 * Applies various compression methods to specific blocks, depending on
798
 * known observations of how data series compress.
799
 *
800
 * Returns 0 on success
801
 *        -1 on failure
802
 */
803
15.6k
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
804
15.6k
    int level = fd->level, i;
805
15.6k
    int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
806
15.6k
    int v31_or_above = (fd->version >= (3<<8)+1);
807
808
    /* Compress the CORE Block too, with minimal zlib level */
809
15.6k
    if (level > 5 && s->block[0]->uncomp_size > 500)
810
0
        cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
811
812
15.6k
    if (fd->use_bz2)
813
0
        method |= 1<<BZIP2;
814
815
15.6k
    int method_rans   = (1<<RANS0) | (1<<RANS1);
816
15.6k
    int method_ranspr = method_rans;
817
818
15.6k
    if (fd->use_rans) {
819
15.6k
        method_ranspr = (1<<RANS_PR0)   | (1<<RANS_PR1);
820
15.6k
        if (level > 1)
821
15.6k
            method_ranspr |=
822
15.6k
                  (1<<RANS_PR64)  | (1<<RANS_PR9)
823
15.6k
                | (1<<RANS_PR128) | (1<<RANS_PR193);
824
15.6k
        if (level > 5)
825
0
            method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
826
15.6k
    }
827
828
15.6k
    if (fd->use_rans) {
829
15.6k
        methodF |= v31_or_above ? method_ranspr : method_rans;
830
15.6k
        method  |= v31_or_above ? method_ranspr : method_rans;
831
15.6k
    }
832
833
15.6k
    int method_arith   = 0;
834
15.6k
    if (fd->use_arith) {
835
0
        method_arith = (1<<ARITH_PR0)   | (1<<ARITH_PR1);
836
0
        if (level > 1)
837
0
            method_arith |=
838
0
                  (1<<ARITH_PR64)  | (1<<ARITH_PR9)
839
0
                | (1<<ARITH_PR128) | (1<<ARITH_PR129)
840
0
                | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
841
0
    }
842
15.6k
    if (fd->use_arith && v31_or_above) {
843
0
        methodF |= method_arith;
844
0
        method  |= method_arith;
845
0
    }
846
847
15.6k
    if (fd->use_lzma)
848
0
        method |= (1<<LZMA);
849
850
    /* Faster method for data series we only need entropy encoding on */
851
15.6k
    methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
852
15.6k
    if (level >= 5) {
853
15.6k
        method |= 1<<GZIP_1;
854
15.6k
        methodF = method;
855
15.6k
    }
856
15.6k
    if (level == 1) {
857
0
        method &= ~(1<<GZIP);
858
0
        method |=   1<<GZIP_1;
859
0
        methodF = method;
860
0
    }
861
862
15.6k
    int qmethod  = method;
863
15.6k
    int qmethodF = method;
864
15.6k
    if (v31_or_above && fd->use_fqz) {
865
0
        qmethod  |= 1<<FQZ;
866
0
        qmethodF |= 1<<FQZ;
867
0
        if (fd->level > 4) {
868
0
            qmethod  |= 1<<FQZ_b;
869
0
            qmethodF |= 1<<FQZ_b;
870
0
        }
871
0
        if (fd->level > 6) {
872
0
            qmethod  |= (1<<FQZ_c) | (1<<FQZ_d);
873
0
            qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
874
0
        }
875
0
    }
876
877
15.6k
    pthread_mutex_lock(&fd->metrics_lock);
878
750k
    for (i = 0; i < DS_END; i++)
879
734k
        if (c->stats[i] && c->stats[i]->nvals > 16)
880
122
            fd->m[i]->unpackable = 1;
881
15.6k
    pthread_mutex_unlock(&fd->metrics_lock);
882
883
    /* Specific compression methods for certain block types */
884
15.6k
    if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
885
15.6k
                             method, level))
886
0
        return -1;
887
888
15.6k
    if (fd->level == 0) {
889
        /* Do nothing */
890
15.6k
    } else if (fd->level == 1) {
891
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
892
0
                                 qmethodF, 1))
893
0
            return -1;
894
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
895
0
            if (s->block[i])
896
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
897
0
                                         method, 1))
898
0
                    return -1;
899
0
        }
900
15.6k
    } else if (fd->level < 3) {
901
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
902
0
                                 qmethod, 1))
903
0
            return -1;
904
0
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
905
0
                                 method, 1))
906
0
            return -1;
907
0
        if (s->block[DS_BB])
908
0
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
909
0
                                     method, 1))
910
0
                return -1;
911
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
912
0
            if (s->block[i])
913
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
914
0
                                         method, level))
915
0
                    return -1;
916
0
        }
917
15.6k
    } else {
918
15.6k
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
919
15.6k
                                 qmethod, level))
920
0
            return -1;
921
15.6k
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
922
15.6k
                                 method, level))
923
0
            return -1;
924
15.6k
        if (s->block[DS_BB])
925
15.6k
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
926
15.6k
                                     method, level))
927
0
                return -1;
928
156k
        for (i = DS_aux; i <= DS_aux_oz; i++) {
929
140k
            if (s->block[i])
930
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
931
0
                                         method, level))
932
0
                    return -1;
933
140k
        }
934
15.6k
    }
935
936
    // NAME: best is generally xz, bzip2, zlib then rans1
937
15.6k
    int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
938
15.6k
    if (fd->version >= (3<<8)+1 && fd->use_tok)
939
15.6k
        method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
940
15.6k
    if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
941
15.6k
                             method_rn, level))
942
0
        return -1;
943
944
    // NS shows strong local correlation as rearrangements are localised
945
15.6k
    if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
946
521
        if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
947
521
                                 method, level))
948
0
            return -1;
949
950
951
    /*
952
     * Compress any auxiliary tags with their own per-tag metrics
953
     */
954
15.6k
    {
955
15.6k
        int i;
956
74.6k
        for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
957
59.0k
            if (!s->block[i] || s->block[i] == s->block[0])
958
0
                continue;
959
960
59.0k
            if (s->block[i]->method != RAW)
961
0
                continue;
962
963
59.0k
            if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
964
59.0k
                                     method, level))
965
0
                return -1;
966
59.0k
        }
967
15.6k
    }
968
969
    /*
970
     * Minimal compression of any block still uncompressed, bar CORE
971
     */
972
15.6k
    {
973
15.6k
        int i;
974
734k
        for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
975
719k
            if (!s->block[i] || s->block[i] == s->block[0])
976
604k
                continue;
977
978
114k
            if (s->block[i]->method != RAW)
979
3.73k
                continue;
980
981
111k
            if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
982
111k
                                    methodF, level))
983
0
                return -1;
984
111k
        }
985
15.6k
    }
986
987
15.6k
    return 0;
988
15.6k
}
989
990
/*
991
 * Allocates a block associated with the cram codec associated with
992
 * data series ds_id or the internal codec_id (depending on codec
993
 * type).
994
 *
995
 * The ds_ids are what end up written to disk as an external block.
996
 * The c_ids are internal and used when daisy-chaining transforms
997
 * such as MAP and RLE.  These blocks are also allocated, but
998
 * are ephemeral in nature.  (The codecs themselves cannot allocate
999
 * these as the same codec pointer may be operating on multiple slices
1000
 * if we're using a multi-slice container.)
1001
 *
1002
 * Returns 0 on success
1003
 *        -1 on failure
1004
 */
1005
453k
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
1006
453k
    if (!codec)
1007
148k
        return 0;
1008
1009
305k
    switch(codec->codec) {
1010
    // Codecs which are hard-coded to use the CORE block
1011
0
    case E_GOLOMB:
1012
196k
    case E_HUFFMAN:
1013
197k
    case E_BETA:
1014
197k
    case E_SUBEXP:
1015
197k
    case E_GOLOMB_RICE:
1016
197k
    case E_GAMMA:
1017
197k
        codec->out = s->block[0];
1018
197k
        break;
1019
1020
    // Codecs which don't use external blocks
1021
0
    case E_CONST_BYTE:
1022
0
    case E_CONST_INT:
1023
0
       codec->out = NULL;
1024
0
       break;
1025
1026
    // Codecs that emit directly to external blocks
1027
61.1k
    case E_EXTERNAL:
1028
61.1k
    case E_VARINT_UNSIGNED:
1029
61.1k
    case E_VARINT_SIGNED:
1030
61.1k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1031
0
            return -1;
1032
61.1k
        codec->u.external.content_id = ds_id;
1033
61.1k
        codec->out = s->block[ds_id];
1034
61.1k
        break;
1035
1036
31.2k
    case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1037
31.2k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1038
0
            return -1;
1039
31.2k
        codec->u.byte_array_stop.content_id = ds_id;
1040
31.2k
        codec->out = s->block[ds_id];
1041
31.2k
        break;
1042
1043
1044
    // Codecs that contain sub-codecs which may in turn emit to external blocks
1045
15.6k
    case E_BYTE_ARRAY_LEN: {
1046
15.6k
        cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1047
15.6k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1048
0
            return -1;
1049
15.6k
        bal = codec->u.e_byte_array_len.val_codec;
1050
15.6k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1051
0
            return -1;
1052
1053
15.6k
        break;
1054
15.6k
    }
1055
1056
15.6k
    case E_XRLE:
1057
0
        if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1058
                                //ds_id == DS_QS ? DS_QS_len : ds_id))
1059
0
            return -1;
1060
0
        if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1061
0
            return -1;
1062
1063
0
        break;
1064
1065
0
    case E_XPACK:
1066
0
        if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1067
0
            return -1;
1068
0
        codec->out = cram_new_block(0, 0); // ephemeral
1069
0
        if (!codec->out)
1070
0
            return -1;
1071
1072
0
        break;
1073
1074
0
    case E_XDELTA:
1075
0
        if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1076
0
            return -1;
1077
0
        codec->out = cram_new_block(0, 0); // ephemeral
1078
0
        if (!codec->out)
1079
0
            return -1;
1080
1081
0
        break;
1082
1083
0
    default:
1084
0
        break;
1085
305k
    }
1086
1087
305k
    return 0;
1088
305k
}
1089
1090
/*
1091
 * Encodes a single slice from a container
1092
 *
1093
 * Returns 0 on success
1094
 *        -1 on failure
1095
 */
1096
static int cram_encode_slice(cram_fd *fd, cram_container *c,
1097
                             cram_block_compression_hdr *h, cram_slice *s,
1098
15.6k
                             int embed_ref) {
1099
15.6k
    int rec, r = 0;
1100
15.6k
    int64_t last_pos;
1101
15.6k
    enum cram_DS_ID id;
1102
1103
    /*
1104
     * Slice external blocks:
1105
     * ID 0 => base calls (insertions, soft-clip)
1106
     * ID 1 => qualities
1107
     * ID 2 => names
1108
     * ID 3 => TS (insert size), NP (next frag)
1109
     * ID 4 => tag values
1110
     * ID 6 => tag IDs (TN), if CRAM_V1.0
1111
     * ID 7 => TD tag dictionary, if !CRAM_V1.0
1112
     */
1113
1114
    /* Create cram slice header */
1115
15.6k
    s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
1116
15.6k
        ? DS_ref
1117
15.6k
        : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1118
15.6k
    s->hdr->record_counter = c->num_records + c->record_counter;
1119
15.6k
    c->num_records += s->hdr->num_records;
1120
1121
15.6k
    int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1122
15.6k
    s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1123
15.6k
    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1124
15.6k
    if (!s->block || !s->hdr->block_content_ids)
1125
0
        return -1;
1126
1127
    // Create first fixed blocks, always external.
1128
    // CORE
1129
15.6k
    if (!(s->block[0] = cram_new_block(CORE, 0)))
1130
0
        return -1;
1131
1132
    // TN block for CRAM v1
1133
15.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
1134
0
        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1135
0
            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1136
0
            h->codecs[DS_TN]->u.external.content_id = DS_TN;
1137
0
        } else {
1138
0
            s->block[DS_TN] = s->block[0];
1139
0
        }
1140
0
    }
1141
1142
    // Embedded reference
1143
15.6k
    if (embed_ref>0) {
1144
6.74k
        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1145
0
            return -1;
1146
6.74k
        s->ref_id = DS_ref; // needed?
1147
6.74k
        BLOCK_APPEND(s->block[DS_ref],
1148
6.74k
                     c->ref + s->hdr->ref_seq_start - c->ref_start,
1149
6.74k
                     s->hdr->ref_seq_span);
1150
6.74k
    }
1151
1152
    /*
1153
     * All the data-series blocks if appropriate.
1154
     */
1155
437k
    for (id = DS_QS; id < DS_TN; id++) {
1156
422k
        if (cram_allocate_block(h->codecs[id], s, id) < 0)
1157
0
            return -1;
1158
422k
    }
1159
1160
    /*
1161
     * Add in the external tag blocks too.
1162
     */
1163
15.6k
    if (c->tags_used) {
1164
15.6k
        int n;
1165
15.6k
        s->hdr->num_blocks = DS_END;
1166
74.6k
        for (n = 0; n < s->naux_block; n++) {
1167
59.0k
            s->block[s->hdr->num_blocks++] = s->aux_block[n];
1168
59.0k
            s->aux_block[n] = NULL;
1169
59.0k
        }
1170
15.6k
    }
1171
1172
    /* Encode reads */
1173
15.6k
    last_pos = s->hdr->ref_seq_start;
1174
2.03M
    for (rec = 0; rec < s->hdr->num_records; rec++) {
1175
2.01M
        cram_record *cr = &s->crecs[rec];
1176
2.01M
        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1177
0
            return -1;
1178
2.01M
    }
1179
1180
15.6k
    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1181
15.6k
    s->block[0]->comp_size = s->block[0]->uncomp_size;
1182
1183
    // Make sure the fixed blocks point to the correct sources
1184
15.6k
    if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1185
15.6k
    s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1186
15.6k
    if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1187
15.6k
    s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1188
15.6k
    if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1189
15.6k
    s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1190
15.6k
    if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1191
15.6k
    s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1192
1193
    // Finalise any data transforms.
1194
437k
    for (id = DS_QS; id < DS_TN; id++) {
1195
422k
       if (h->codecs[id] && h->codecs[id]->flush)
1196
0
           h->codecs[id]->flush(h->codecs[id]);
1197
422k
    }
1198
1199
    // Ensure block sizes are up to date.
1200
793k
    for (id = 1; id < s->hdr->num_blocks; id++) {
1201
778k
        if (!s->block[id] || s->block[id] == s->block[0])
1202
604k
            continue;
1203
1204
173k
        if (s->block[id]->uncomp_size == 0)
1205
173k
            BLOCK_UPLEN(s->block[id]);
1206
173k
    }
1207
1208
    // Compress it all
1209
15.6k
    if (cram_compress_slice(fd, c, s) == -1)
1210
0
        return -1;
1211
1212
    // Collapse empty blocks and create hdr_block
1213
15.6k
    {
1214
15.6k
        int i, j;
1215
1216
15.6k
        s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1217
15.6k
                                            s->hdr->num_blocks * sizeof(int32_t));
1218
15.6k
        if (!s->hdr->block_content_ids)
1219
0
            return -1;
1220
1221
793k
        for (i = j = 1; i < s->hdr->num_blocks; i++) {
1222
778k
            if (!s->block[i] || s->block[i] == s->block[0])
1223
604k
                continue;
1224
173k
            if (s->block[i]->uncomp_size == 0) {
1225
75.4k
                cram_free_block(s->block[i]);
1226
75.4k
                s->block[i] = NULL;
1227
75.4k
                continue;
1228
75.4k
            }
1229
98.3k
            s->block[j] = s->block[i];
1230
98.3k
            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1231
98.3k
            j++;
1232
98.3k
        }
1233
15.6k
        s->hdr->num_content_ids = j-1;
1234
15.6k
        s->hdr->num_blocks = j;
1235
1236
15.6k
        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1237
153
            return -1;
1238
15.6k
    }
1239
1240
15.4k
    return r ? -1 : 0;
1241
1242
0
 block_err:
1243
0
    return -1;
1244
15.6k
}
1245
1246
2.01M
static inline const char *bam_data_end(bam1_t *b) {
1247
2.01M
    return (const char *)b->data + b->l_data;
1248
2.01M
}
1249
1250
/*
1251
 * A bounds checking version of bam_aux2i.
1252
 */
1253
0
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
1254
0
    int type = *aux++;
1255
0
    switch (type) {
1256
0
        case 'c':
1257
0
            if (aux_end - aux < 1) {
1258
0
                errno = EINVAL;
1259
0
                return 0;
1260
0
            }
1261
0
            return *(int8_t *)aux;
1262
0
        case 'C':
1263
0
            if (aux_end - aux < 1) {
1264
0
                errno = EINVAL;
1265
0
                return 0;
1266
0
            }
1267
0
            return *aux;
1268
0
        case 's':
1269
0
            if (aux_end - aux < 2) {
1270
0
                errno = EINVAL;
1271
0
                return 0;
1272
0
            }
1273
0
            return le_to_i16(aux);
1274
0
        case 'S':
1275
0
            if (aux_end - aux < 2) {
1276
0
                errno = EINVAL;
1277
0
                return 0;
1278
0
            }
1279
0
            return le_to_u16(aux);
1280
0
        case 'i':
1281
0
            if (aux_end - aux < 4) {
1282
0
                errno = EINVAL;
1283
0
                return 0;
1284
0
            }
1285
0
            return le_to_i32(aux);
1286
0
        case 'I':
1287
0
            if (aux_end - aux < 4) {
1288
0
                errno = EINVAL;
1289
0
                return 0;
1290
0
            }
1291
0
            return le_to_u32(aux);
1292
0
        default:
1293
0
            errno = EINVAL;
1294
0
    }
1295
0
    return 0;
1296
0
}
1297
1298
/*
1299
 * Returns the number of expected read names for this record.
1300
 */
1301
0
static int expected_template_count(bam_seq_t *b) {
1302
0
    int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1303
1304
0
    uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1305
0
    if (TC) {
1306
0
        int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
1307
0
        if (expected < n)
1308
0
            expected = n;
1309
0
    }
1310
1311
0
    if (!TC && bam_aux_get(b, "SA")) {
1312
        // We could count the semicolons, but we'd have to do this for
1313
        // read1, read2 and read(not-1-or-2) combining the results
1314
        // together.  This is a cheap and safe alternative for now.
1315
0
        expected = INT_MAX;
1316
0
    }
1317
1318
0
    return expected;
1319
0
}
1320
1321
/*
1322
 * Lossily reject read names.
1323
 *
1324
 * The rule here is that if all reads for this template reside in the
1325
 * same slice then we can lose the name.  Otherwise we keep them as we
1326
 * do not know when (or if) the other reads will turn up.
1327
 *
1328
 * Note there may be only 1 read (non-paired library) or more than 2
1329
 * reads (paired library with supplementary reads), or other weird
1330
 * setups.  We need to know how many are expected.  Ways to guess:
1331
 *
1332
 * - Flags (0x1 - has > 1 read)
1333
 * - TC aux field (not mandatory)
1334
 * - SA tags (count semicolons, NB per fragment so sum - hard)
1335
 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1336
 *
1337
 * Returns 0 on success
1338
 *        -1 on failure
1339
 */
1340
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1341
16.3k
                            int bam_start) {
1342
16.3k
    int r1, r2, ret = -1;
1343
1344
    // Initialise cram_flags
1345
2.03M
    for (r2 = 0; r2 < s->hdr->num_records; r2++)
1346
2.02M
        s->crecs[r2].cram_flags = 0;
1347
1348
16.3k
    if (!fd->lossy_read_names)
1349
16.3k
        return 0;
1350
1351
0
    khash_t(m_s2u64) *names = kh_init(m_s2u64);
1352
0
    if (!names)
1353
0
        goto fail;
1354
1355
    // 1: Iterate through names to count frequency
1356
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1357
        //cram_record *cr = &s->crecs[r2];
1358
0
        bam_seq_t *b = c->bams[r1];
1359
0
        khint_t k;
1360
0
        int n;
1361
0
        uint64_t e;
1362
0
        union {
1363
0
            uint64_t i64;
1364
0
            struct {
1365
0
                int32_t e,c; // expected & observed counts.
1366
0
            } counts;
1367
0
        } u;
1368
1369
0
        e = expected_template_count(b);
1370
0
        u.counts.e = e; u.counts.c = 1;
1371
1372
0
        k = kh_put(m_s2u64, names, bam_name(b), &n);
1373
0
        if (n == -1)
1374
0
            goto fail;
1375
1376
0
        if (n == 0) {
1377
            // not a new name
1378
0
            u.i64 = kh_val(names, k);
1379
0
            if (u.counts.e != e) {
1380
                // different expectation or already hit the max
1381
                //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1382
0
                kh_val(names, k) = 0;
1383
0
            } else {
1384
0
                u.counts.c++;
1385
0
                if (u.counts.e == u.counts.c) {
1386
                    // Reached expected count.
1387
0
                    kh_val(names, k) = -1;
1388
0
                } else {
1389
0
                    kh_val(names, k) = u.i64;
1390
0
                }
1391
0
            }
1392
0
        } else {
1393
            // new name
1394
0
            kh_val(names, k) = u.i64;
1395
0
        }
1396
0
    }
1397
1398
    // 2: Remove names if all present (hd.i == -1)
1399
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1400
0
        cram_record *cr = &s->crecs[r2];
1401
0
        bam_seq_t *b = c->bams[r1];
1402
0
        khint_t k;
1403
1404
0
        k = kh_get(m_s2u64, names, bam_name(b));
1405
1406
0
        if (k == kh_end(names))
1407
0
            goto fail;
1408
1409
0
        if (kh_val(names, k) == -1)
1410
0
            cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1411
0
    }
1412
1413
0
    ret = 0;
1414
0
 fail: // ret==-1
1415
1416
0
    if (names)
1417
0
        kh_destroy(m_s2u64, names);
1418
1419
0
    return ret;
1420
0
}
1421
1422
/*
1423
 * Adds the reading names.  We do this here as a separate pass rather
1424
 * than per record in the process_one_read calls as that function can
1425
 * go back and change the CRAM_FLAG_DETACHED status of a previously
1426
 * processed read if it subsequently determines the TLEN field is
1427
 * incorrect.  Given DETACHED reads always try to decode read names,
1428
 * we need to know their status before generating the read-name block.
1429
 *
1430
 * Output is an update s->name_blk, and cr->name / cr->name_len
1431
 * fields.
1432
 */
1433
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1434
15.6k
                          int bam_start) {
1435
15.6k
    int r1, r2;
1436
15.6k
    int keep_names = !fd->lossy_read_names;
1437
1438
15.6k
    for (r1 = bam_start, r2 = 0;
1439
2.03M
         r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1440
2.01M
         r1++, r2++) {
1441
2.01M
        cram_record *cr = &s->crecs[r2];
1442
2.01M
        bam_seq_t *b = c->bams[r1];
1443
1444
2.01M
        cr->name        = BLOCK_SIZE(s->name_blk);
1445
2.01M
        if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1446
2.01M
            if (CRAM_MAJOR_VERS(fd->version) >= 4
1447
0
                && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1448
0
                && cr->mate_line) {
1449
                // Dedup read names in V4
1450
0
                BLOCK_APPEND(s->name_blk, "\0", 1);
1451
0
                cr->name_len    = 1;
1452
2.01M
            } else {
1453
2.01M
                BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1454
2.01M
                cr->name_len    = bam_name_len(b);
1455
2.01M
            }
1456
2.01M
        } else {
1457
            // Can only discard duplicate names if not detached
1458
0
            cr->name_len = 0;
1459
0
        }
1460
1461
2.01M
        if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1462
0
            goto block_err;
1463
2.01M
    }
1464
1465
15.6k
    return 0;
1466
1467
0
 block_err:
1468
0
    return -1;
1469
15.6k
}
1470
1471
// CRAM version >= 3.1
1472
25.0k
#define CRAM_ge31(v) ((v) >= 0x301)
1473
1474
// Returns the next cigar op code: one of the BAM_C* codes,
1475
// or -1 if no more are present.
1476
static inline
1477
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
1478
2.62k
                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
1479
3.74k
    for(;;) {
1480
8.09k
        while (*cig_len == 0) {
1481
4.38k
            if (*cig_ind < ncigar) {
1482
4.34k
                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
1483
4.34k
                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
1484
4.34k
                (*cig_ind)++;
1485
4.34k
            } else {
1486
42
                return -1;
1487
42
            }
1488
4.38k
        }
1489
1490
3.70k
        if (skip[*cig_op]) {
1491
1.12k
            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
1492
1.12k
            *cig_len = 0;
1493
1.12k
            continue;
1494
1.12k
        }
1495
1496
2.58k
        (*cig_len)--;
1497
2.58k
        break;
1498
3.70k
    }
1499
1500
2.58k
    return *cig_op;
1501
2.62k
}
1502
1503
// Ensure ref and hist are large enough.
1504
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
1505
714k
                             hts_pos_t ref_start, hts_pos_t *ref_end) {
1506
714k
    if (pos < ref_start)
1507
114
        return -1;
1508
714k
    if (pos < *ref_end)
1509
705k
        return 0;
1510
1511
    // realloc
1512
8.86k
    if (pos - ref_start > UINT_MAX)
1513
53
        return -2; // protect overflow in new_end calculation
1514
1515
8.81k
    hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
1516
8.81k
    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
1517
1518
    // Refuse to work on excessively large blocks.
1519
    // We'll just switch to referenceless encoding, which is probably better
1520
    // here as this must be very sparse data anyway.
1521
8.81k
    if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
1522
488
        return -2;
1523
1524
8.32k
    char *tmp = realloc(*ref, new_end-ref_start+1);
1525
8.32k
    if (!tmp)
1526
0
        return -1;
1527
8.32k
    *ref = tmp;
1528
1529
8.32k
    uint32_t (*tmp5)[5] = realloc(**hist,
1530
8.32k
                                  (new_end - ref_start)*sizeof(**hist));
1531
8.32k
    if (!tmp5)
1532
0
        return -1;
1533
8.32k
    *hist = tmp5;
1534
8.32k
    *ref_end = new_end;
1535
1536
    // initialise
1537
8.32k
    old_end -= ref_start;
1538
8.32k
    new_end -= ref_start;
1539
8.32k
    memset(&(*ref)[old_end],  0,  new_end-old_end);
1540
8.32k
    memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
1541
1542
8.32k
    return 0;
1543
8.32k
}
1544
1545
// Walk through MD + seq to generate ref
1546
// Returns 1 on success, <0 on failure
1547
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
1548
                              hts_pos_t ref_start, hts_pos_t *ref_end,
1549
5.98k
                              const uint8_t *MD) {
1550
5.98k
    uint8_t *seq = bam_get_seq(b);
1551
5.98k
    uint32_t *cigar = bam_get_cigar(b);
1552
5.98k
    uint32_t ncigar = b->core.n_cigar;
1553
5.98k
    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
1554
1555
    // End position of the sequence on the reference.
1556
5.98k
    hts_pos_t rlen = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1557
5.98k
    hts_pos_t rseq_end = b->core.pos + (rlen ? rlen : b->core.l_qseq);
1558
1559
    // No sequence means extend based on CIGAR instead
1560
5.98k
    if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end,
1561
3.79k
                                      ref_start, ref_end) < 0)
1562
6
        return -1;
1563
1564
5.97k
    int iseq = 0, next_op;
1565
5.97k
    hts_pos_t iref = b->core.pos - ref_start;
1566
1567
    // Skip INS, REF_SKIP, *CLIP, PAD. and BACK.
1568
5.97k
    static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
1569
6.92k
    while (iseq < b->core.l_qseq && *MD) {
1570
2.51k
        if (isdigit(*MD)) {
1571
            // match
1572
1.66k
            int overflow = 0;
1573
1.66k
            int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
1574
1.66k
            if (overflow ||
1575
1.35k
                extend_ref(ref, hist, iref+ref_start + len,
1576
1.35k
                           ref_start, ref_end) < 0)
1577
519
                return -1;
1578
1.16k
            while (iseq < b->core.l_qseq && len) {
1579
                // rewrite to have internal loops?
1580
945
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1581
945
                                             &iseq, &cig_ind, &cig_op,
1582
945
                                             &cig_len)) < 0)
1583
8
                    return -1;
1584
1585
937
                if (next_op != BAM_CMATCH &&
1586
916
                    next_op != BAM_CEQUAL) {
1587
913
                    hts_log_info("MD:Z and CIGAR are incompatible for "
1588
913
                                 "record %s", bam_get_qname(b));
1589
913
                    return -1;
1590
913
                }
1591
1592
                // Short-cut loop over same cigar op for efficiency
1593
24
                cig_len++;
1594
24
                do {
1595
24
                    cig_len--;
1596
24
                    (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
1597
24
                    iseq++;
1598
24
                    len--;
1599
24
                } while (cig_len && iseq < b->core.l_qseq && len);
1600
24
            }
1601
223
            if (len > 0)
1602
3
                return -1; // MD is longer than seq
1603
853
        } else if (*MD == '^') {
1604
            // deletion
1605
191
            MD++;
1606
1.02k
            while (isalpha(*MD)) {
1607
1.02k
                if (extend_ref(ref, hist, iref+ref_start, ref_start,
1608
1.02k
                               ref_end) < 0)
1609
0
                    return -1;
1610
1.02k
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1611
1.02k
                                             &iseq, &cig_ind, &cig_op,
1612
1.02k
                                             &cig_len)) < 0)
1613
3
                    return -1;
1614
1615
1.02k
                if (next_op != BAM_CDEL) {
1616
5
                    hts_log_info("MD:Z and CIGAR are incompatible");
1617
5
                    return -1;
1618
5
                }
1619
1620
1.01k
                (*ref)[iref++] = *MD++ & ~0x20;
1621
1.01k
            }
1622
662
        } else {
1623
            // substitution
1624
662
            if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
1625
4
                return -1;
1626
658
            if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1627
658
                                         &iseq, &cig_ind, &cig_op,
1628
658
                                         &cig_len)) < 0)
1629
31
                return -1;
1630
1631
627
            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
1632
88
                hts_log_info("MD:Z and CIGAR are incompatible");
1633
88
                return -1;
1634
88
            }
1635
1636
539
            (*ref)[iref++] = *MD++ & ~0x20;
1637
539
            iseq++;
1638
539
        }
1639
2.51k
    }
1640
1641
4.40k
    return 1;
1642
5.97k
}
1643
1644
// Append a sequence to a ref/consensus structure.
1645
// We maintain both an absolute refefence (ACGTN where MD:Z is
1646
// present) and a 5-way frequency array for when no MD:Z is known.
1647
// We then subsequently convert the 5-way frequencies to a consensus
1648
// ref in a second pass.
1649
//
1650
// Returns >=0 on success,
1651
//         -1 on failure (eg inconsistent data)
1652
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
1653
17.5k
                           hts_pos_t ref_start, hts_pos_t *ref_end) {
1654
17.5k
    const uint8_t *MD = bam_aux_get(b, "MD");
1655
17.5k
    int ret = 0;
1656
17.5k
    if (MD && *MD == 'Z') {
1657
        // We can use MD to directly compute the reference
1658
5.98k
        int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
1659
1660
5.98k
        if (ret > 0)
1661
4.40k
            return ret;
1662
5.98k
    }
1663
1664
    // Otherwise we just use SEQ+CIGAR and build a consensus which we later
1665
    // turn into a fake reference
1666
13.1k
    uint32_t *cigar = bam_get_cigar(b);
1667
13.1k
    uint32_t ncigar = b->core.n_cigar;
1668
13.1k
    uint32_t i, j;
1669
13.1k
    hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
1670
13.1k
    uint8_t *seq = bam_get_seq(b);
1671
782k
    for (i = 0; i < ncigar; i++) {
1672
769k
        switch (bam_cigar_op(cigar[i])) {
1673
20.0k
        case BAM_CSOFT_CLIP:
1674
31.2k
        case BAM_CINS:
1675
31.2k
            iseq += bam_cigar_oplen(cigar[i]);
1676
31.2k
            break;
1677
1678
690k
        case BAM_CMATCH:
1679
700k
        case BAM_CEQUAL:
1680
700k
        case BAM_CDIFF: {
1681
700k
            int len = bam_cigar_oplen(cigar[i]);
1682
            // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
1683
700k
            static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
1684
1685
700k
            if (extend_ref(ref, hist, iref+ref_start + len,
1686
700k
                           ref_start, ref_end) < 0)
1687
277
                return -1;
1688
699k
            if (iseq + len <= b->core.l_qseq) {
1689
                // Nullify failed MD:Z if appropriate
1690
8.11k
                if (ret < 0)
1691
0
                    memset(&(*ref)[iref], 0, len);
1692
1693
11.2k
                for (j = 0; j < len; j++, iref++, iseq++)
1694
3.12k
                    (*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
1695
691k
            } else {
1696
                // Probably a 2ndary read with seq "*"
1697
691k
                iseq += len;
1698
691k
                iref += len;
1699
691k
            }
1700
699k
            break;
1701
700k
        }
1702
1703
10.8k
        case BAM_CDEL:
1704
11.7k
        case BAM_CREF_SKIP:
1705
11.7k
            iref += bam_cigar_oplen(cigar[i]);
1706
769k
        }
1707
769k
    }
1708
1709
12.8k
    return 1;
1710
13.1k
}
1711
1712
// Automatically generates the reference and stashed it in c->ref, also
1713
// setting c->ref_start and c->ref_end.
1714
//
1715
// If we have MD:Z tags then we use them to directly infer the reference,
1716
// along with SEQ + CIGAR.  Otherwise we use SEQ/CIGAR only to build up
1717
// a consensus and then assume the reference as the majority rule.
1718
//
1719
// In this latter scenario we need to be wary of auto-generating MD and NM
1720
// during decode, but that's handled elsewhere via an additional aux tag.
1721
//
1722
// Returns 0 on success,
1723
//        -1 on failure
1724
7.30k
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
1725
    // TODO: if we can find an external reference then use it, even if the
1726
    // user told us to do embed_ref=2.
1727
7.30k
    char *ref = NULL;
1728
7.30k
    uint32_t (*hist)[5] = NULL;
1729
7.30k
    hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
1730
7.30k
    if (ref_start < 0)
1731
3
        return -1; // cannot build consensus from unmapped data
1732
1733
    // initial allocation
1734
7.30k
    if (extend_ref(&ref, &hist,
1735
7.30k
                   c->bams[r1 + s->hdr->num_records-1]->core.pos +
1736
7.30k
                   c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
1737
7.30k
                   ref_start, &ref_end) < 0)
1738
161
        return -1;
1739
1740
    // Add each bam file to the reference/consensus arrays
1741
7.14k
    int r2;
1742
7.14k
    hts_pos_t last_pos = -1;
1743
24.4k
    for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1744
17.5k
        if (c->bams[r1]->core.pos < last_pos) {
1745
48
            hts_log_error("Cannot build reference with unsorted data");
1746
48
            goto err;
1747
48
        }
1748
17.5k
        last_pos = c->bams[r1]->core.pos;
1749
17.5k
        if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
1750
277
            goto err;
1751
17.5k
    }
1752
1753
    // Compute the consensus
1754
6.81k
    hts_pos_t i;
1755
1.04G
    for (i = 0; i < ref_end-ref_start; i++) {
1756
1.04G
        if (!ref[i]) {
1757
1.04G
            int max_v = 0, max_j = 4, j;
1758
5.20G
            for (j = 0; j < 4; j++)
1759
                // don't call N (j==4) unless no coverage
1760
4.16G
                if (max_v < hist[i][j])
1761
665
                    max_v = hist[i][j], max_j = j;
1762
1.04G
            ref[i] = "ACGTN"[max_j];
1763
1.04G
        }
1764
1.04G
    }
1765
6.81k
    free(hist);
1766
1767
    // Put the reference in place so it appears to be an external
1768
    // ref file.
1769
6.81k
    c->ref       = ref;
1770
6.81k
    c->ref_start = ref_start+1;
1771
6.81k
    c->ref_end   = ref_end+1;
1772
6.81k
    c->ref_free  = 1;
1773
6.81k
    return 0;
1774
1775
325
 err:
1776
325
    free(ref);
1777
325
    free(hist);
1778
325
    return -1;
1779
7.14k
}
1780
1781
// Check if the SQ M5 tag matches the reference we've loaded.
1782
0
static int validate_md5(cram_fd *fd, int ref_id) {
1783
0
    if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
1784
0
        return 0;
1785
1786
    // Have we already checked this ref?
1787
0
    if (fd->refs->ref_id[ref_id]->validated_md5)
1788
0
        return 0;
1789
1790
    // Check if we have the MD5 known.
1791
    // We should, but maybe we're using embedded references?
1792
0
    sam_hrecs_t *hrecs = fd->header->hrecs;
1793
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
1794
0
                                                 hrecs->ref[ref_id].name);
1795
0
    if (!ty)
1796
0
        return 0;
1797
1798
0
    sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
1799
0
    if (!m5tag)
1800
0
        return 0;
1801
1802
    // It's known, so compute md5 on the loaded reference sequence.
1803
0
    char *ref = fd->refs->ref_id[ref_id]->seq;
1804
0
    int64_t len = fd->refs->ref_id[ref_id]->length;
1805
0
    hts_md5_context *md5;
1806
0
    char unsigned buf[16];
1807
0
    char buf2[33];
1808
1809
0
    if (!(md5 = hts_md5_init()))
1810
0
        return -1;
1811
0
    hts_md5_update(md5, ref, len);
1812
0
    hts_md5_final(buf, md5);
1813
0
    hts_md5_destroy(md5);
1814
0
    hts_md5_hex(buf2, buf);
1815
1816
    // Compare it to header @SQ M5 tag
1817
0
    if (strcmp(m5tag->str+3, buf2)) {
1818
0
        hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
1819
0
                      hrecs->ref[ref_id].name);
1820
0
        hts_log_error("Please use the correct reference, or "
1821
0
                      "consider using embed_ref=2");
1822
0
        return -1;
1823
0
    }
1824
0
    fd->refs->ref_id[ref_id]->validated_md5 = 1;
1825
1826
0
    return 0;
1827
0
}
1828
1829
/*
1830
 * Encodes all slices in a container into blocks.
1831
 * Returns 0 on success
1832
 *        -1 on failure
1833
 */
1834
15.8k
int cram_encode_container(cram_fd *fd, cram_container *c) {
1835
15.8k
    int i, j, slice_offset;
1836
15.8k
    cram_block_compression_hdr *h = c->comp_hdr;
1837
15.8k
    cram_block *c_hdr;
1838
15.8k
    int multi_ref = 0;
1839
15.8k
    int r1, r2, sn, nref, embed_ref, no_ref;
1840
15.8k
    spare_bams *spares;
1841
1842
15.8k
    if (!c->bams)
1843
0
        goto err;
1844
1845
15.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1)
1846
0
        goto err;
1847
1848
//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1849
15.8k
#define goto_err goto err
1850
1851
    // Don't try embed ref if we repeatedly fail
1852
15.8k
    pthread_mutex_lock(&fd->ref_lock);
1853
15.8k
    int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
1854
15.8k
    if (!failed_embed && c->embed_ref == -2 && c->ref_id >= 0) {
1855
286
        hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
1856
286
        fd->no_ref = c->no_ref = 0;
1857
286
        fd->embed_ref = c->embed_ref = 2;
1858
15.6k
    } else if (failed_embed && c->embed_ref == -2) {
1859
        // We've tried several times, so this time give up for good
1860
8
        hts_log_warning("Keeping non-ref mode from now on");
1861
8
        fd->embed_ref = c->embed_ref = 0;
1862
8
    }
1863
15.8k
    pthread_mutex_unlock(&fd->ref_lock);
1864
1865
16.3k
 restart:
1866
    /* Cache references up-front if we have unsorted access patterns */
1867
16.3k
    pthread_mutex_lock(&fd->ref_lock);
1868
16.3k
    nref = fd->refs->nref;
1869
16.3k
    pthread_mutex_unlock(&fd->ref_lock);
1870
16.3k
    embed_ref = c->embed_ref;
1871
16.3k
    no_ref = c->no_ref;
1872
1873
    /* To create M5 strings */
1874
    /* Fetch reference sequence */
1875
16.3k
    if (!no_ref) {
1876
15.7k
        if (!c->bams || !c->curr_c_rec || !c->bams[0])
1877
27
            goto_err;
1878
15.7k
        bam_seq_t *b = c->bams[0];
1879
1880
15.7k
        if (embed_ref <= 1) {
1881
1.86k
            char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1882
1.86k
            if (!ref && bam_ref(b) >= 0) {
1883
3
                if (!c->pos_sorted) {
1884
                    // TODO: maybe also check fd->no_ref?
1885
0
                    hts_log_warning("Failed to load reference #%d",
1886
0
                                    bam_ref(b));
1887
0
                    hts_log_warning("Switching to non-ref mode");
1888
1889
0
                    pthread_mutex_lock(&fd->ref_lock);
1890
0
                    c->embed_ref = fd->embed_ref = 0;
1891
0
                    c->no_ref = fd->no_ref = 1;
1892
0
                    pthread_mutex_unlock(&fd->ref_lock);
1893
0
                    goto restart;
1894
0
                }
1895
1896
3
                if (c->multi_seq || embed_ref == 0) {
1897
0
                    hts_log_error("Failed to load reference #%d", bam_ref(b));
1898
0
                    return -1;
1899
0
                }
1900
3
                hts_log_warning("Failed to load reference #%d", bam_ref(b));
1901
3
                hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
1902
3
                                " reference");
1903
3
                if (embed_ref <= 0)
1904
3
                    hts_log_warning("NOTE: the CRAM file will be bigger than"
1905
3
                                    " using an external reference");
1906
3
                pthread_mutex_lock(&fd->ref_lock);
1907
3
                embed_ref = c->embed_ref = fd->embed_ref = 2;
1908
3
                pthread_mutex_unlock(&fd->ref_lock);
1909
3
                goto auto_ref;
1910
1.85k
            } else if (ref) {
1911
0
                if (validate_md5(fd, c->ref_seq_id) < 0)
1912
0
                    goto_err;
1913
0
            }
1914
1.85k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1915
0
                c->ref_seq_id = c->ref_id;
1916
0
                c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1917
0
                c->ref_start = 1;
1918
0
                c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1919
0
            }
1920
13.8k
        } else {
1921
13.8k
        auto_ref:
1922
            // Auto-embed ref.
1923
            // This starts as 'N' and is amended on-the-fly as we go
1924
            // based on MD:Z tags.
1925
13.8k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1926
7.30k
                c->ref = NULL;
1927
                // c->ref_free is boolean; whether to free c->ref.  In this
1928
                // case c->ref will be our auto-embedded sequence instead of
1929
                // a "global" portion of reference from fd->refs.
1930
                // Do not confuse with fd->ref_free which is a pointer to a
1931
                // reference string to free.
1932
7.30k
                c->ref_free = 1;
1933
7.30k
            } else {
1934
                // Double check for broken input.  We shouldn't have
1935
                // embedded references enabled for unmapped data, but our
1936
                // data could be broken.
1937
6.56k
                embed_ref = 0;
1938
6.56k
                no_ref = c->no_ref = 1;
1939
6.56k
            }
1940
13.8k
        }
1941
15.7k
        c->ref_seq_id = c->ref_id;
1942
15.7k
    } else {
1943
626
        c->ref_id = bam_ref(c->bams[0]);
1944
626
        cram_ref_incr(fd->refs, c->ref_id);
1945
626
        c->ref_seq_id = c->ref_id;
1946
626
    }
1947
1948
16.3k
    if (!no_ref && c->refs_used) {
1949
399
        for (i = 0; i < nref; i++) {
1950
227
            if (c->refs_used[i]) {
1951
152
                if (cram_get_ref(fd, i, 1, 0)) {
1952
0
                    if (validate_md5(fd, i) < 0)
1953
0
                        goto_err;
1954
152
                } else {
1955
152
                    hts_log_warning("Failed to find reference, "
1956
152
                                    "switching to non-ref mode");
1957
152
                    no_ref = c->no_ref = 1;
1958
152
                }
1959
152
            }
1960
227
        }
1961
172
    }
1962
1963
    /* Turn bams into cram_records and gather basic stats */
1964
32.0k
    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1965
16.3k
        cram_slice *s = c->slices[sn];
1966
16.3k
        int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1967
1968
16.3k
        int r1_start = r1;
1969
1970
16.3k
        assert(sn < c->curr_slice);
1971
1972
        // Discover which read names *may* be safely removed.
1973
        // Ie which ones have all their records in this slice.
1974
16.3k
        if (lossy_read_names(fd, c, s, r1_start) != 0)
1975
0
            return -1;
1976
1977
        // Tracking of MD tags so we can spot when the auto-generated values
1978
        // will differ from the current stored ones.  The kstring here is
1979
        // simply to avoid excessive malloc and free calls.  All initialisation
1980
        // is done within process_one_read().
1981
16.3k
        kstring_t MD = {0};
1982
1983
        // Embed consensus / MD-generated ref
1984
16.3k
        if (embed_ref == 2) {
1985
7.30k
            if (c->ref_id < 0 || cram_generate_reference(c, s, r1) < 0) {
1986
                // Should this be a permanent thing via fd->no_ref?
1987
                // Doing so means we cannot easily switch back again should
1988
                // things fix themselves later on.  This is likely not a
1989
                // concern though as failure to generate a reference implies
1990
                // unsorted data which is rarely recovered from.
1991
1992
                // Only if sn == 0.  We're hosed if we're on the 2nd slice and
1993
                // the first worked, as no-ref is a container global param.
1994
489
                if (sn > 0) {
1995
0
                    hts_log_error("Failed to build reference, "
1996
0
                                  "switching to non-ref mode");
1997
0
                    return -1;
1998
489
                } else {
1999
489
                    hts_log_warning("Failed to build reference, "
2000
489
                                    "switching to non-ref mode");
2001
489
                }
2002
489
                pthread_mutex_lock(&fd->ref_lock);
2003
489
                c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
2004
489
                c->no_ref = fd->no_ref = 1;
2005
489
                fd->no_ref_counter++; // more likely to keep permanent action
2006
489
                pthread_mutex_unlock(&fd->ref_lock);
2007
489
                failed_embed = 1;
2008
489
                goto restart;
2009
6.81k
            } else {
2010
6.81k
                pthread_mutex_lock(&fd->ref_lock);
2011
6.81k
                fd->no_ref_counter -= (fd->no_ref_counter > 0);
2012
6.81k
                pthread_mutex_unlock(&fd->ref_lock);
2013
6.81k
            }
2014
2015
6.81k
            if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
2016
6.32k
                c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
2017
6.81k
        }
2018
2019
        // Iterate through records creating the cram blocks for some
2020
        // fields and just gathering stats for others.
2021
2.03M
        for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
2022
2.01M
            cram_record *cr = &s->crecs[r2];
2023
2.01M
            bam_seq_t *b = c->bams[r1];
2024
2025
            /* If multi-ref we need to cope with changing reference per seq */
2026
2.01M
            if (c->multi_seq && !no_ref) {
2027
593
                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
2028
0
                    if (c->ref_seq_id >= 0)
2029
0
                        cram_ref_decr(fd->refs, c->ref_seq_id);
2030
2031
0
                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
2032
0
                        hts_log_error("Failed to load reference #%d", bam_ref(b));
2033
0
                        free(MD.s);
2034
0
                        return -1;
2035
0
                    }
2036
0
                    if (validate_md5(fd, bam_ref(b)) < 0)
2037
0
                        return -1;
2038
2039
0
                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
2040
0
                    if (!fd->refs->ref_id[c->ref_seq_id]->seq)
2041
0
                        return -1;
2042
0
                    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
2043
0
                    c->ref_start = 1;
2044
0
                    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
2045
0
                }
2046
593
            }
2047
2048
2.01M
            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
2049
2.01M
                                 no_ref) != 0) {
2050
178
                free(MD.s);
2051
178
                return -1;
2052
178
            }
2053
2054
2.01M
            if (first_base > cr->apos)
2055
16.1k
                first_base = cr->apos;
2056
2057
2.01M
            if (last_base < cr->aend)
2058
15.8k
                last_base = cr->aend;
2059
2.01M
        }
2060
2061
15.6k
        free(MD.s);
2062
2063
        // Process_one_read doesn't add read names as it can change
2064
        // its mind during the loop on the CRAM_FLAG_DETACHED setting
2065
        // of earlier records (if it detects the auto-generation of
2066
        // TLEN is incorrect).  This affects which read-names can be
2067
        // lossily compressed, so we do these in another pass.
2068
15.6k
        if (add_read_names(fd, c, s, r1_start) < 0)
2069
0
            return -1;
2070
2071
15.6k
        if (c->multi_seq) {
2072
283
            s->hdr->ref_seq_id    = -2;
2073
283
            s->hdr->ref_seq_start = 0;
2074
283
            s->hdr->ref_seq_span  = 0;
2075
15.4k
        } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
2076
            // Spec states span=0, but it broke our range queries.
2077
            // See commit message for this and prior.
2078
8.31k
            s->hdr->ref_seq_id    = -1;
2079
8.31k
            s->hdr->ref_seq_start = 0;
2080
8.31k
            s->hdr->ref_seq_span  = 0;
2081
8.31k
        } else {
2082
7.08k
            s->hdr->ref_seq_id    = c->ref_id;
2083
7.08k
            s->hdr->ref_seq_start = first_base;
2084
7.08k
            s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
2085
7.08k
        }
2086
15.6k
        s->hdr->num_records = r2;
2087
2088
        // Processed a slice, now stash the aux blocks so the next
2089
        // slice can start aggregating them from the start again.
2090
15.6k
        if (c->tags_used->n_occupied) {
2091
12.1k
            int ntags = c->tags_used->n_occupied;
2092
12.1k
            s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
2093
12.1k
            if (!s->aux_block)
2094
0
                return -1;
2095
2096
12.1k
            khint_t k;
2097
2098
12.1k
            s->naux_block = 0;
2099
133k
            for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2100
121k
                if (!kh_exist(c->tags_used, k))
2101
62.6k
                    continue;
2102
2103
59.1k
                cram_tag_map *tm = kh_val(c->tags_used, k);
2104
59.1k
                if (!tm) goto_err;
2105
59.1k
                if (!tm->blk) continue;
2106
59.1k
                s->aux_block[s->naux_block++] = tm->blk;
2107
59.1k
                tm->blk = NULL;
2108
59.1k
                if (!tm->blk2) continue;
2109
0
                s->aux_block[s->naux_block++] = tm->blk2;
2110
0
                tm->blk2 = NULL;
2111
0
            }
2112
12.1k
            assert(s->naux_block <= 2*c->tags_used->n_occupied);
2113
12.1k
        }
2114
15.6k
    }
2115
2116
15.6k
    if (c->multi_seq && !no_ref) {
2117
20
        if (c->ref_seq_id >= 0)
2118
0
            cram_ref_decr(fd->refs, c->ref_seq_id);
2119
20
    }
2120
2121
    /* Link our bams[] array onto the spare bam list for reuse */
2122
15.6k
    spares = malloc(sizeof(*spares));
2123
15.6k
    if (!spares) goto_err;
2124
15.6k
    pthread_mutex_lock(&fd->bam_list_lock);
2125
15.6k
    spares->bams = c->bams;
2126
15.6k
    spares->next = fd->bl;
2127
15.6k
    fd->bl = spares;
2128
15.6k
    pthread_mutex_unlock(&fd->bam_list_lock);
2129
15.6k
    c->bams = NULL;
2130
2131
    /* Detect if a multi-seq container */
2132
15.6k
    cram_stats_encoding(fd, c->stats[DS_RI]);
2133
15.6k
    multi_ref = c->stats[DS_RI]->nvals > 1;
2134
15.6k
    pthread_mutex_lock(&fd->metrics_lock);
2135
15.6k
    fd->last_RI_count = c->stats[DS_RI]->nvals;
2136
15.6k
    pthread_mutex_unlock(&fd->metrics_lock);
2137
2138
2139
15.6k
    if (multi_ref) {
2140
124
        hts_log_info("Multi-ref container");
2141
124
        c->ref_seq_id = -2;
2142
124
        c->ref_seq_start = 0;
2143
124
        c->ref_seq_span = 0;
2144
124
    }
2145
2146
2147
    /* Compute MD5s */
2148
15.6k
    no_ref = c->no_ref;
2149
15.6k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
2150
2151
31.3k
    for (i = 0; i < c->curr_slice; i++) {
2152
15.6k
        cram_slice *s = c->slices[i];
2153
2154
15.6k
        if (CRAM_MAJOR_VERS(fd->version) != 1) {
2155
15.6k
            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
2156
6.73k
                hts_md5_context *md5 = hts_md5_init();
2157
6.73k
                if (!md5)
2158
0
                    return -1;
2159
6.73k
                hts_md5_update(md5,
2160
6.73k
                               c->ref + s->hdr->ref_seq_start - c->ref_start,
2161
6.73k
                               s->hdr->ref_seq_span);
2162
6.73k
                hts_md5_final(s->hdr->md5, md5);
2163
6.73k
                hts_md5_destroy(md5);
2164
8.95k
            } else {
2165
8.95k
                memset(s->hdr->md5, 0, 16);
2166
8.95k
            }
2167
15.6k
        }
2168
15.6k
    }
2169
2170
15.6k
    c->num_records = 0;
2171
15.6k
    c->num_blocks = 1; // cram_block_compression_hdr
2172
15.6k
    c->length = 0;
2173
2174
    //fprintf(stderr, "=== BF ===\n");
2175
15.6k
    h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
2176
15.6k
                                         c->stats[DS_BF], E_INT, NULL,
2177
15.6k
                                         fd->version, &fd->vv);
2178
15.6k
    if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
2179
2180
    //fprintf(stderr, "=== CF ===\n");
2181
15.6k
    h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
2182
15.6k
                                         c->stats[DS_CF], E_INT, NULL,
2183
15.6k
                                         fd->version, &fd->vv);
2184
15.6k
    if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
2185
2186
    //fprintf(stderr, "=== RN ===\n");
2187
    //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
2188
    //                                     c->stats[DS_RN], E_BYTE_ARRAY, NULL,
2189
    //                                     fd->version);
2190
2191
    //fprintf(stderr, "=== AP ===\n");
2192
15.6k
    if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
2193
15.0k
        if (c->pos_sorted)
2194
15.0k
            h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
2195
15.0k
                                                 c->stats[DS_AP],
2196
15.0k
                                                 is_v4 ? E_LONG : E_INT,
2197
15.0k
                                                 NULL, fd->version, &fd->vv);
2198
0
        else
2199
            // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
2200
0
            h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
2201
0
                                                       : E_EXTERNAL,
2202
0
                                                 NULL,
2203
0
                                                 is_v4 ? E_LONG : E_INT,
2204
0
                                                 NULL, fd->version, &fd->vv);
2205
15.0k
    } else {
2206
        // Removed BETA in v4.0.
2207
        // Should we consider dropping use of it for 3.0 too?
2208
615
        hts_pos_t p[2] = {0, c->max_apos};
2209
615
        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
2210
615
                                             is_v4 ? E_LONG : E_INT,
2211
615
                                             p, fd->version, &fd->vv);
2212
//      cram_xdelta_encoder e;
2213
//      e.word_size = is_v4 ? 8 : 4;
2214
//      e.sub_encoding = E_EXTERNAL;
2215
//      e.sub_codec_dat = (void *)DS_AP;
2216
//
2217
//      h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
2218
//                                           is_v4 ? E_LONG : E_INT,
2219
//                                           &e, fd->version, &fd->vv);
2220
615
    }
2221
15.6k
    if (!h->codecs[DS_AP]) goto_err;
2222
2223
    //fprintf(stderr, "=== RG ===\n");
2224
15.6k
    h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
2225
15.6k
                                         c->stats[DS_RG],
2226
15.6k
                                         E_INT,
2227
15.6k
                                         NULL,
2228
15.6k
                                         fd->version, &fd->vv);
2229
15.6k
    if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
2230
2231
    //fprintf(stderr, "=== MQ ===\n");
2232
15.6k
    h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
2233
15.6k
                                         c->stats[DS_MQ], E_INT, NULL,
2234
15.6k
                                         fd->version, &fd->vv);
2235
15.6k
    if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
2236
2237
    //fprintf(stderr, "=== NS ===\n");
2238
15.6k
    h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
2239
15.6k
                                         c->stats[DS_NS], E_INT, NULL,
2240
15.6k
                                         fd->version, &fd->vv);
2241
15.6k
    if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
2242
2243
    //fprintf(stderr, "=== MF ===\n");
2244
15.6k
    h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
2245
15.6k
                                         c->stats[DS_MF], E_INT, NULL,
2246
15.6k
                                         fd->version, &fd->vv);
2247
15.6k
    if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
2248
2249
    //fprintf(stderr, "=== TS ===\n");
2250
15.6k
    h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
2251
15.6k
                                         c->stats[DS_TS],
2252
15.6k
                                         is_v4 ? E_LONG : E_INT,
2253
15.6k
                                         NULL, fd->version, &fd->vv);
2254
15.6k
    if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
2255
2256
    //fprintf(stderr, "=== NP ===\n");
2257
15.6k
    h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
2258
15.6k
                                         c->stats[DS_NP],
2259
15.6k
                                         is_v4 ? E_LONG : E_INT,
2260
15.6k
                                         NULL, fd->version, &fd->vv);
2261
15.6k
    if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
2262
2263
    //fprintf(stderr, "=== NF ===\n");
2264
15.6k
    h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
2265
15.6k
                                         c->stats[DS_NF], E_INT, NULL,
2266
15.6k
                                         fd->version, &fd->vv);
2267
15.6k
    if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
2268
2269
    //fprintf(stderr, "=== RL ===\n");
2270
15.6k
    h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
2271
15.6k
                                         c->stats[DS_RL], E_INT, NULL,
2272
15.6k
                                         fd->version, &fd->vv);
2273
15.6k
    if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
2274
2275
    //fprintf(stderr, "=== FN ===\n");
2276
15.6k
    h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
2277
15.6k
                                         c->stats[DS_FN], E_INT, NULL,
2278
15.6k
                                         fd->version, &fd->vv);
2279
15.6k
    if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
2280
2281
    //fprintf(stderr, "=== FC ===\n");
2282
15.6k
    h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
2283
15.6k
                                         c->stats[DS_FC], E_BYTE, NULL,
2284
15.6k
                                         fd->version, &fd->vv);
2285
15.6k
    if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
2286
2287
    //fprintf(stderr, "=== FP ===\n");
2288
15.6k
    h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
2289
15.6k
                                         c->stats[DS_FP], E_INT, NULL,
2290
15.6k
                                         fd->version, &fd->vv);
2291
15.6k
    if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
2292
2293
    //fprintf(stderr, "=== DL ===\n");
2294
15.6k
    h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
2295
15.6k
                                         c->stats[DS_DL], E_INT, NULL,
2296
15.6k
                                         fd->version, &fd->vv);
2297
15.6k
    if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
2298
2299
    //fprintf(stderr, "=== BA ===\n");
2300
15.6k
    h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
2301
15.6k
                                         c->stats[DS_BA], E_BYTE, NULL,
2302
15.6k
                                         fd->version, &fd->vv);
2303
15.6k
    if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
2304
2305
15.6k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2306
15.6k
        cram_byte_array_len_encoder e;
2307
2308
15.6k
        e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2309
15.6k
            ? E_VARINT_UNSIGNED
2310
15.6k
            : E_EXTERNAL;
2311
15.6k
        e.len_dat = (void *)DS_BB_len;
2312
        //e.len_dat = (void *)DS_BB;
2313
2314
15.6k
        e.val_encoding = E_EXTERNAL;
2315
15.6k
        e.val_dat = (void *)DS_BB;
2316
2317
15.6k
        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2318
15.6k
                                             E_BYTE_ARRAY, (void *)&e,
2319
15.6k
                                             fd->version, &fd->vv);
2320
15.6k
        if (!h->codecs[DS_BB]) goto_err;
2321
15.6k
    } else {
2322
0
        h->codecs[DS_BB] = NULL;
2323
0
    }
2324
2325
    //fprintf(stderr, "=== BS ===\n");
2326
15.6k
    h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
2327
15.6k
                                         c->stats[DS_BS], E_BYTE, NULL,
2328
15.6k
                                         fd->version, &fd->vv);
2329
15.6k
    if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
2330
2331
15.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
2332
0
        h->codecs[DS_TL] = NULL;
2333
0
        h->codecs[DS_RI] = NULL;
2334
0
        h->codecs[DS_RS] = NULL;
2335
0
        h->codecs[DS_PD] = NULL;
2336
0
        h->codecs[DS_HC] = NULL;
2337
0
        h->codecs[DS_SC] = NULL;
2338
2339
        //fprintf(stderr, "=== TC ===\n");
2340
0
        h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
2341
0
                                             c->stats[DS_TC], E_BYTE, NULL,
2342
0
                                             fd->version, &fd->vv);
2343
0
        if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
2344
2345
        //fprintf(stderr, "=== TN ===\n");
2346
0
        h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
2347
0
                                             c->stats[DS_TN], E_INT, NULL,
2348
0
                                             fd->version, &fd->vv);
2349
0
        if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
2350
15.6k
    } else {
2351
15.6k
        h->codecs[DS_TC] = NULL;
2352
15.6k
        h->codecs[DS_TN] = NULL;
2353
2354
        //fprintf(stderr, "=== TL ===\n");
2355
15.6k
        h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
2356
15.6k
                                             c->stats[DS_TL], E_INT, NULL,
2357
15.6k
                                             fd->version, &fd->vv);
2358
15.6k
        if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
2359
2360
2361
        //fprintf(stderr, "=== RI ===\n");
2362
15.6k
        h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
2363
15.6k
                                             c->stats[DS_RI], E_INT, NULL,
2364
15.6k
                                             fd->version, &fd->vv);
2365
15.6k
        if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
2366
2367
        //fprintf(stderr, "=== RS ===\n");
2368
15.6k
        h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
2369
15.6k
                                             c->stats[DS_RS], E_INT, NULL,
2370
15.6k
                                             fd->version, &fd->vv);
2371
15.6k
        if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
2372
2373
        //fprintf(stderr, "=== PD ===\n");
2374
15.6k
        h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
2375
15.6k
                                             c->stats[DS_PD], E_INT, NULL,
2376
15.6k
                                             fd->version, &fd->vv);
2377
15.6k
        if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
2378
2379
        //fprintf(stderr, "=== HC ===\n");
2380
15.6k
        h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
2381
15.6k
                                             c->stats[DS_HC], E_INT, NULL,
2382
15.6k
                                             fd->version, &fd->vv);
2383
15.6k
        if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
2384
2385
        //fprintf(stderr, "=== SC ===\n");
2386
15.6k
        if (1) {
2387
15.6k
            int i2[2] = {0, DS_SC};
2388
2389
15.6k
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2390
15.6k
                                                 E_BYTE_ARRAY, (void *)i2,
2391
15.6k
                                                 fd->version, &fd->vv);
2392
15.6k
        } else {
2393
            // Appears to be no practical benefit to using this method,
2394
            // but it may work better if we start mixing SC, IN and BB
2395
            // elements into the same external block.
2396
0
            cram_byte_array_len_encoder e;
2397
2398
0
            e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2399
0
                ? E_VARINT_UNSIGNED
2400
0
                : E_EXTERNAL;
2401
0
            e.len_dat = (void *)DS_SC_len;
2402
2403
0
            e.val_encoding = E_EXTERNAL;
2404
0
            e.val_dat = (void *)DS_SC;
2405
2406
0
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2407
0
                                                 E_BYTE_ARRAY, (void *)&e,
2408
0
                                                 fd->version, &fd->vv);
2409
0
        }
2410
15.6k
        if (!h->codecs[DS_SC]) goto_err;
2411
15.6k
    }
2412
2413
    //fprintf(stderr, "=== IN ===\n");
2414
15.6k
    {
2415
15.6k
        int i2[2] = {0, DS_IN};
2416
15.6k
        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2417
15.6k
                                             E_BYTE_ARRAY, (void *)i2,
2418
15.6k
                                             fd->version, &fd->vv);
2419
15.6k
        if (!h->codecs[DS_IN]) goto_err;
2420
15.6k
    }
2421
2422
15.6k
    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
2423
15.6k
                                         (void *)DS_QS,
2424
15.6k
                                         fd->version, &fd->vv);
2425
15.6k
    if (!h->codecs[DS_QS]) goto_err;
2426
15.6k
    {
2427
15.6k
        int i2[2] = {0, DS_RN};
2428
15.6k
        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2429
15.6k
                                             E_BYTE_ARRAY, (void *)i2,
2430
15.6k
                                             fd->version, &fd->vv);
2431
15.6k
        if (!h->codecs[DS_RN]) goto_err;
2432
15.6k
    }
2433
2434
2435
    /* Encode slices */
2436
31.1k
    for (i = 0; i < c->curr_slice; i++) {
2437
15.6k
        hts_log_info("Encode slice %d", i);
2438
2439
15.6k
        int local_embed_ref =
2440
15.6k
            embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
2441
15.6k
        if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
2442
153
            return -1;
2443
15.6k
    }
2444
2445
    /* Create compression header */
2446
15.4k
    {
2447
15.4k
        h->ref_seq_id    = c->ref_seq_id;
2448
15.4k
        h->ref_seq_start = c->ref_seq_start;
2449
15.4k
        h->ref_seq_span  = c->ref_seq_span;
2450
15.4k
        h->num_records   = c->num_records;
2451
15.4k
        h->qs_seq_orient = c->qs_seq_orient;
2452
        // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
2453
15.4k
        h->AP_delta      = c->pos_sorted;
2454
15.4k
        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
2455
2456
15.4k
        if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
2457
0
            return -1;
2458
15.4k
    }
2459
2460
    /* Compute landmarks */
2461
    /* Fill out slice landmarks */
2462
15.4k
    c->num_landmarks = c->curr_slice;
2463
15.4k
    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
2464
15.4k
    if (!c->landmark)
2465
0
        return -1;
2466
2467
    /*
2468
     * Slice offset starts after the first block, so we need to simulate
2469
     * writing it to work out the correct offset
2470
     */
2471
15.4k
    {
2472
15.4k
        slice_offset = c_hdr->method == RAW
2473
15.4k
            ? c_hdr->uncomp_size
2474
15.4k
            : c_hdr->comp_size;
2475
15.4k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2476
15.4k
            fd->vv.varint_size(c_hdr->content_id) +
2477
15.4k
            fd->vv.varint_size(c_hdr->comp_size) +
2478
15.4k
            fd->vv.varint_size(c_hdr->uncomp_size);
2479
15.4k
    }
2480
2481
15.4k
    c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
2482
15.4k
    if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
2483
        // Spec states span=0, but it broke our range queries.
2484
        // See commit message for this and prior.
2485
8.31k
        c->ref_seq_start = 0;
2486
8.31k
        c->ref_seq_span  = 0;
2487
8.31k
    } else {
2488
7.16k
        c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
2489
7.16k
        c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
2490
7.16k
    }
2491
30.9k
    for (i = 0; i < c->curr_slice; i++) {
2492
15.4k
        cram_slice *s = c->slices[i];
2493
2494
15.4k
        c->num_blocks += s->hdr->num_blocks + 1; // slice header
2495
15.4k
        c->landmark[i] = slice_offset;
2496
2497
15.4k
        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
2498
15.4k
            c->ref_seq_start + c->ref_seq_span) {
2499
0
            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
2500
0
                - c->ref_seq_start;
2501
0
        }
2502
2503
15.4k
        slice_offset += s->hdr_block->method == RAW
2504
15.4k
            ? s->hdr_block->uncomp_size
2505
15.4k
            : s->hdr_block->comp_size;
2506
2507
15.4k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2508
15.4k
            fd->vv.varint_size(s->hdr_block->content_id) +
2509
15.4k
            fd->vv.varint_size(s->hdr_block->comp_size) +
2510
15.4k
            fd->vv.varint_size(s->hdr_block->uncomp_size);
2511
2512
128k
        for (j = 0; j < s->hdr->num_blocks; j++) {
2513
113k
            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2514
113k
                fd->vv.varint_size(s->block[j]->content_id) +
2515
113k
                fd->vv.varint_size(s->block[j]->comp_size) +
2516
113k
                fd->vv.varint_size(s->block[j]->uncomp_size);
2517
2518
113k
            slice_offset += s->block[j]->method == RAW
2519
113k
                ? s->block[j]->uncomp_size
2520
113k
                : s->block[j]->comp_size;
2521
113k
        }
2522
15.4k
    }
2523
15.4k
    c->length += slice_offset; // just past the final slice
2524
2525
15.4k
    c->comp_hdr_block = c_hdr;
2526
2527
15.4k
    if (c->ref_seq_id >= 0) {
2528
6.88k
        if (c->ref_free) {
2529
6.85k
            free(c->ref);
2530
6.85k
            c->ref = NULL;
2531
6.85k
        } else {
2532
27
            cram_ref_decr(fd->refs, c->ref_seq_id);
2533
27
        }
2534
6.88k
    }
2535
2536
    /* Cache references up-front if we have unsorted access patterns */
2537
15.4k
    if (!no_ref && c->refs_used) {
2538
46
        for (i = 0; i < fd->refs->nref; i++) {
2539
26
            if (c->refs_used[i])
2540
0
                cram_ref_decr(fd->refs, i);
2541
26
        }
2542
20
    }
2543
2544
15.4k
    return 0;
2545
2546
78
 err:
2547
78
    return -1;
2548
15.4k
}
2549
2550
2551
/*
2552
 * Adds a feature code to a read within a slice. For purposes of minimising
2553
 * memory allocations and fragmentation we have one array of features for all
2554
 * reads within the slice. We return the index into this array for this new
2555
 * feature.
2556
 *
2557
 * Returns feature index on success
2558
 *         -1 on failure.
2559
 */
2560
static int cram_add_feature(cram_container *c, cram_slice *s,
2561
37.4k
                            cram_record *r, cram_feature *f) {
2562
37.4k
    if (s->nfeatures >= s->afeatures) {
2563
6.38k
        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
2564
6.38k
        s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
2565
6.38k
        if (!s->features)
2566
0
            return -1;
2567
6.38k
    }
2568
2569
37.4k
    if (!r->nfeature++) {
2570
18.5k
        r->feature = s->nfeatures;
2571
18.5k
        if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2572
0
            return -1;
2573
18.9k
    } else {
2574
18.9k
        if (cram_stats_add(c->stats[DS_FP],
2575
18.9k
                           f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2576
0
            return -1;
2577
2578
18.9k
    }
2579
37.4k
    if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2580
0
        return -1;
2581
2582
37.4k
    s->features[s->nfeatures++] = *f;
2583
2584
37.4k
    return 0;
2585
37.4k
}
2586
2587
static int cram_add_substitution(cram_fd *fd, cram_container *c,
2588
                                 cram_slice *s, cram_record *r,
2589
662
                                 int pos, char base, char qual, char ref) {
2590
662
    cram_feature f;
2591
2592
    // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2593
662
    if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2594
511
        f.X.pos = pos+1;
2595
511
        f.X.code = 'X';
2596
511
        f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2597
511
        if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2598
0
            return -1;
2599
511
    } else {
2600
151
        f.B.pos = pos+1;
2601
151
        f.B.code = 'B';
2602
151
        f.B.base = base;
2603
151
        f.B.qual = qual;
2604
151
        if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2605
151
        if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2606
151
        BLOCK_APPEND_CHAR(s->qual_blk, qual);
2607
151
    }
2608
662
    return cram_add_feature(c, s, r, &f);
2609
2610
0
 block_err:
2611
0
    return -1;
2612
662
}
2613
2614
static int cram_add_bases(cram_fd *fd, cram_container *c,
2615
                          cram_slice *s, cram_record *r,
2616
291
                          int pos, int len, char *base) {
2617
291
    cram_feature f;
2618
2619
291
    f.b.pos = pos+1;
2620
291
    f.b.code = 'b';
2621
291
    f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2622
291
    f.b.len = len;
2623
2624
291
    return cram_add_feature(c, s, r, &f);
2625
291
}
2626
2627
static int cram_add_base(cram_fd *fd, cram_container *c,
2628
                         cram_slice *s, cram_record *r,
2629
787
                         int pos, char base, char qual) {
2630
787
    cram_feature f;
2631
787
    f.B.pos = pos+1;
2632
787
    f.B.code = 'B';
2633
787
    f.B.base = base;
2634
787
    f.B.qual = qual;
2635
787
    if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2636
787
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2637
787
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2638
787
    return cram_add_feature(c, s, r, &f);
2639
2640
0
 block_err:
2641
0
    return -1;
2642
787
}
2643
2644
static int cram_add_quality(cram_fd *fd, cram_container *c,
2645
                            cram_slice *s, cram_record *r,
2646
27
                            int pos, char qual) {
2647
27
    cram_feature f;
2648
27
    f.Q.pos = pos+1;
2649
27
    f.Q.code = 'Q';
2650
27
    f.Q.qual = qual;
2651
27
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2652
27
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2653
27
    return cram_add_feature(c, s, r, &f);
2654
2655
0
 block_err:
2656
0
    return -1;
2657
27
}
2658
2659
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2660
14.8k
                             int pos, int len, char *base) {
2661
14.8k
    cram_feature f;
2662
14.8k
    f.D.pos = pos+1;
2663
14.8k
    f.D.code = 'D';
2664
14.8k
    f.D.len = len;
2665
14.8k
    if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2666
14.8k
    return cram_add_feature(c, s, r, &f);
2667
14.8k
}
2668
2669
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2670
13.6k
                             int pos, int len, char *base, int version) {
2671
13.6k
    cram_feature f;
2672
13.6k
    f.S.pos = pos+1;
2673
13.6k
    f.S.code = 'S';
2674
13.6k
    f.S.len = len;
2675
13.6k
    switch (CRAM_MAJOR_VERS(version)) {
2676
0
    case 1:
2677
0
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2678
0
        BLOCK_APPEND(s->base_blk, base, len);
2679
0
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2680
0
        break;
2681
2682
0
    case 2:
2683
13.6k
    default:
2684
13.6k
        f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2685
13.6k
        if (base) {
2686
1.63k
            BLOCK_APPEND(s->soft_blk, base, len);
2687
12.0k
        } else {
2688
12.0k
            int i;
2689
92.0k
            for (i = 0; i < len; i++)
2690
80.0k
                BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2691
12.0k
        }
2692
13.6k
        BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2693
13.6k
        break;
2694
2695
        //default:
2696
        //    // v3.0 onwards uses BB data-series
2697
        //    f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2698
13.6k
    }
2699
13.6k
    return cram_add_feature(c, s, r, &f);
2700
2701
0
 block_err:
2702
0
    return -1;
2703
13.6k
}
2704
2705
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2706
6.42k
                             int pos, int len, char *base) {
2707
6.42k
    cram_feature f;
2708
6.42k
    f.S.pos = pos+1;
2709
6.42k
    f.S.code = 'H';
2710
6.42k
    f.S.len = len;
2711
6.42k
    if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2712
6.42k
    return cram_add_feature(c, s, r, &f);
2713
6.42k
}
2714
2715
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2716
204
                         int pos, int len, char *base) {
2717
204
    cram_feature f;
2718
204
    f.S.pos = pos+1;
2719
204
    f.S.code = 'N';
2720
204
    f.S.len = len;
2721
204
    if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2722
204
    return cram_add_feature(c, s, r, &f);
2723
204
}
2724
2725
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2726
342
                        int pos, int len, char *base) {
2727
342
    cram_feature f;
2728
342
    f.S.pos = pos+1;
2729
342
    f.S.code = 'P';
2730
342
    f.S.len = len;
2731
342
    if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2732
342
    return cram_add_feature(c, s, r, &f);
2733
342
}
2734
2735
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2736
260
                              int pos, int len, char *base) {
2737
260
    cram_feature f;
2738
260
    f.I.pos = pos+1;
2739
260
    if (len == 1) {
2740
57
        char b = base ? *base : 'N';
2741
57
        f.i.code = 'i';
2742
57
        f.i.base = b;
2743
57
        if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2744
203
    } else {
2745
203
        f.I.code = 'I';
2746
203
        f.I.len = len;
2747
203
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2748
203
        if (base) {
2749
99
            BLOCK_APPEND(s->base_blk, base, len);
2750
104
        } else {
2751
104
            int i;
2752
13.6M
            for (i = 0; i < len; i++)
2753
13.6M
                BLOCK_APPEND_CHAR(s->base_blk, 'N');
2754
104
        }
2755
203
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2756
203
    }
2757
260
    return cram_add_feature(c, s, r, &f);
2758
2759
0
 block_err:
2760
0
    return -1;
2761
260
}
2762
2763
/*
2764
 * Encodes auxiliary data. Largely duplicated from above, but done so to
2765
 * keep it simple and avoid a myriad of version ifs.
2766
 *
2767
 * Returns the RG header line pointed to by the BAM aux fields on success,
2768
 *         NULL on failure or no rg present, also sets "*err" to non-zero
2769
 */
2770
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
2771
                                      cram_container *c,
2772
                                      cram_slice *s, cram_record *cr,
2773
                                      int verbatim_NM, int verbatim_MD,
2774
                                      int NM, kstring_t *MD, int cf_tag,
2775
2.01M
                                      int no_ref, int *err) {
2776
2.01M
    char *aux, *orig;
2777
2.01M
    sam_hrec_rg_t *brg = NULL;
2778
2.01M
    int aux_size = bam_get_l_aux(b);
2779
2.01M
    const char *aux_end = bam_data_end(b);
2780
2.01M
    cram_block *td_b = c->comp_hdr->TD_blk;
2781
2.01M
    int TD_blk_size = BLOCK_SIZE(td_b), new;
2782
2.01M
    char *key;
2783
2.01M
    khint_t k;
2784
2785
2.01M
    if (err) *err = 1;
2786
2787
2.01M
    orig = aux = (char *)bam_aux(b);
2788
2789
2790
    // cF:i  => Extra CRAM bit flags.
2791
    // 1:  Don't auto-decode MD (may be invalid)
2792
    // 2:  Don't auto-decode NM (may be invalid)
2793
2.01M
    if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
2794
        // Temporary copy of aux so we can ammend it.
2795
17.0k
        aux = malloc(aux_size+4);
2796
17.0k
        if (!aux)
2797
0
            return NULL;
2798
2799
17.0k
        memcpy(aux, orig, aux_size);
2800
17.0k
        aux[aux_size++] = 'c';
2801
17.0k
        aux[aux_size++] = 'F';
2802
17.0k
        aux[aux_size++] = 'C';
2803
17.0k
        aux[aux_size++] = cf_tag;
2804
17.0k
        orig = aux;
2805
17.0k
        aux_end = aux + aux_size;
2806
17.0k
    }
2807
2808
    // Copy aux keys to td_b and aux values to slice aux blocks
2809
2.33M
    while (aux_end - aux >= 1 && aux[0] != 0) {
2810
311k
        int r;
2811
2812
        // Room for code + type + at least 1 byte of data
2813
311k
        if (aux - orig >= aux_size - 3)
2814
4
            goto err;
2815
2816
        // RG:Z
2817
311k
        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2818
391
            char *rg = &aux[3];
2819
391
            aux = rg;
2820
46.1k
            while (aux < aux_end && *aux++);
2821
391
            if (aux == aux_end && aux[-1] != '\0') {
2822
2
                hts_log_error("Unterminated RG:Z tag for read \"%s\"",
2823
2
                              bam_get_qname(b));
2824
2
                goto err;
2825
2
            }
2826
389
            brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
2827
389
            if (brg) {
2828
122
                if (CRAM_MAJOR_VERS(fd->version) >= 4)
2829
0
                    BLOCK_APPEND(td_b, "RG*", 3);
2830
122
                continue;
2831
267
            } else {
2832
                // RG:Z tag will be stored verbatim
2833
267
                hts_log_warning("Missing @RG header for RG \"%s\"", rg);
2834
267
                aux = rg - 3;
2835
267
            }
2836
389
        }
2837
2838
        // MD:Z
2839
311k
        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2840
9.18k
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2841
1.72k
                if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2842
0
                    while (aux < aux_end && *aux++);
2843
0
                    if (aux == aux_end && aux[-1] != '\0') {
2844
0
                        hts_log_error("Unterminated MD:Z tag for read \"%s\"",
2845
0
                                      bam_get_qname(b));
2846
0
                        goto err;
2847
0
                    }
2848
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2849
0
                        BLOCK_APPEND(td_b, "MD*", 3);
2850
0
                    continue;
2851
0
                }
2852
1.72k
            }
2853
9.18k
        }
2854
2855
        // NM:i
2856
311k
        if (aux[0] == 'N' && aux[1] == 'M') {
2857
59
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2858
0
                int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
2859
0
                if (NM_ == NM) {
2860
0
                    switch(aux[2]) {
2861
0
                    case 'A': case 'C': case 'c': aux+=4; break;
2862
0
                    case 'S': case 's':           aux+=5; break;
2863
0
                    case 'I': case 'i': case 'f': aux+=7; break;
2864
0
                    default:
2865
0
                        hts_log_error("Unhandled type code for NM tag");
2866
0
                        goto err;
2867
0
                    }
2868
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2869
0
                        BLOCK_APPEND(td_b, "NM*", 3);
2870
0
                    continue;
2871
0
                }
2872
0
            }
2873
59
        }
2874
2875
311k
        BLOCK_APPEND(td_b, aux, 3);
2876
2877
        // Container level tags_used, for TD series
2878
        // Maps integer key ('X0i') to cram_tag_map struct.
2879
311k
        int key = (((unsigned char *) aux)[0]<<16 |
2880
311k
                   ((unsigned char *) aux)[1]<<8  |
2881
311k
                   ((unsigned char *) aux)[2]);
2882
311k
        k = kh_put(m_tagmap, c->tags_used, key, &r);
2883
311k
        if (-1 == r)
2884
0
            goto err;
2885
311k
        else if (r != 0)
2886
59.4k
            kh_val(c->tags_used, k) = NULL;
2887
2888
311k
        if (r == 1) {
2889
59.4k
            khint_t k_global;
2890
2891
            // Global tags_used for cram_metrics support
2892
59.4k
            pthread_mutex_lock(&fd->metrics_lock);
2893
59.4k
            k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2894
59.4k
            if (-1 == r) {
2895
0
                pthread_mutex_unlock(&fd->metrics_lock);
2896
0
                goto err;
2897
0
            }
2898
59.4k
            if (r >= 1) {
2899
5.23k
                kh_val(fd->tags_used, k_global) = cram_new_metrics();
2900
5.23k
                if (!kh_val(fd->tags_used, k_global)) {
2901
0
                    kh_del(m_metrics, fd->tags_used, k_global);
2902
0
                    pthread_mutex_unlock(&fd->metrics_lock);
2903
0
                    goto err;
2904
0
                }
2905
5.23k
            }
2906
2907
59.4k
            pthread_mutex_unlock(&fd->metrics_lock);
2908
2909
59.4k
            int i2[2] = {'\t',key};
2910
59.4k
            size_t sk = key;
2911
59.4k
            cram_tag_map *m = calloc(1, sizeof(*m));
2912
59.4k
            if (!m)
2913
0
                goto_err;
2914
59.4k
            kh_val(c->tags_used, k) = m;
2915
2916
59.4k
            cram_codec *c;
2917
2918
            // Use a block content id based on the tag id.
2919
            // Codec type depends on tag data type.
2920
59.4k
            switch(aux[2]) {
2921
22.5k
            case 'Z': case 'H':
2922
                // string as byte_array_stop
2923
22.5k
                c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2924
22.5k
                                      E_BYTE_ARRAY, (void *)i2,
2925
22.5k
                                      fd->version, &fd->vv);
2926
22.5k
                break;
2927
2928
28.9k
            case 'A': case 'c': case 'C': {
2929
                // byte array len, 1 byte
2930
28.9k
                cram_byte_array_len_encoder e;
2931
28.9k
                cram_stats st;
2932
2933
28.9k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2934
28.9k
                    e.len_encoding = E_HUFFMAN;
2935
28.9k
                    e.len_dat = NULL; // will get codes from st
2936
28.9k
                } else {
2937
0
                    e.len_encoding = E_CONST_INT;
2938
0
                    e.len_dat = NULL; // will get codes from st
2939
0
                }
2940
28.9k
                memset(&st, 0, sizeof(st));
2941
28.9k
                if (cram_stats_add(&st, 1) < 0) goto block_err;
2942
28.9k
                cram_stats_encoding(fd, &st);
2943
2944
28.9k
                e.val_encoding = E_EXTERNAL;
2945
28.9k
                e.val_dat = (void *)sk;
2946
2947
28.9k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2948
28.9k
                                      E_BYTE_ARRAY, (void *)&e,
2949
28.9k
                                      fd->version, &fd->vv);
2950
28.9k
                break;
2951
28.9k
            }
2952
2953
1.66k
            case 's': case 'S': {
2954
                // byte array len, 2 byte
2955
1.66k
                cram_byte_array_len_encoder e;
2956
1.66k
                cram_stats st;
2957
2958
1.66k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2959
1.66k
                    e.len_encoding = E_HUFFMAN;
2960
1.66k
                    e.len_dat = NULL; // will get codes from st
2961
1.66k
                } else {
2962
0
                    e.len_encoding = E_CONST_INT;
2963
0
                    e.len_dat = NULL; // will get codes from st
2964
0
                }
2965
1.66k
                memset(&st, 0, sizeof(st));
2966
1.66k
                if (cram_stats_add(&st, 2) < 0) goto block_err;
2967
1.66k
                cram_stats_encoding(fd, &st);
2968
2969
1.66k
                e.val_encoding = E_EXTERNAL;
2970
1.66k
                e.val_dat = (void *)sk;
2971
2972
1.66k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2973
1.66k
                                      E_BYTE_ARRAY, (void *)&e,
2974
1.66k
                                      fd->version, &fd->vv);
2975
1.66k
                break;
2976
1.66k
            }
2977
5.19k
            case 'i': case 'I': case 'f': {
2978
                // byte array len, 4 byte
2979
5.19k
                cram_byte_array_len_encoder e;
2980
5.19k
                cram_stats st;
2981
2982
5.19k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2983
5.19k
                    e.len_encoding = E_HUFFMAN;
2984
5.19k
                    e.len_dat = NULL; // will get codes from st
2985
5.19k
                } else {
2986
0
                    e.len_encoding = E_CONST_INT;
2987
0
                    e.len_dat = NULL; // will get codes from st
2988
0
                }
2989
5.19k
                memset(&st, 0, sizeof(st));
2990
5.19k
                if (cram_stats_add(&st, 4) < 0) goto block_err;
2991
5.19k
                cram_stats_encoding(fd, &st);
2992
2993
5.19k
                e.val_encoding = E_EXTERNAL;
2994
5.19k
                e.val_dat = (void *)sk;
2995
2996
5.19k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2997
5.19k
                                      E_BYTE_ARRAY, (void *)&e,
2998
5.19k
                                      fd->version, &fd->vv);
2999
5.19k
                break;
3000
5.19k
            }
3001
3002
1.06k
            case 'B': {
3003
                // Byte array of variable size, but we generate our tag
3004
                // byte stream at the wrong stage (during reading and not
3005
                // after slice header construction). So we use
3006
                // BYTE_ARRAY_LEN with the length codec being external
3007
                // too.
3008
1.06k
                cram_byte_array_len_encoder e;
3009
3010
1.06k
                e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
3011
1.06k
                    ? E_VARINT_UNSIGNED
3012
1.06k
                    : E_EXTERNAL;
3013
1.06k
                e.len_dat = (void *)sk; // or key+128 for len?
3014
3015
1.06k
                e.val_encoding = E_EXTERNAL;
3016
1.06k
                e.val_dat = (void *)sk;
3017
3018
1.06k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
3019
1.06k
                                      E_BYTE_ARRAY, (void *)&e,
3020
1.06k
                                      fd->version, &fd->vv);
3021
1.06k
                break;
3022
5.19k
            }
3023
3024
46
            default:
3025
46
                hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
3026
46
                c = NULL;
3027
59.4k
            }
3028
3029
59.4k
            if (!c)
3030
46
                goto_err;
3031
3032
59.4k
            m->codec = c;
3033
3034
            // Link to fd-global tag metrics
3035
59.4k
            pthread_mutex_lock(&fd->metrics_lock);
3036
59.4k
            m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
3037
59.4k
            pthread_mutex_unlock(&fd->metrics_lock);
3038
59.4k
        }
3039
3040
311k
        cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3041
311k
        if (!tm) goto_err;
3042
311k
        cram_codec *codec = tm->codec;
3043
311k
        if (!tm->codec) goto_err;
3044
3045
311k
        switch(aux[2]) {
3046
164k
        case 'A': case 'C': case 'c':
3047
164k
            if (aux_end - aux < 3+1)
3048
0
                goto err;
3049
3050
164k
            if (!tm->blk) {
3051
28.9k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3052
0
                    goto err;
3053
28.9k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3054
28.9k
            }
3055
3056
164k
            aux+=3;
3057
            //codec->encode(s, codec, aux, 1);
3058
            // Functionally equivalent, but less code.
3059
164k
            BLOCK_APPEND_CHAR(tm->blk, *aux);
3060
164k
            aux++;
3061
164k
            break;
3062
3063
11.8k
        case 'S': case 's':
3064
11.8k
            if (aux_end - aux < 3+2)
3065
1
                goto err;
3066
3067
11.8k
            if (!tm->blk) {
3068
1.66k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3069
0
                    goto err;
3070
1.66k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3071
1.66k
            }
3072
3073
11.8k
            aux+=3;
3074
            //codec->encode(s, codec, aux, 2);
3075
11.8k
            BLOCK_APPEND(tm->blk, aux, 2);
3076
11.8k
            aux+=2;
3077
11.8k
            break;
3078
3079
52.9k
        case 'I': case 'i': case 'f':
3080
52.9k
            if (aux_end - aux < 3+4)
3081
3
                goto err;
3082
3083
52.9k
            if (!tm->blk) {
3084
5.19k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3085
0
                    goto err;
3086
5.19k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3087
5.19k
            }
3088
3089
52.9k
            aux+=3;
3090
            //codec->encode(s, codec, aux, 4);
3091
52.9k
            BLOCK_APPEND(tm->blk, aux, 4);
3092
52.9k
            aux+=4;
3093
52.9k
            break;
3094
3095
0
        case 'd':
3096
0
            if (aux_end - aux < 3+8)
3097
0
                goto err;
3098
3099
0
            if (!tm->blk) {
3100
0
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3101
0
                    goto err;
3102
0
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3103
0
            }
3104
3105
0
            aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
3106
            //codec->encode(s, codec, aux, 8);
3107
0
            BLOCK_APPEND(tm->blk, aux, 8);
3108
0
            aux+=8;
3109
0
            break;
3110
3111
65.2k
        case 'Z': case 'H': {
3112
65.2k
            if (aux_end - aux < 3)
3113
0
                goto err;
3114
3115
65.2k
            if (!tm->blk) {
3116
22.5k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3117
0
                    goto err;
3118
22.5k
                codec->out = tm->blk;
3119
22.5k
            }
3120
3121
65.2k
            char *aux_s;
3122
65.2k
            aux += 3;
3123
65.2k
            aux_s = aux;
3124
4.76M
            while (aux < aux_end && *aux++);
3125
65.2k
            if (aux == aux_end && aux[-1] != '\0') {
3126
1
                hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
3127
1
                              aux_s[-3], aux_s[-2], aux_s[-1],
3128
1
                              bam_get_qname(b));
3129
1
                goto err;
3130
1
            }
3131
65.2k
            if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
3132
0
                goto err;
3133
65.2k
            break;
3134
65.2k
        }
3135
3136
65.2k
        case 'B': {
3137
16.8k
            if (aux_end - aux < 4+4)
3138
4
                goto err;
3139
3140
16.8k
            int type = aux[3];
3141
16.8k
            uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
3142
16.8k
                              ((uint64_t)((unsigned char *)aux)[5]) << 8 |
3143
16.8k
                              ((uint64_t)((unsigned char *)aux)[6]) <<16 |
3144
16.8k
                              ((uint64_t)((unsigned char *)aux)[7]) <<24);
3145
16.8k
            uint64_t blen;
3146
16.8k
            if (!tm->blk) {
3147
1.06k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3148
0
                    goto err;
3149
1.06k
                if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
3150
0
                    if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
3151
0
                        goto err;
3152
0
                    codec->u.e_byte_array_len.len_codec->out = tm->blk2;
3153
0
                    codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
3154
1.06k
                } else {
3155
1.06k
                    codec->u.e_byte_array_len.len_codec->out = tm->blk;
3156
1.06k
                    codec->u.e_byte_array_len.val_codec->out = tm->blk;
3157
1.06k
                }
3158
1.06k
            }
3159
3160
            // skip TN field
3161
16.8k
            aux+=3;
3162
3163
            // We use BYTE_ARRAY_LEN with external length, so store that first
3164
16.8k
            switch (type) {
3165
8.61k
            case 'c': case 'C':
3166
8.61k
                blen = count;
3167
8.61k
                break;
3168
2.24k
            case 's': case 'S':
3169
2.24k
                blen = 2*count;
3170
2.24k
                break;
3171
5.95k
            case 'i': case 'I': case 'f':
3172
5.95k
                blen = 4*count;
3173
5.95k
                break;
3174
7
            default:
3175
7
                hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
3176
7
                goto err;
3177
16.8k
            }
3178
3179
16.8k
            blen += 5; // sub-type & length
3180
16.8k
            if (aux_end - aux < blen || blen > INT_MAX)
3181
1
                goto err;
3182
3183
16.8k
            if (codec->encode(s, codec, aux, (int) blen) < 0)
3184
0
                goto err;
3185
16.8k
            aux += blen;
3186
16.8k
            break;
3187
16.8k
        }
3188
0
        default:
3189
0
            hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
3190
0
            goto err;
3191
311k
        }
3192
311k
        tm->blk->m = tm->m;
3193
311k
    }
3194
3195
    // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
3196
3197
    // And and increment TD hash entry
3198
2.01M
    BLOCK_APPEND_CHAR(td_b, 0);
3199
3200
    // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
3201
2.01M
    key = string_ndup(c->comp_hdr->TD_keys,
3202
2.01M
                      (char *)BLOCK_DATA(td_b) + TD_blk_size,
3203
2.01M
                      BLOCK_SIZE(td_b) - TD_blk_size);
3204
2.01M
    if (!key)
3205
0
        goto block_err;
3206
2.01M
    k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
3207
2.01M
    if (new < 0) {
3208
0
        goto err;
3209
2.01M
    } else if (new == 0) {
3210
1.99M
        BLOCK_SIZE(td_b) = TD_blk_size;
3211
1.99M
    } else {
3212
21.2k
        kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
3213
21.2k
        c->comp_hdr->nTL++;
3214
21.2k
    }
3215
3216
2.01M
    cr->TL = kh_val(c->comp_hdr->TD_hash, k);
3217
2.01M
    if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
3218
0
        goto block_err;
3219
3220
2.01M
    if (orig != (char *)bam_aux(b))
3221
17.0k
        free(orig);
3222
3223
2.01M
    if (err) *err = 0;
3224
3225
2.01M
    return brg;
3226
3227
69
 err:
3228
69
 block_err:
3229
69
    if (orig != (char *)bam_aux(b))
3230
15
        free(orig);
3231
69
    return NULL;
3232
69
}
3233
3234
/*
3235
 * During cram_next_container or before the final flush at end of
3236
 * file, we update the current slice headers and increment the slice
3237
 * number to the next slice.
3238
 *
3239
 * See cram_next_container() and cram_close().
3240
 */
3241
15.8k
void cram_update_curr_slice(cram_container *c, int version) {
3242
15.8k
    cram_slice *s = c->slice;
3243
15.8k
    if (c->multi_seq) {
3244
285
        s->hdr->ref_seq_id    = -2;
3245
285
        s->hdr->ref_seq_start = 0;
3246
285
        s->hdr->ref_seq_span  = 0;
3247
15.6k
    } else if (c->curr_ref == -1 && CRAM_ge31(version)) {
3248
        // Spec states span=0, but it broke our range queries.
3249
        // See commit message for this and prior.
3250
8.40k
        s->hdr->ref_seq_id    = -1;
3251
8.40k
        s->hdr->ref_seq_start = 0;
3252
8.40k
        s->hdr->ref_seq_span  = 0;
3253
8.40k
    } else {
3254
7.20k
        s->hdr->ref_seq_id    = c->curr_ref;
3255
7.20k
        s->hdr->ref_seq_start = c->first_base;
3256
7.20k
        s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
3257
7.20k
    }
3258
15.8k
    s->hdr->num_records   = c->curr_rec;
3259
3260
15.8k
    if (c->curr_slice == 0) {
3261
15.8k
        if (c->ref_seq_id != s->hdr->ref_seq_id)
3262
8.69k
            c->ref_seq_id  = s->hdr->ref_seq_id;
3263
15.8k
        c->ref_seq_start = c->first_base;
3264
15.8k
    }
3265
3266
15.8k
    c->curr_slice++;
3267
15.8k
}
3268
3269
/*
3270
 * Handles creation of a new container or new slice, flushing any
3271
 * existing containers when appropriate.
3272
 *
3273
 * Really this is next slice, which may or may not lead to a new container.
3274
 *
3275
 * Returns cram_container pointer on success
3276
 *         NULL on failure.
3277
 */
3278
15.9k
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
3279
15.9k
    cram_container *c = fd->ctr;
3280
15.9k
    int i;
3281
3282
    /* First occurrence */
3283
15.9k
    if (c->curr_ref == -2)
3284
2.71k
        c->curr_ref = bam_ref(b);
3285
3286
15.9k
    if (c->slice)
3287
13.2k
        cram_update_curr_slice(c, fd->version);
3288
3289
    /* Flush container */
3290
15.9k
    if (c->curr_slice == c->max_slice ||
3291
13.2k
        (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
3292
13.2k
        c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
3293
13.2k
        hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
3294
13.2k
                     c->ref_seq_id, c->ref_seq_start,
3295
13.2k
                     c->ref_seq_start + c->ref_seq_span -1);
3296
3297
        /* Encode slices */
3298
13.2k
        if (-1 == cram_flush_container_mt(fd, c))
3299
69
            return NULL;
3300
13.1k
        if (!fd->pool) {
3301
            // Move to sep func, as we need cram_flush_container for
3302
            // the closing phase to flush the partial container.
3303
26.3k
            for (i = 0; i < c->max_slice; i++) {
3304
13.1k
                cram_free_slice(c->slices[i]);
3305
13.1k
                c->slices[i] = NULL;
3306
13.1k
            }
3307
3308
13.1k
            c->slice = NULL;
3309
13.1k
            c->curr_slice = 0;
3310
3311
            /* Easy approach for purposes of freeing stats */
3312
13.1k
            cram_free_container(c);
3313
13.1k
        }
3314
3315
13.1k
        c = fd->ctr = cram_new_container(fd->seqs_per_slice,
3316
13.1k
                                         fd->slices_per_container);
3317
13.1k
        if (!c)
3318
0
            return NULL;
3319
3320
13.1k
        pthread_mutex_lock(&fd->ref_lock);
3321
13.1k
        c->no_ref = fd->no_ref;
3322
13.1k
        c->embed_ref = fd->embed_ref;
3323
13.1k
        c->record_counter = fd->record_counter;
3324
13.1k
        pthread_mutex_unlock(&fd->ref_lock);
3325
13.1k
        c->curr_ref = bam_ref(b);
3326
13.1k
    }
3327
3328
15.8k
    c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
3329
3330
    /* New slice */
3331
15.8k
    c->slice = c->slices[c->curr_slice] =
3332
15.8k
        cram_new_slice(MAPPED_SLICE, c->max_rec);
3333
15.8k
    if (!c->slice)
3334
0
        return NULL;
3335
3336
15.8k
    if (c->multi_seq) {
3337
0
        c->slice->hdr->ref_seq_id = -2;
3338
0
        c->slice->hdr->ref_seq_start = 0;
3339
0
        c->slice->last_apos = 1;
3340
15.8k
    } else {
3341
15.8k
        c->slice->hdr->ref_seq_id = bam_ref(b);
3342
        // wrong for unsorted data, will fix during encoding.
3343
15.8k
        c->slice->hdr->ref_seq_start = bam_pos(b)+1;
3344
15.8k
        c->slice->last_apos = bam_pos(b)+1;
3345
15.8k
    }
3346
3347
15.8k
    c->curr_rec = 0;
3348
15.8k
    c->s_num_bases = 0;
3349
15.8k
    c->n_mapped = 0;
3350
3351
    // QO field: 0 implies original orientation, 1 implies sequence orientation
3352
    // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
3353
    // Conversely other data sets it's often better than 1% saving for 0.
3354
    // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
3355
15.8k
    c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
3356
3357
15.8k
    return c;
3358
15.8k
}
3359
3360
3361
/*
3362
 * Converts a single bam record into a cram record.
3363
 * Possibly used within a thread.
3364
 *
3365
 * Returns 0 on success;
3366
 *        -1 on failure
3367
 */
3368
static int process_one_read(cram_fd *fd, cram_container *c,
3369
                            cram_slice *s, cram_record *cr,
3370
                            bam_seq_t *b, int rnum, kstring_t *MD,
3371
2.01M
                            int embed_ref, int no_ref) {
3372
2.01M
    int i, fake_qual = -1, NM = 0;
3373
2.01M
    char *cp;
3374
2.01M
    char *ref, *seq, *qual;
3375
3376
    // Any places with N in seq and/or reference can lead to ambiguous
3377
    // interpretation of the SAM NM:i tag.  So we store these verbatim
3378
    // to ensure valid data round-trips the same regardless of who
3379
    // defines it as valid.
3380
    // Similarly when alignments go beyond end of the reference.
3381
2.01M
    int verbatim_NM = fd->store_nm;
3382
2.01M
    int verbatim_MD = fd->store_md;
3383
3384
    // FIXME: multi-ref containers
3385
3386
2.01M
    cr->flags       = bam_flag(b);
3387
2.01M
    cr->len         = bam_seq_len(b);
3388
2.01M
    uint8_t *md;
3389
2.01M
    if (!(md = bam_aux_get(b, "MD")))
3390
2.00M
        MD = NULL;
3391
9.67k
    else
3392
9.67k
        MD->l = 0;
3393
3394
2.01M
    int cf_tag = 0;
3395
3396
2.01M
    if (embed_ref == 2) {
3397
17.0k
        cf_tag  = MD ? 0 : 1;                   // No MD
3398
17.0k
        cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
3399
17.0k
    }
3400
3401
    //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
3402
3403
2.01M
    ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
3404
2.01M
    cr->ref_id      = bam_ref(b);
3405
2.01M
    if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
3406
0
        goto block_err;
3407
2.01M
    if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
3408
0
        goto block_err;
3409
3410
    // Non reference based encoding means storing the bases verbatim as features, which in
3411
    // turn means every base also has a quality already stored.
3412
2.01M
    if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
3413
2.01M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3414
3415
2.01M
    if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
3416
1.93M
        cr->cram_flags |= CRAM_FLAG_NO_SEQ;
3417
    //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3418
3419
2.01M
    c->num_bases   += cr->len;
3420
2.01M
    cr->apos        = bam_pos(b)+1;
3421
2.01M
    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
3422
18
        goto err;
3423
2.01M
    if (c->pos_sorted) {
3424
2.01M
        if (cr->apos < s->last_apos && !fd->ap_delta) {
3425
332
            c->pos_sorted = 0;
3426
2.01M
        } else {
3427
2.01M
            if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
3428
0
                goto block_err;
3429
2.01M
            s->last_apos = cr->apos;
3430
2.01M
        }
3431
2.01M
    } else {
3432
        //cram_stats_add(c->stats[DS_AP], cr->apos);
3433
6.89k
    }
3434
2.01M
    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
3435
3436
    /*
3437
     * This seqs_ds is largely pointless and it could reuse the same memory
3438
     * over and over.
3439
     * s->base_blk is what we need for encoding.
3440
     */
3441
2.01M
    cr->seq         = BLOCK_SIZE(s->seqs_blk);
3442
2.01M
    cr->qual        = BLOCK_SIZE(s->qual_blk);
3443
2.01M
    BLOCK_GROW(s->seqs_blk, cr->len+1);
3444
2.01M
    BLOCK_GROW(s->qual_blk, cr->len);
3445
3446
    // Convert BAM nibble encoded sequence to string of base pairs
3447
2.01M
    seq = cp = (char *)BLOCK_END(s->seqs_blk);
3448
2.01M
    *seq = 0;
3449
2.01M
    nibble2base(bam_seq(b), cp, cr->len);
3450
2.01M
    BLOCK_SIZE(s->seqs_blk) += cr->len;
3451
3452
2.01M
    qual = cp = (char *)bam_qual(b);
3453
3454
3455
    /* Copy and parse */
3456
2.01M
    if (!(cr->flags & BAM_FUNMAP)) {
3457
18.6k
        uint32_t *cig_to, *cig_from;
3458
18.6k
        int64_t apos = cr->apos-1, spos = 0;
3459
18.6k
        int64_t MD_last = apos; // last position of edit in MD tag
3460
3461
18.6k
        if (apos < 0) {
3462
2
            hts_log_error("Mapped read with position <= 0 is disallowed");
3463
2
            return -1;
3464
2
        }
3465
3466
18.6k
        cr->cigar       = s->ncigar;
3467
18.6k
        cr->ncigar      = bam_cigar_len(b);
3468
18.7k
        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
3469
72
            s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
3470
72
            s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
3471
72
            if (!s->cigar)
3472
0
                return -1;
3473
72
        }
3474
3475
18.6k
        cig_to = (uint32_t *)s->cigar;
3476
18.6k
        cig_from = (uint32_t *)bam_cigar(b);
3477
3478
18.6k
        cr->feature = 0;
3479
18.6k
        cr->nfeature = 0;
3480
63.1k
        for (i = 0; i < cr->ncigar; i++) {
3481
44.5k
            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
3482
44.5k
            uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
3483
44.5k
            cig_to[i] = cig_from[i];
3484
3485
            /* Can also generate events from here for CRAM diffs */
3486
3487
44.5k
            switch (cig_op) {
3488
0
                int l;
3489
3490
                // Don't trust = and X ops to be correct.
3491
8.11k
            case BAM_CMATCH:
3492
8.19k
            case BAM_CBASE_MATCH:
3493
8.72k
            case BAM_CBASE_MISMATCH:
3494
                //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
3495
                //      cig_len, &ref[apos], cig_len, &seq[spos]);
3496
8.72k
                l = 0;
3497
8.72k
                if (!no_ref && cr->len) {
3498
5.55k
                    int end = cig_len+apos < c->ref_end
3499
5.55k
                        ? cig_len : c->ref_end - apos;
3500
5.55k
                    char *sp = &seq[spos];
3501
5.55k
                    char *rp = &ref[apos];
3502
5.55k
                    char *qp = &qual[spos];
3503
5.55k
                    if (end > cr->len) {
3504
1
                        hts_log_error("CIGAR and query sequence are of different length");
3505
1
                        return -1;
3506
1
                    }
3507
8.30k
                    for (l = 0; l < end; l++) {
3508
                        // This case is just too disputed and different tools
3509
                        // interpret these in different ways.  We give up and
3510
                        // store verbatim.
3511
2.74k
                        if (rp[l] == 'N' && sp[l] == 'N')
3512
121
                            verbatim_NM = verbatim_MD = 1;
3513
2.74k
                        if (rp[l] != sp[l]) {
3514
                            // Build our own MD tag if one is on the sequence, so
3515
                            // we can ensure it matches and thus can be discarded.
3516
662
                            if (MD && ref) {
3517
192
                                if (kputuw(apos+l - MD_last, MD) < 0) goto err;
3518
192
                                if (kputc(rp[l], MD) < 0) goto err;
3519
192
                                MD_last = apos+l+1;
3520
192
                            }
3521
662
                            NM++;
3522
662
                            if (!sp[l])
3523
0
                                break;
3524
662
                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
3525
#if 0
3526
                                // Disabled for the time being as it doesn't
3527
                                // seem to gain us much.
3528
                                int ol=l;
3529
                                while (l<end && rp[l] != sp[l])
3530
                                    l++;
3531
                                if (l-ol > 1) {
3532
                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
3533
                                                       l-ol, &seq[spos+ol]))
3534
                                        return -1;
3535
                                    l--;
3536
                                } else {
3537
                                    l = ol;
3538
                                    if (cram_add_substitution(fd, c, s, cr,
3539
                                                              spos+l, sp[l],
3540
                                                              qp[l], rp[l]))
3541
                                        return -1;
3542
                                }
3543
#else
3544
                                // With urmap pushed to the limit and lots
3545
                                // of unaligned data (should be soft-clipped)
3546
                                // this saves ~2-7%. Worth it?
3547
0
                                int nl = l;
3548
0
                                int max_end = nl, max_score = 0, score = 0;
3549
0
                                while (nl < end) {
3550
0
                                    if (rp[nl] != sp[nl]) {
3551
0
                                        score += 3;
3552
0
                                        if (max_score < score) {
3553
0
                                            max_score = score;
3554
0
                                            max_end = nl;
3555
0
                                        }
3556
0
                                    } else {
3557
0
                                        score--;
3558
0
                                        if (score < -2 ||
3559
0
                                            max_score - score > 7)
3560
0
                                            break;
3561
0
                                    }
3562
0
                                    nl++;
3563
0
                                }
3564
0
                                if (max_score > 20) {
3565
0
                                    cram_add_bases(fd, c, s, cr, spos+l,
3566
0
                                                   max_end-l, &seq[spos+l]);
3567
0
                                    l = max_end-1;
3568
0
                                } else {
3569
0
                                    while (l < nl) {
3570
0
                                        if (rp[l] != sp[l])
3571
0
                                            cram_add_substitution(fd, c, s,
3572
0
                                                                  cr, spos+l,
3573
0
                                                                  sp[l], qp[l],
3574
0
                                                                  rp[l]);
3575
0
                                        l++;
3576
0
                                    }
3577
0
                                    l--;
3578
0
                                }
3579
0
#endif
3580
662
                            } else {
3581
662
                                if (cram_add_substitution(fd, c, s, cr, spos+l,
3582
662
                                                          sp[l], qp[l], rp[l]))
3583
0
                                    return -1;
3584
662
                            }
3585
662
                        }
3586
2.74k
                    }
3587
5.55k
                    spos += l;
3588
5.55k
                    apos += l;
3589
5.55k
                }
3590
3591
8.72k
                if (l < cig_len && cr->len) {
3592
1.06k
                    if (no_ref) {
3593
291
                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
3594
291
                            if (cram_add_bases(fd, c, s, cr, spos,
3595
291
                                               cig_len-l, &seq[spos]))
3596
0
                                return -1;
3597
291
                            spos += cig_len-l;
3598
291
                        } else {
3599
0
                            for (; l < cig_len && seq[spos]; l++, spos++) {
3600
0
                                if (cram_add_base(fd, c, s, cr, spos,
3601
0
                                                  seq[spos], qual[spos]))
3602
0
                                    return -1;
3603
0
                            }
3604
0
                        }
3605
770
                    } else {
3606
                        /* off end of sequence or non-ref based output */
3607
770
                        verbatim_NM = verbatim_MD = 1;
3608
1.55k
                        for (; l < cig_len && seq[spos]; l++, spos++) {
3609
787
                            if (cram_add_base(fd, c, s, cr, spos,
3610
787
                                              seq[spos], qual[spos]))
3611
0
                                return -1;
3612
787
                        }
3613
770
                    }
3614
1.06k
                    apos += cig_len;
3615
7.66k
                } else if (!cr->len) {
3616
                    /* Seq "*" */
3617
2.71k
                    verbatim_NM = verbatim_MD = 1;
3618
2.71k
                    apos += cig_len;
3619
2.71k
                    spos += cig_len;
3620
2.71k
                }
3621
8.72k
                break;
3622
3623
14.8k
            case BAM_CDEL:
3624
14.8k
                if (MD && ref) {
3625
3.14k
                    if (kputuw(apos - MD_last, MD) < 0) goto err;
3626
3.14k
                    if (apos < c->ref_end) {
3627
829
                        if (kputc_('^', MD) < 0) goto err;
3628
829
                        if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
3629
0
                            goto err;
3630
829
                    }
3631
3.14k
                }
3632
14.8k
                NM += cig_len;
3633
3634
14.8k
                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
3635
0
                    return -1;
3636
14.8k
                apos += cig_len;
3637
14.8k
                MD_last = apos;
3638
14.8k
                break;
3639
3640
204
            case BAM_CREF_SKIP:
3641
204
                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
3642
0
                    return -1;
3643
204
                apos += cig_len;
3644
204
                MD_last += cig_len;
3645
204
                break;
3646
3647
260
            case BAM_CINS:
3648
260
                if (cram_add_insertion(c, s, cr, spos, cig_len,
3649
260
                                       cr->len ? &seq[spos] : NULL))
3650
0
                    return -1;
3651
260
                if (no_ref && cr->len) {
3652
95
                    for (l = 0; l < cig_len; l++, spos++) {
3653
27
                        cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3654
27
                    }
3655
192
                } else {
3656
192
                    spos += cig_len;
3657
192
                }
3658
260
                NM += cig_len;
3659
260
                break;
3660
3661
13.6k
            case BAM_CSOFT_CLIP:
3662
13.6k
                if (cram_add_softclip(c, s, cr, spos, cig_len,
3663
13.6k
                                      cr->len ? &seq[spos] : NULL,
3664
13.6k
                                      fd->version))
3665
0
                    return -1;
3666
3667
13.6k
                if (no_ref &&
3668
2.35k
                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
3669
0
                    if (cr->len) {
3670
0
                        for (l = 0; l < cig_len; l++, spos++) {
3671
0
                            cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3672
0
                        }
3673
0
                    } else {
3674
0
                        for (l = 0; l < cig_len; l++, spos++) {
3675
0
                            cram_add_quality(fd, c, s, cr, spos, -1);
3676
0
                        }
3677
0
                    }
3678
13.6k
                } else {
3679
13.6k
                    spos += cig_len;
3680
13.6k
                }
3681
13.6k
                break;
3682
3683
6.42k
            case BAM_CHARD_CLIP:
3684
6.42k
                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3685
0
                    return -1;
3686
6.42k
                break;
3687
3688
6.42k
            case BAM_CPAD:
3689
342
                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3690
0
                    return -1;
3691
342
                break;
3692
3693
342
            default:
3694
86
                hts_log_error("Unknown CIGAR op code %d", cig_op);
3695
86
                return -1;
3696
44.5k
            }
3697
44.5k
        }
3698
18.5k
        if (cr->len && spos != cr->len) {
3699
2
            hts_log_error("CIGAR and query sequence are of different length");
3700
2
            return -1;
3701
2
        }
3702
18.5k
        fake_qual = spos;
3703
        // Protect against negative length refs (fuzz 382922241)
3704
18.5k
        cr->aend = no_ref ? apos : MIN(apos, MAX(0, c->ref_end));
3705
18.5k
        if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3706
0
            goto block_err;
3707
3708
18.5k
        if (MD && ref)
3709
5.81k
            if (kputuw(apos - MD_last, MD) < 0) goto err;
3710
2.00M
    } else {
3711
        // Unmapped
3712
2.00M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3713
2.00M
        cr->cigar  = 0;
3714
2.00M
        cr->ncigar = 0;
3715
2.00M
        cr->nfeature = 0;
3716
2.00M
        cr->aend = MIN(cr->apos, c->ref_end);
3717
268M
        for (i = 0; i < cr->len; i++)
3718
266M
            if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3719
0
                goto block_err;
3720
2.00M
        fake_qual = 0;
3721
2.00M
    }
3722
3723
2.01M
    cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3724
2.01M
    int err = 0;
3725
2.01M
    sam_hrec_rg_t *brg =
3726
2.01M
        cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
3727
2.01M
                        cf_tag, no_ref, &err);
3728
2.01M
    if (err)
3729
69
        goto block_err;
3730
3731
    /* Read group, identified earlier */
3732
2.01M
    if (brg) {
3733
122
        cr->rg = brg->id;
3734
2.01M
    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3735
0
        sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3736
0
        if (!brg) goto block_err;
3737
0
        cr->rg = brg->id;
3738
2.01M
    } else {
3739
2.01M
        cr->rg = -1;
3740
2.01M
    }
3741
2.01M
    if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3742
0
        goto block_err;
3743
3744
    /*
3745
     * Append to the qual block now. We do this here as
3746
     * cram_add_substitution() can generate BA/QS events which need to
3747
     * be in the qual block before we append the rest of the data.
3748
     */
3749
2.01M
    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3750
        /* Special case of seq "*" */
3751
2.01M
        if (cr->len == 0) {
3752
1.93M
            cr->len = fake_qual;
3753
1.93M
            BLOCK_GROW(s->qual_blk, cr->len);
3754
1.93M
            cp = (char *)BLOCK_END(s->qual_blk);
3755
1.93M
            memset(cp, 255, cr->len);
3756
1.93M
        } else {
3757
87.3k
            BLOCK_GROW(s->qual_blk, cr->len);
3758
87.3k
            cp = (char *)BLOCK_END(s->qual_blk);
3759
87.3k
            char *from = (char *)&bam_qual(b)[0];
3760
87.3k
            char *to = &cp[0];
3761
87.3k
            memcpy(to, from, cr->len);
3762
3763
            // Store quality in original orientation for better compression.
3764
87.3k
            if (!c->qs_seq_orient) {
3765
0
                if (cr->flags & BAM_FREVERSE) {
3766
0
                    int i, j;
3767
0
                    for (i = 0, j = cr->len-1; i < j; i++, j--) {
3768
0
                        unsigned char c;
3769
0
                        c = to[i];
3770
0
                        to[i] = to[j];
3771
0
                        to[j] = c;
3772
0
                    }
3773
0
                }
3774
0
            }
3775
87.3k
        }
3776
2.01M
        BLOCK_SIZE(s->qual_blk) += cr->len;
3777
2.01M
    } else {
3778
0
        if (cr->len == 0)
3779
0
            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3780
0
    }
3781
3782
2.01M
    if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3783
0
        goto block_err;
3784
3785
    /* Now we know apos and aend both, update mate-pair information */
3786
2.01M
    {
3787
2.01M
        int new;
3788
2.01M
        khint_t k;
3789
2.01M
        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3790
3791
        //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3792
        //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3793
2.01M
        if (cr->flags & BAM_FPAIRED) {
3794
9.45k
            char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3795
9.45k
            if (!key)
3796
0
                return -1;
3797
3798
9.45k
            k = kh_put(m_s2i, s->pair[sec], key, &new);
3799
9.45k
            if (-1 == new)
3800
0
                return -1;
3801
9.45k
            else if (new > 0)
3802
1.97k
                kh_val(s->pair[sec], k) = rnum
3803
1.97k
                    | ((unsigned)((cr->flags & BAM_FREAD1)!=0)<<30)
3804
1.97k
                    | ((unsigned)((cr->flags & BAM_FREAD2)!=0)<<31);
3805
2.00M
        } else {
3806
2.00M
            new = 1;
3807
2.00M
            k = 0; // Prevents false-positive warning from gcc -Og
3808
2.00M
        }
3809
3810
2.01M
        if (new == 0) {
3811
7.48k
            cram_record *p = &s->crecs[kh_val(s->pair[sec], k) & ((1<<30)-1)];
3812
7.48k
            int64_t aleft, aright;
3813
7.48k
            int sign;
3814
3815
7.48k
            aleft = MIN(cr->apos, p->apos);
3816
7.48k
            aright = MAX(cr->aend, p->aend);
3817
7.48k
            if (cr->apos < p->apos) {
3818
58
                sign = 1;
3819
7.42k
            } else if (cr->apos > p->apos) {
3820
170
                sign = -1;
3821
7.25k
            } else if (cr->flags & BAM_FREAD1) {
3822
5.01k
                sign = 1;
3823
5.01k
            } else {
3824
2.24k
                sign = -1;
3825
2.24k
            }
3826
3827
            // Multiple sets of secondary reads means we cannot tell which is
3828
            // which, so we store TLEN etc verbatim.
3829
7.48k
            int has_r1 = kh_val(s->pair[sec], k) & (1<<30);
3830
7.48k
            unsigned int has_r2 = kh_val(s->pair[sec], k) & (1u<<31);
3831
7.48k
            if ((has_r1 && (cr->flags & BAM_FREAD1)) ||
3832
2.51k
                (has_r2 && (cr->flags & BAM_FREAD2)))
3833
5.85k
                goto detached;
3834
3835
            // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3836
            // a small amount, if appropriate options set.
3837
1.62k
            if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3838
821
                !(fd->tlen_zero && bam_mate_pos(b) == 0))
3839
821
                goto detached;
3840
3841
806
            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3842
806
                ((p->flags & BAM_FUNMAP) != 0))
3843
574
                goto detached;
3844
3845
232
            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3846
232
                ((p->flags & BAM_FREVERSE) != 0))
3847
34
                goto detached;
3848
3849
3850
            // p vs this: tlen, matepos, flags
3851
198
            if (p->ref_id != cr->ref_id &&
3852
16
                !(fd->tlen_zero && p->ref_id == -1))
3853
16
                goto detached;
3854
3855
182
            if (p->mate_pos != cr->apos &&
3856
54
                !(fd->tlen_zero && p->mate_pos == 0))
3857
54
                goto detached;
3858
3859
128
            if (((p->flags & BAM_FMUNMAP) != 0) !=
3860
128
                ((p->mate_flags & CRAM_M_UNMAP) != 0))
3861
3
                goto detached;
3862
3863
125
            if (((p->flags & BAM_FMREVERSE) != 0) !=
3864
125
                ((p->mate_flags & CRAM_M_REVERSE) != 0))
3865
6
                goto detached;
3866
3867
            // Supplementary reads are just too ill defined
3868
119
            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3869
119
                (p->flags & BAM_FSUPPLEMENTARY))
3870
7
                goto detached;
3871
3872
            // When in lossy name mode, if a read isn't detached we
3873
            // cannot store the name.  The corollary is that when we
3874
            // must store the name, it must be detached (inefficient).
3875
112
            if (fd->lossy_read_names &&
3876
0
                (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3877
0
                 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3878
0
                goto detached;
3879
3880
            // Now check TLEN.  We do this last as sometimes it's the
3881
            // only thing that differs.  In CRAM4 we have a better way
3882
            // of handling this that doesn't break detached status
3883
112
            int explicit_tlen = 0;
3884
112
            int tflag1 = ((bam_ins_size(b) &&
3885
44
                           llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3886
44
                           > fd->tlen_approx)
3887
78
                          || (!bam_ins_size(b) && !fd->tlen_zero));
3888
3889
112
            int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3890
44
                           > fd->tlen_approx)
3891
90
                          || (!p->tlen && !fd->tlen_zero));
3892
3893
112
            if (tflag1 || tflag2) {
3894
104
                if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3895
0
                    explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3896
104
                } else {
3897
                    // Stil do detached for unmapped data in CRAM4 as this
3898
                    // also impacts RNEXT calculation.
3899
104
                    goto detached;
3900
104
                }
3901
104
            }
3902
3903
            /*
3904
             * The fields below are unused when encoding this read as it is
3905
             * no longer detached.  In theory they may get referred to when
3906
             * processing a 3rd or 4th read in this template?, so we set them
3907
             * here just to be sure.
3908
             *
3909
             * They do not need cram_stats_add() calls those as they are
3910
             * not emitted.
3911
             */
3912
8
            cr->mate_pos = p->apos;
3913
8
            cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3914
8
            cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3915
8
            cram_stats_add(c->stats[DS_TS], cr->tlen);
3916
8
            cr->mate_flags =
3917
8
                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
3918
8
                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3919
3920
            // Decrement statistics aggregated earlier
3921
8
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3922
8
                cram_stats_del(c->stats[DS_NP], p->mate_pos);
3923
8
                cram_stats_del(c->stats[DS_MF], p->mate_flags);
3924
8
                if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3925
8
                    cram_stats_del(c->stats[DS_TS], p->tlen);
3926
8
                cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3927
8
            }
3928
3929
            /* Similarly we could correct the p-> values too, but these will no
3930
             * longer have any code that refers back to them as the new 'p'
3931
             * for this template is our current 'cr'.
3932
             */
3933
            //p->mate_pos = cr->apos;
3934
            //p->mate_flags =
3935
            //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
3936
            //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3937
            //p->tlen = p->apos - cr->aend;
3938
3939
            // Clear detached from cr flags
3940
8
            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3941
8
            cr->cram_flags |= explicit_tlen;
3942
8
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3943
0
                goto block_err;
3944
3945
            // Clear detached from p flags and set downstream
3946
8
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3947
8
                cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3948
8
                p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3949
8
            }
3950
3951
8
            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3952
8
            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3953
8
            if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3954
0
                goto block_err;
3955
3956
8
            p->mate_line = rnum - ((kh_val(s->pair[sec], k) & ((1<<30)-1)) + 1);
3957
8
            if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3958
0
                goto block_err;
3959
3960
8
            int r12_flags = kh_val(s->pair[sec], k) & (3u<<30);
3961
8
            kh_val(s->pair[sec], k) = (unsigned int)rnum | r12_flags
3962
8
                | (((cr->flags & BAM_FREAD1)!=0)<<30)
3963
8
                | ((unsigned)((cr->flags & BAM_FREAD2)!=0)<<31);
3964
2.01M
        } else {
3965
2.01M
        detached:
3966
            //fprintf(stderr, "unpaired\n");
3967
3968
            /* Derive mate flags from this flag */
3969
2.01M
            cr->mate_flags = 0;
3970
2.01M
            if (bam_flag(b) & BAM_FMUNMAP)
3971
7.76k
                cr->mate_flags |= CRAM_M_UNMAP;
3972
2.01M
            if (bam_flag(b) & BAM_FMREVERSE)
3973
200
                cr->mate_flags |= CRAM_M_REVERSE;
3974
3975
2.01M
            if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3976
0
                goto block_err;
3977
3978
2.01M
            cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3979
2.01M
            if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3980
0
                goto block_err;
3981
3982
2.01M
            cr->tlen        = bam_ins_size(b);
3983
2.01M
            if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3984
0
                goto block_err;
3985
3986
2.01M
            cr->cram_flags |= CRAM_FLAG_DETACHED;
3987
2.01M
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3988
0
                goto block_err;
3989
2.01M
            if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3990
0
                goto block_err;
3991
3992
2.01M
            cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3993
2.01M
        }
3994
2.01M
    }
3995
3996
2.01M
    cr->mqual       = bam_map_qual(b);
3997
2.01M
    if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3998
0
        goto block_err;
3999
4000
2.01M
    cr->mate_ref_id = bam_mate_ref(b);
4001
4002
2.01M
    if (!(bam_flag(b) & BAM_FUNMAP)) {
4003
18.5k
        if (c->first_base > cr->apos)
4004
53
            c->first_base = cr->apos;
4005
4006
18.5k
        if (c->last_base < cr->aend)
4007
1.03k
            c->last_base = cr->aend;
4008
18.5k
    }
4009
4010
2.01M
    return 0;
4011
4012
69
 block_err:
4013
87
 err:
4014
87
    return -1;
4015
69
}
4016
4017
/*
4018
 * Write iterator: put BAM format sequences into a CRAM file.
4019
 * We buffer up a containers worth of data at a time.
4020
 *
4021
 * Returns 0 on success
4022
 *        -1 on failure
4023
 */
4024
2.01M
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
4025
2.01M
    cram_container *c;
4026
4027
2.01M
    if (!fd->ctr) {
4028
2.71k
        fd->ctr = cram_new_container(fd->seqs_per_slice,
4029
2.71k
                                     fd->slices_per_container);
4030
2.71k
        if (!fd->ctr)
4031
0
            return -1;
4032
2.71k
        fd->ctr->record_counter = fd->record_counter;
4033
4034
2.71k
        pthread_mutex_lock(&fd->ref_lock);
4035
2.71k
        fd->ctr->no_ref = fd->no_ref;
4036
2.71k
        fd->ctr->embed_ref = fd->embed_ref;
4037
2.71k
        pthread_mutex_unlock(&fd->ref_lock);
4038
2.71k
    }
4039
2.01M
    c = fd->ctr;
4040
4041
2.01M
    int embed_ref = c->embed_ref;
4042
4043
2.01M
    if (!c->slice || c->curr_rec == c->max_rec ||
4044
2.01M
        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
4045
2.00M
        (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
4046
17.4k
        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
4047
17.4k
        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
4048
4049
        /*
4050
         * Start packing slices when we routinely have under 1/4tr full.
4051
         *
4052
         * This option isn't available if we choose to embed references
4053
         * since we can only have one per slice.
4054
         *
4055
         * The multi_seq var here refers to our intention for the next slice.
4056
         * This slice has already been encoded so we output as-is.
4057
         */
4058
17.4k
        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
4059
15.6k
            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
4060
12.5k
            embed_ref<=0) {
4061
246
            if (!c->multi_seq)
4062
170
                hts_log_info("Multi-ref enabled for next container");
4063
246
            multi_seq = 1;
4064
17.1k
        } else if (fd->multi_seq == 1) {
4065
1.57k
            pthread_mutex_lock(&fd->metrics_lock);
4066
1.57k
            if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
4067
155
                multi_seq = 0;
4068
155
                hts_log_info("Multi-ref disabled for next container");
4069
155
            }
4070
1.57k
            pthread_mutex_unlock(&fd->metrics_lock);
4071
1.57k
        }
4072
4073
17.4k
        slice_rec = c->slice_rec;
4074
17.4k
        curr_rec  = c->curr_rec;
4075
4076
17.4k
        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
4077
17.4k
            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
4078
15.9k
            c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
4079
15.9k
            if (NULL == (c = cram_next_container(fd, b))) {
4080
69
                if (fd->ctr) {
4081
                    // prevent cram_close attempting to flush
4082
69
                    fd->ctr_mt = fd->ctr; // delay free when threading
4083
69
                    fd->ctr = NULL;
4084
69
                }
4085
69
                return -1;
4086
69
            }
4087
15.9k
        }
4088
4089
        /*
4090
         * Due to our processing order, some things we've already done we
4091
         * cannot easily undo. So when we first notice we should be packing
4092
         * multiple sequences per container we emit the small partial
4093
         * container as-is and then start a fresh one in a different mode.
4094
         */
4095
17.3k
        if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
4096
            // User selected auto-mode, we're currently using multi-seq, but
4097
            // have detected we don't need to.  Switch back to auto.
4098
154
            fd->multi_seq = -1;
4099
17.2k
        } else if (multi_seq) {
4100
            // We detected we need multi-seq
4101
1.66k
            fd->multi_seq = 1;
4102
1.66k
            c->multi_seq = 1;
4103
1.66k
            c->pos_sorted = 0;
4104
4105
            // Cram_next_container may end up flushing an existing one and
4106
            // triggering fd->embed_ref=2 if no reference is found.
4107
            // Embedded refs are incompatible with multi-seq, so we bail
4108
            // out and switch to no_ref in this scenario.  We do this
4109
            // within the container only, as multi_seq may be temporary
4110
            // and we switch back away from it again.
4111
1.66k
            pthread_mutex_lock(&fd->ref_lock);
4112
1.66k
            if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
4113
35
                hts_log_warning("Changing from embed_ref to no_ref mode");
4114
                // Should we update fd->embed_ref and no_ref here too?
4115
                // Doing so means if we go into multi-seq and back out
4116
                // again, eg due a cluster of tiny refs in the middle of
4117
                // much larger ones, then we bake in no-ref mode.
4118
                //
4119
                // However for unsorted data we're realistically not
4120
                // going to switch back.
4121
35
                c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
4122
35
                c->no_ref = fd->no_ref = 1;
4123
35
            }
4124
1.66k
            pthread_mutex_unlock(&fd->ref_lock);
4125
4126
1.66k
            if (!c->refs_used) {
4127
285
                pthread_mutex_lock(&fd->ref_lock);
4128
285
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4129
285
                pthread_mutex_unlock(&fd->ref_lock);
4130
285
                if (!c->refs_used)
4131
0
                    return -1;
4132
285
            }
4133
1.66k
        }
4134
4135
17.3k
        fd->last_slice = curr_rec - slice_rec;
4136
17.3k
        c->slice_rec = c->curr_rec;
4137
4138
        // Have we seen this reference before?
4139
17.3k
        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
4140
0
            embed_ref<=0 && !fd->unsorted && multi_seq) {
4141
4142
0
            if (!c->refs_used) {
4143
0
                pthread_mutex_lock(&fd->ref_lock);
4144
0
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4145
0
                pthread_mutex_unlock(&fd->ref_lock);
4146
0
                if (!c->refs_used)
4147
0
                    return -1;
4148
0
            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
4149
0
                pthread_mutex_lock(&fd->ref_lock);
4150
0
                fd->unsorted = 1;
4151
0
                fd->multi_seq = 1;
4152
0
                pthread_mutex_unlock(&fd->ref_lock);
4153
0
            }
4154
0
        }
4155
4156
17.3k
        c->curr_ref = bam_ref(b);
4157
17.3k
        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
4158
17.3k
    }
4159
4160
2.01M
    if (!c->bams) {
4161
        /* First time through, allocate a set of bam pointers */
4162
15.8k
        pthread_mutex_lock(&fd->bam_list_lock);
4163
15.8k
        if (fd->bl) {
4164
13.1k
            spare_bams *spare = fd->bl;
4165
13.1k
            c->bams = spare->bams;
4166
13.1k
            fd->bl = spare->next;
4167
13.1k
            free(spare);
4168
13.1k
        } else {
4169
2.71k
            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
4170
2.71k
            if (!c->bams) {
4171
0
                pthread_mutex_unlock(&fd->bam_list_lock);
4172
0
                return -1;
4173
0
            }
4174
2.71k
        }
4175
15.8k
        pthread_mutex_unlock(&fd->bam_list_lock);
4176
15.8k
    }
4177
4178
    /* Copy or alloc+copy the bam record, for later encoding */
4179
2.01M
    if (c->bams[c->curr_c_rec]) {
4180
1.56M
        if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
4181
0
            return -1;
4182
1.56M
    } else {
4183
452k
        c->bams[c->curr_c_rec] = bam_dup1(b);
4184
452k
        if (c->bams[c->curr_c_rec] == NULL)
4185
0
            return -1;
4186
452k
    }
4187
2.01M
    if (bam_seq_len(b)) {
4188
87.3k
        c->s_num_bases += bam_seq_len(b);
4189
1.93M
    } else {
4190
        // No sequence in BAM record.  CRAM doesn't directly support this
4191
        // case, it ends up being stored as a string of N's for each query
4192
        // consuming CIGAR operation.  As this can become very inefficient
4193
        // in time and memory, data where the query length is excessively
4194
        // long are rejected.
4195
1.93M
        hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
4196
1.93M
        if (qlen > 100000000) {
4197
40
            hts_log_error("CIGAR query length %"PRIhts_pos
4198
40
                          " for read \"%s\" is too long",
4199
40
                          qlen, bam_get_qname(b));
4200
40
            return -1;
4201
40
        }
4202
1.93M
        c->s_num_bases += qlen;
4203
1.93M
    }
4204
2.01M
    c->curr_rec++;
4205
2.01M
    c->curr_c_rec++;
4206
2.01M
    c->s_aux_bytes += bam_get_l_aux(b);
4207
2.01M
    c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
4208
2.01M
    fd->record_counter++;
4209
4210
2.01M
    return 0;
4211
2.01M
}