Coverage Report

Created: 2025-07-18 07:26

/src/htslib/cram/cram_encode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <strings.h>
40
#include <zlib.h>
41
#include <sys/types.h>
42
#include <sys/stat.h>
43
#include <math.h>
44
#include <inttypes.h>
45
#include <ctype.h>
46
47
#include "cram.h"
48
#include "os.h"
49
#include "../sam_internal.h" // for nibble2base
50
#include "../htslib/hts.h"
51
#include "../htslib/hts_endian.h"
52
#include "../textutils_internal.h"
53
54
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
55
56
#define Z_CRAM_STRAT Z_FILTERED
57
//#define Z_CRAM_STRAT Z_RLE
58
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
59
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
60
61
static int process_one_read(cram_fd *fd, cram_container *c,
62
                            cram_slice *s, cram_record *cr,
63
                            bam_seq_t *b, int rnum, kstring_t *MD,
64
                            int embed_ref, int no_ref);
65
66
/*
67
 * Returns index of val into key.
68
 * Basically strchr(key, val)-key;
69
 */
70
1.27M
static int sub_idx(char *key, char val) {
71
1.27M
    int i;
72
73
3.19M
    for (i = 0; i < 4 && *key++ != val; i++);
74
1.27M
    return i;
75
1.27M
}
76
77
/*
78
 * Encodes a compression header block into a generic cram_block structure.
79
 *
80
 * Returns cram_block ptr on success
81
 *         NULL on failure
82
 */
83
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
84
                                           cram_block_compression_hdr *h,
85
67.6k
                                           int embed_ref) {
86
67.6k
    cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
87
67.6k
    cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
88
67.6k
    int i, mc, r = 0;
89
90
67.6k
    int no_ref = c->no_ref;
91
92
67.6k
    if (!cb || !map)
93
0
        return NULL;
94
95
    /*
96
     * This is a concatenation of several blocks of data:
97
     * header + landmarks, preservation map, read encoding map, and the tag
98
     * encoding map.
99
     * All 4 are variable sized and we need to know how large these are
100
     * before creating the compression header itself as this starts with
101
     * the total size (stored as a variable length string).
102
     */
103
104
    // Duplicated from container itself, and removed in 1.1
105
67.6k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
106
0
        r |= itf8_put_blk(cb, h->ref_seq_id);
107
0
        r |= itf8_put_blk(cb, h->ref_seq_start);
108
0
        r |= itf8_put_blk(cb, h->ref_seq_span);
109
0
        r |= itf8_put_blk(cb, h->num_records);
110
0
        r |= itf8_put_blk(cb, h->num_landmarks);
111
0
        for (i = 0; i < h->num_landmarks; i++) {
112
0
            r |= itf8_put_blk(cb, h->landmark[i]);
113
0
        }
114
0
    }
115
116
67.6k
    if (h->preservation_map) {
117
0
        kh_destroy(map, h->preservation_map);
118
0
        h->preservation_map = NULL;
119
0
    }
120
121
    /* Create in-memory preservation map */
122
    /* FIXME: should create this when we create the container */
123
67.6k
    if (c->num_records > 0) {
124
63.9k
        khint_t k;
125
63.9k
        int r;
126
127
63.9k
        if (!(h->preservation_map = kh_init(map)))
128
0
            return NULL;
129
130
63.9k
        k = kh_put(map, h->preservation_map, "RN", &r);
131
63.9k
        if (-1 == r) return NULL;
132
63.9k
        kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
133
134
63.9k
        if (CRAM_MAJOR_VERS(fd->version) == 1) {
135
0
            k = kh_put(map, h->preservation_map, "PI", &r);
136
0
            if (-1 == r) return NULL;
137
0
            kh_val(h->preservation_map, k).i = 0;
138
139
0
            k = kh_put(map, h->preservation_map, "UI", &r);
140
0
            if (-1 == r) return NULL;
141
0
            kh_val(h->preservation_map, k).i = 1;
142
143
0
            k = kh_put(map, h->preservation_map, "MI", &r);
144
0
            if (-1 == r) return NULL;
145
0
            kh_val(h->preservation_map, k).i = 1;
146
147
63.9k
        } else {
148
            // Technically SM was in 1.0, but wasn't in Java impl.
149
63.9k
            k = kh_put(map, h->preservation_map, "SM", &r);
150
63.9k
            if (-1 == r) return NULL;
151
63.9k
            kh_val(h->preservation_map, k).i = 0;
152
153
63.9k
            k = kh_put(map, h->preservation_map, "TD", &r);
154
63.9k
            if (-1 == r) return NULL;
155
63.9k
            kh_val(h->preservation_map, k).i = 0;
156
157
63.9k
            k = kh_put(map, h->preservation_map, "AP", &r);
158
63.9k
            if (-1 == r) return NULL;
159
63.9k
            kh_val(h->preservation_map, k).i = h->AP_delta;
160
161
63.9k
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
162
0
                k = kh_put(map, h->preservation_map, "QO", &r);
163
0
                if (-1 == r) return NULL;
164
0
                kh_val(h->preservation_map, k).i = h->qs_seq_orient;
165
0
            }
166
167
63.9k
            if (no_ref || embed_ref>0) {
168
                // Reference Required == No
169
62.7k
                k = kh_put(map, h->preservation_map, "RR", &r);
170
62.7k
                if (-1 == r) return NULL;
171
62.7k
                kh_val(h->preservation_map, k).i = 0;
172
62.7k
            }
173
63.9k
        }
174
63.9k
    }
175
176
    /* Encode preservation map; could collapse this and above into one */
177
67.6k
    mc = 0;
178
67.6k
    BLOCK_SIZE(map) = 0;
179
67.6k
    if (h->preservation_map) {
180
63.9k
        khint_t k;
181
182
63.9k
        for (k = kh_begin(h->preservation_map);
183
575k
             k != kh_end(h->preservation_map);
184
511k
             k++) {
185
511k
            const char *key;
186
511k
            khash_t(map) *pmap = h->preservation_map;
187
188
189
511k
            if (!kh_exist(pmap, k))
190
193k
                continue;
191
192
318k
            key = kh_key(pmap, k);
193
318k
            BLOCK_APPEND(map, key, 2);
194
195
318k
            switch(CRAM_KEY(key[0], key[1])) {
196
0
            case CRAM_KEY('M','I'):
197
0
            case CRAM_KEY('U','I'):
198
0
            case CRAM_KEY('P','I'):
199
63.9k
            case CRAM_KEY('A','P'):
200
127k
            case CRAM_KEY('R','N'):
201
190k
            case CRAM_KEY('R','R'):
202
190k
            case CRAM_KEY('Q','O'):
203
190k
                BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
204
190k
                break;
205
206
190k
            case CRAM_KEY('S','M'): {
207
63.9k
                char smat[5], *mp = smat;
208
                // Output format is for order ACGTN (minus ref base)
209
                // to store the code value 0-3 for each symbol.
210
                //
211
                // Note this is different to storing the symbols in order
212
                // that the codes occur from 0-3, which is what we used to
213
                // do.  (It didn't matter as we always had a fixed table in
214
                // the order.)
215
63.9k
                *mp++ =
216
63.9k
                    (sub_idx(h->substitution_matrix[0], 'C') << 6) |
217
63.9k
                    (sub_idx(h->substitution_matrix[0], 'G') << 4) |
218
63.9k
                    (sub_idx(h->substitution_matrix[0], 'T') << 2) |
219
63.9k
                    (sub_idx(h->substitution_matrix[0], 'N') << 0);
220
63.9k
                *mp++ =
221
63.9k
                    (sub_idx(h->substitution_matrix[1], 'A') << 6) |
222
63.9k
                    (sub_idx(h->substitution_matrix[1], 'G') << 4) |
223
63.9k
                    (sub_idx(h->substitution_matrix[1], 'T') << 2) |
224
63.9k
                    (sub_idx(h->substitution_matrix[1], 'N') << 0);
225
63.9k
                *mp++ =
226
63.9k
                    (sub_idx(h->substitution_matrix[2], 'A') << 6) |
227
63.9k
                    (sub_idx(h->substitution_matrix[2], 'C') << 4) |
228
63.9k
                    (sub_idx(h->substitution_matrix[2], 'T') << 2) |
229
63.9k
                    (sub_idx(h->substitution_matrix[2], 'N') << 0);
230
63.9k
                *mp++ =
231
63.9k
                    (sub_idx(h->substitution_matrix[3], 'A') << 6) |
232
63.9k
                    (sub_idx(h->substitution_matrix[3], 'C') << 4) |
233
63.9k
                    (sub_idx(h->substitution_matrix[3], 'G') << 2) |
234
63.9k
                    (sub_idx(h->substitution_matrix[3], 'N') << 0);
235
63.9k
                *mp++ =
236
63.9k
                    (sub_idx(h->substitution_matrix[4], 'A') << 6) |
237
63.9k
                    (sub_idx(h->substitution_matrix[4], 'C') << 4) |
238
63.9k
                    (sub_idx(h->substitution_matrix[4], 'G') << 2) |
239
63.9k
                    (sub_idx(h->substitution_matrix[4], 'T') << 0);
240
63.9k
                BLOCK_APPEND(map, smat, 5);
241
63.9k
                break;
242
63.9k
            }
243
244
63.9k
            case CRAM_KEY('T','D'): {
245
63.9k
                r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
246
63.9k
                BLOCK_APPEND(map,
247
63.9k
                             BLOCK_DATA(h->TD_blk),
248
63.9k
                             BLOCK_SIZE(h->TD_blk));
249
63.9k
                break;
250
63.9k
            }
251
252
63.9k
            default:
253
0
                hts_log_warning("Unknown preservation key '%.2s'", key);
254
0
                break;
255
318k
            }
256
257
318k
            mc++;
258
318k
        }
259
63.9k
    }
260
67.6k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
261
67.6k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
262
67.6k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
263
264
    /* rec encoding map */
265
67.6k
    mc = 0;
266
67.6k
    BLOCK_SIZE(map) = 0;
267
67.6k
    if (h->codecs[DS_BF]) {
268
63.9k
        if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
269
63.9k
                                          fd->version))
270
0
            return NULL;
271
63.9k
        mc++;
272
63.9k
    }
273
67.6k
    if (h->codecs[DS_CF]) {
274
63.9k
        if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
275
63.9k
                                          fd->version))
276
0
            return NULL;
277
63.9k
        mc++;
278
63.9k
    }
279
67.6k
    if (h->codecs[DS_RL]) {
280
63.9k
        if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
281
63.9k
                                          fd->version))
282
0
            return NULL;
283
63.9k
        mc++;
284
63.9k
    }
285
67.6k
    if (h->codecs[DS_AP]) {
286
63.9k
        if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
287
63.9k
                                          fd->version))
288
0
            return NULL;
289
63.9k
        mc++;
290
63.9k
    }
291
67.6k
    if (h->codecs[DS_RG]) {
292
63.9k
        if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
293
63.9k
                                          fd->version))
294
0
            return NULL;
295
63.9k
        mc++;
296
63.9k
    }
297
67.6k
    if (h->codecs[DS_MF]) {
298
63.9k
        if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
299
63.9k
                                          fd->version))
300
0
            return NULL;
301
63.9k
        mc++;
302
63.9k
    }
303
67.6k
    if (h->codecs[DS_NS]) {
304
63.9k
        if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
305
63.9k
                                          fd->version))
306
0
            return NULL;
307
63.9k
        mc++;
308
63.9k
    }
309
67.6k
    if (h->codecs[DS_NP]) {
310
63.9k
        if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
311
63.9k
                                          fd->version))
312
0
            return NULL;
313
63.9k
        mc++;
314
63.9k
    }
315
67.6k
    if (h->codecs[DS_TS]) {
316
63.9k
        if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
317
63.9k
                                          fd->version))
318
0
            return NULL;
319
63.9k
        mc++;
320
63.9k
    }
321
67.6k
    if (h->codecs[DS_NF]) {
322
0
        if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
323
0
                                          fd->version))
324
0
            return NULL;
325
0
        mc++;
326
0
    }
327
67.6k
    if (h->codecs[DS_TC]) {
328
0
        if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
329
0
                                          fd->version))
330
0
            return NULL;
331
0
        mc++;
332
0
    }
333
67.6k
    if (h->codecs[DS_TN]) {
334
0
        if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
335
0
                                          fd->version))
336
0
            return NULL;
337
0
        mc++;
338
0
    }
339
67.6k
    if (h->codecs[DS_TL]) {
340
63.9k
        if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
341
63.9k
                                          fd->version))
342
0
            return NULL;
343
63.9k
        mc++;
344
63.9k
    }
345
67.6k
    if (h->codecs[DS_FN]) {
346
29.5k
        if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
347
29.5k
                                          fd->version))
348
0
            return NULL;
349
29.5k
        mc++;
350
29.5k
    }
351
67.6k
    if (h->codecs[DS_FC]) {
352
29.5k
        if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
353
29.5k
                                          fd->version))
354
0
            return NULL;
355
29.5k
        mc++;
356
29.5k
    }
357
67.6k
    if (h->codecs[DS_FP]) {
358
29.5k
        if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
359
29.5k
                                          fd->version))
360
0
            return NULL;
361
29.5k
        mc++;
362
29.5k
    }
363
67.6k
    if (h->codecs[DS_BS]) {
364
194
        if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
365
194
                                          fd->version))
366
0
            return NULL;
367
194
        mc++;
368
194
    }
369
67.6k
    if (h->codecs[DS_IN]) {
370
63.9k
        if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
371
63.9k
                                          fd->version))
372
0
            return NULL;
373
63.9k
        mc++;
374
63.9k
    }
375
67.6k
    if (h->codecs[DS_DL]) {
376
8.59k
        if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
377
8.59k
                                          fd->version))
378
0
            return NULL;
379
8.59k
        mc++;
380
8.59k
    }
381
67.6k
    if (h->codecs[DS_BA]) {
382
2.21k
        if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
383
2.21k
                                          fd->version))
384
0
            return NULL;
385
2.21k
        mc++;
386
2.21k
    }
387
67.6k
    if (h->codecs[DS_BB]) {
388
63.9k
        if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
389
63.9k
                                          fd->version))
390
0
            return NULL;
391
63.9k
        mc++;
392
63.9k
    }
393
67.6k
    if (h->codecs[DS_MQ]) {
394
63.9k
        if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
395
63.9k
                                          fd->version))
396
0
            return NULL;
397
63.9k
        mc++;
398
63.9k
    }
399
67.6k
    if (h->codecs[DS_RN]) {
400
63.9k
        if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
401
63.9k
                                          fd->version))
402
0
            return NULL;
403
63.9k
        mc++;
404
63.9k
    }
405
67.6k
    if (h->codecs[DS_QS]) {
406
63.9k
        if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
407
63.9k
                                          fd->version))
408
0
            return NULL;
409
63.9k
        mc++;
410
63.9k
    }
411
67.6k
    if (h->codecs[DS_QQ]) {
412
0
        if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
413
0
                                          fd->version))
414
0
            return NULL;
415
0
        mc++;
416
0
    }
417
67.6k
    if (h->codecs[DS_RI]) {
418
63.9k
        if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
419
63.9k
                                          fd->version))
420
0
            return NULL;
421
63.9k
        mc++;
422
63.9k
    }
423
67.6k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
424
67.6k
        if (h->codecs[DS_SC]) {
425
63.9k
            if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
426
63.9k
                                              fd->version))
427
0
                return NULL;
428
63.9k
            mc++;
429
63.9k
        }
430
67.6k
        if (h->codecs[DS_RS]) {
431
827
            if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
432
827
                                              fd->version))
433
0
                return NULL;
434
827
            mc++;
435
827
        }
436
67.6k
        if (h->codecs[DS_PD]) {
437
20
            if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
438
20
                                              fd->version))
439
0
                return NULL;
440
20
            mc++;
441
20
        }
442
67.6k
        if (h->codecs[DS_HC]) {
443
2.64k
            if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
444
2.64k
                                              fd->version))
445
0
                return NULL;
446
2.64k
            mc++;
447
2.64k
        }
448
67.6k
    }
449
67.6k
    if (h->codecs[DS_TM]) {
450
0
        if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
451
0
                                          fd->version))
452
0
            return NULL;
453
0
        mc++;
454
0
    }
455
67.6k
    if (h->codecs[DS_TV]) {
456
0
        if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
457
0
                                          fd->version))
458
0
            return NULL;
459
0
        mc++;
460
0
    }
461
67.6k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
462
67.6k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
463
67.6k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
464
465
    /* tag encoding map */
466
67.6k
    mc = 0;
467
67.6k
    BLOCK_SIZE(map) = 0;
468
67.6k
    if (c->tags_used) {
469
63.9k
        khint_t k;
470
471
449k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
472
385k
            int key;
473
385k
            if (!kh_exist(c->tags_used, k))
474
214k
                continue;
475
476
171k
            key = kh_key(c->tags_used, k);
477
171k
            cram_codec *cd = kh_val(c->tags_used, k)->codec;
478
479
171k
            r |= (fd->vv.varint_put32_blk(map, key) <= 0);
480
171k
            if (-1 == cd->store(cd, map, NULL, fd->version))
481
0
                return NULL;
482
483
171k
            mc++;
484
171k
        }
485
63.9k
    }
486
487
67.6k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
488
67.6k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
489
67.6k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
490
491
67.6k
    hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
492
493
67.6k
    BLOCK_UPLEN(cb);
494
495
67.6k
    cram_free_block(map);
496
497
67.6k
    if (r >= 0)
498
67.6k
        return cb;
499
500
0
 block_err:
501
0
    return NULL;
502
67.6k
}
503
504
505
/*
506
 * Encodes a slice compression header.
507
 *
508
 * Returns cram_block on success
509
 *         NULL on failure
510
 */
511
64.0k
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
512
64.0k
    char *buf;
513
64.0k
    char *cp;
514
64.0k
    cram_block *b = cram_new_block(MAPPED_SLICE, 0);
515
64.0k
    int j;
516
517
64.0k
    if (!b)
518
0
        return NULL;
519
520
64.0k
    cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
521
64.0k
    if (NULL == buf) {
522
0
        cram_free_block(b);
523
0
        return NULL;
524
0
    }
525
526
64.0k
    cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
527
64.0k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
528
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
529
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
530
64.0k
    } else {
531
64.0k
        if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
532
15
            hts_log_error("Reference position too large for CRAM 3");
533
15
            cram_free_block(b);
534
15
            free(buf);
535
15
            return NULL;
536
15
        }
537
63.9k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
538
63.9k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
539
63.9k
    }
540
63.9k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
541
63.9k
    if (CRAM_MAJOR_VERS(fd->version) == 2)
542
0
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
543
63.9k
    else if (CRAM_MAJOR_VERS(fd->version) >= 3)
544
63.9k
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
545
63.9k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
546
63.9k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
547
369k
    for (j = 0; j < s->hdr->num_content_ids; j++) {
548
305k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
549
305k
    }
550
63.9k
    if (s->hdr->content_type == MAPPED_SLICE)
551
63.9k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
552
553
63.9k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
554
63.9k
        memcpy(cp, s->hdr->md5, 16); cp += 16;
555
63.9k
    }
556
557
63.9k
    assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
558
559
63.9k
    b->data = (unsigned char *)buf;
560
63.9k
    b->comp_size = b->uncomp_size = cp-buf;
561
562
63.9k
    return b;
563
63.9k
}
564
565
566
/*
567
 * Encodes a single read.
568
 *
569
 * Returns 0 on success
570
 *        -1 on failure
571
 */
572
static int cram_encode_slice_read(cram_fd *fd,
573
                                  cram_container *c,
574
                                  cram_block_compression_hdr *h,
575
                                  cram_slice *s,
576
                                  cram_record *cr,
577
3.43M
                                  int64_t *last_pos) {
578
3.43M
    int r = 0;
579
3.43M
    int32_t i32;
580
3.43M
    int64_t i64;
581
3.43M
    unsigned char uc;
582
583
    //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
584
585
    //printf("BF=0x%x\n", cr->flags);
586
    //      bf = cram_flag_swap[cr->flags];
587
3.43M
    i32 = fd->cram_flag_swap[cr->flags & 0xfff];
588
3.43M
    r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
589
590
3.43M
    i32 = cr->cram_flags & CRAM_FLAG_MASK;
591
3.43M
    r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
592
593
3.43M
    if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
594
12.8k
        r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
595
596
3.43M
    r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
597
598
3.43M
    if (c->pos_sorted) {
599
3.41M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
600
0
            i64 = cr->apos - *last_pos;
601
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
602
3.41M
        } else {
603
3.41M
            i32 = cr->apos - *last_pos;
604
3.41M
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
605
3.41M
        }
606
3.41M
        *last_pos = cr->apos;
607
3.41M
    } else {
608
17.6k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
609
0
            i64 = cr->apos;
610
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
611
17.6k
        } else {
612
17.6k
            i32 = cr->apos;
613
17.6k
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
614
17.6k
        }
615
17.6k
    }
616
617
3.43M
    r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
618
619
3.43M
    if (cr->cram_flags & CRAM_FLAG_DETACHED) {
620
3.43M
        i32 = cr->mate_flags;
621
3.43M
        r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
622
623
3.43M
        r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
624
3.43M
                                      (char *)&cr->mate_ref_id, 1);
625
626
3.43M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627
0
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
628
0
                                          (char *)&cr->mate_pos, 1);
629
0
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
630
0
                                          (char *)&cr->tlen, 1);
631
3.43M
        } else {
632
3.43M
            i32 = cr->mate_pos;
633
3.43M
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
634
3.43M
                                          (char *)&i32, 1);
635
3.43M
            i32 = cr->tlen;
636
3.43M
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
637
3.43M
                                          (char *)&i32, 1);
638
3.43M
        }
639
3.43M
    } else {
640
0
        if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
641
0
            r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
642
0
                                          (char *)&cr->mate_line, 1);
643
0
        }
644
0
        if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
645
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
646
0
                r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
647
0
                                              (char *)&cr->tlen, 1);
648
0
            }
649
0
        }
650
0
    }
651
652
    /* Aux tags */
653
3.43M
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
654
0
        int j;
655
0
        uc = cr->ntags;
656
0
        r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
657
658
0
        for (j = 0; j < cr->ntags; j++) {
659
0
            uint32_t i32 = s->TN[cr->TN_idx + j]; // id
660
0
            r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
661
0
        }
662
3.43M
    } else {
663
3.43M
        r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
664
3.43M
    }
665
666
    // qual
667
    // QS codec : Already stored in block[2].
668
669
    // features (diffs)
670
3.43M
    if (!(cr->flags & BAM_FUNMAP)) {
671
84.1k
        int prev_pos = 0, j;
672
673
84.1k
        r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
674
84.1k
                                      (char *)&cr->nfeature, 1);
675
234k
        for (j = 0; j < cr->nfeature; j++) {
676
150k
            cram_feature *f = &s->features[cr->feature + j];
677
678
150k
            uc = f->X.code;
679
150k
            r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
680
150k
            i32 = f->X.pos - prev_pos;
681
150k
            r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
682
150k
            prev_pos = f->X.pos;
683
684
150k
            switch(f->X.code) {
685
                //char *seq;
686
687
422
            case 'X':
688
                //fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
689
690
422
                uc = f->X.base;
691
422
                r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
692
422
                                              (char *)&uc, 1);
693
422
                break;
694
59.4k
            case 'S':
695
                // Already done
696
                //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
697
                //                              BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
698
                //                              f->S.len);
699
700
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
701
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
702
                //                                  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
703
                //                                  f->S.len);
704
                //}
705
59.4k
                break;
706
737
            case 'I':
707
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
708
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
709
                //                           seq, f->S.len);
710
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
711
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
712
                //                                  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
713
                //                                  f->I.len);
714
                //}
715
737
                break;
716
11.8k
            case 'i':
717
11.8k
                uc = f->i.base;
718
11.8k
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
719
11.8k
                                              (char *)&uc, 1);
720
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
721
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
722
                //                           seq, 1);
723
11.8k
                break;
724
61.4k
            case 'D':
725
61.4k
                i32 = f->D.len;
726
61.4k
                r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
727
61.4k
                                              (char *)&i32, 1);
728
61.4k
                break;
729
730
1.75k
            case 'B':
731
                //                  // Used when we try to store a non ACGTN base or an N
732
                //                  // that aligns against a non ACGTN reference
733
734
1.75k
                uc  = f->B.base;
735
1.75k
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
736
1.75k
                                              (char *)&uc, 1);
737
738
                //                  Already added
739
                //                  uc  = f->B.qual;
740
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
741
                //                                           (char *)&uc, 1);
742
1.75k
                break;
743
744
183
            case 'b':
745
                // string of bases
746
183
                r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
747
183
                                              (char *)BLOCK_DATA(s->seqs_blk)
748
183
                                                      + f->b.seq_idx,
749
183
                                              f->b.len);
750
183
                break;
751
752
29
            case 'Q':
753
                //                  Already added
754
                //                  uc  = f->B.qual;
755
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
756
                //                                           (char *)&uc, 1);
757
29
                break;
758
759
838
            case 'N':
760
838
                i32 = f->N.len;
761
838
                r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
762
838
                                              (char *)&i32, 1);
763
838
                break;
764
765
77
            case 'P':
766
77
                i32 = f->P.len;
767
77
                r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
768
77
                                              (char *)&i32, 1);
769
77
                break;
770
771
13.4k
            case 'H':
772
13.4k
                i32 = f->H.len;
773
13.4k
                r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
774
13.4k
                                              (char *)&i32, 1);
775
13.4k
                break;
776
777
778
0
            default:
779
0
                hts_log_error("Unhandled feature code %c", f->X.code);
780
0
                return -1;
781
150k
            }
782
150k
        }
783
784
84.1k
        r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
785
84.1k
                                      (char *)&cr->mqual, 1);
786
3.34M
    } else {
787
3.34M
        char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
788
3.34M
        if (cr->len)
789
302k
            r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
790
3.34M
    }
791
792
3.43M
    return r ? -1 : 0;
793
3.43M
}
794
795
796
/*
797
 * Applies various compression methods to specific blocks, depending on
798
 * known observations of how data series compress.
799
 *
800
 * Returns 0 on success
801
 *        -1 on failure
802
 */
803
64.0k
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
804
64.0k
    int level = fd->level, i;
805
64.0k
    int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
806
64.0k
    int v31_or_above = (fd->version >= (3<<8)+1);
807
808
    /* Compress the CORE Block too, with minimal zlib level */
809
64.0k
    if (level > 5 && s->block[0]->uncomp_size > 500)
810
0
        cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
811
812
64.0k
    if (fd->use_bz2)
813
0
        method |= 1<<BZIP2;
814
815
64.0k
    int method_rans   = (1<<RANS0) | (1<<RANS1);
816
64.0k
    int method_ranspr = method_rans;
817
818
64.0k
    if (fd->use_rans) {
819
64.0k
        method_ranspr = (1<<RANS_PR0)   | (1<<RANS_PR1);
820
64.0k
        if (level > 1)
821
64.0k
            method_ranspr |=
822
64.0k
                  (1<<RANS_PR64)  | (1<<RANS_PR9)
823
64.0k
                | (1<<RANS_PR128) | (1<<RANS_PR193);
824
64.0k
        if (level > 5)
825
0
            method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
826
64.0k
    }
827
828
64.0k
    if (fd->use_rans) {
829
64.0k
        methodF |= v31_or_above ? method_ranspr : method_rans;
830
64.0k
        method  |= v31_or_above ? method_ranspr : method_rans;
831
64.0k
    }
832
833
64.0k
    int method_arith   = 0;
834
64.0k
    if (fd->use_arith) {
835
0
        method_arith = (1<<ARITH_PR0)   | (1<<ARITH_PR1);
836
0
        if (level > 1)
837
0
            method_arith |=
838
0
                  (1<<ARITH_PR64)  | (1<<ARITH_PR9)
839
0
                | (1<<ARITH_PR128) | (1<<ARITH_PR129)
840
0
                | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
841
0
    }
842
64.0k
    if (fd->use_arith && v31_or_above) {
843
0
        methodF |= method_arith;
844
0
        method  |= method_arith;
845
0
    }
846
847
64.0k
    if (fd->use_lzma)
848
0
        method |= (1<<LZMA);
849
850
    /* Faster method for data series we only need entropy encoding on */
851
64.0k
    methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
852
64.0k
    if (level >= 5) {
853
64.0k
        method |= 1<<GZIP_1;
854
64.0k
        methodF = method;
855
64.0k
    }
856
64.0k
    if (level == 1) {
857
0
        method &= ~(1<<GZIP);
858
0
        method |=   1<<GZIP_1;
859
0
        methodF = method;
860
0
    }
861
862
64.0k
    int qmethod  = method;
863
64.0k
    int qmethodF = method;
864
64.0k
    if (v31_or_above && fd->use_fqz) {
865
0
        qmethod  |= 1<<FQZ;
866
0
        qmethodF |= 1<<FQZ;
867
0
        if (fd->level > 4) {
868
0
            qmethod  |= 1<<FQZ_b;
869
0
            qmethodF |= 1<<FQZ_b;
870
0
        }
871
0
        if (fd->level > 6) {
872
0
            qmethod  |= (1<<FQZ_c) | (1<<FQZ_d);
873
0
            qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
874
0
        }
875
0
    }
876
877
64.0k
    pthread_mutex_lock(&fd->metrics_lock);
878
3.07M
    for (i = 0; i < DS_END; i++)
879
3.00M
        if (c->stats[i] && c->stats[i]->nvals > 16)
880
214
            fd->m[i]->unpackable = 1;
881
64.0k
    pthread_mutex_unlock(&fd->metrics_lock);
882
883
    /* Specific compression methods for certain block types */
884
64.0k
    if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
885
64.0k
                             method, level))
886
0
        return -1;
887
888
64.0k
    if (fd->level == 0) {
889
        /* Do nothing */
890
64.0k
    } else if (fd->level == 1) {
891
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
892
0
                                 qmethodF, 1))
893
0
            return -1;
894
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
895
0
            if (s->block[i])
896
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
897
0
                                         method, 1))
898
0
                    return -1;
899
0
        }
900
64.0k
    } else if (fd->level < 3) {
901
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
902
0
                                 qmethod, 1))
903
0
            return -1;
904
0
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
905
0
                                 method, 1))
906
0
            return -1;
907
0
        if (s->block[DS_BB])
908
0
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
909
0
                                     method, 1))
910
0
                return -1;
911
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
912
0
            if (s->block[i])
913
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
914
0
                                         method, level))
915
0
                    return -1;
916
0
        }
917
64.0k
    } else {
918
64.0k
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
919
64.0k
                                 qmethod, level))
920
0
            return -1;
921
64.0k
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
922
64.0k
                                 method, level))
923
0
            return -1;
924
64.0k
        if (s->block[DS_BB])
925
64.0k
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
926
64.0k
                                     method, level))
927
0
                return -1;
928
640k
        for (i = DS_aux; i <= DS_aux_oz; i++) {
929
576k
            if (s->block[i])
930
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
931
0
                                         method, level))
932
0
                    return -1;
933
576k
        }
934
64.0k
    }
935
936
    // NAME: best is generally xz, bzip2, zlib then rans1
937
64.0k
    int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
938
64.0k
    if (fd->version >= (3<<8)+1 && fd->use_tok)
939
64.0k
        method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
940
64.0k
    if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
941
64.0k
                             method_rn, level))
942
0
        return -1;
943
944
    // NS shows strong local correlation as rearrangements are localised
945
64.0k
    if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
946
143
        if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
947
143
                                 method, level))
948
0
            return -1;
949
950
951
    /*
952
     * Compress any auxiliary tags with their own per-tag metrics
953
     */
954
64.0k
    {
955
64.0k
        int i;
956
235k
        for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
957
171k
            if (!s->block[i] || s->block[i] == s->block[0])
958
0
                continue;
959
960
171k
            if (s->block[i]->method != RAW)
961
0
                continue;
962
963
171k
            if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
964
171k
                                     method, level))
965
0
                return -1;
966
171k
        }
967
64.0k
    }
968
969
    /*
970
     * Minimal compression of any block still uncompressed, bar CORE
971
     */
972
64.0k
    {
973
64.0k
        int i;
974
3.00M
        for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
975
2.94M
            if (!s->block[i] || s->block[i] == s->block[0])
976
2.49M
                continue;
977
978
449k
            if (s->block[i]->method != RAW)
979
8.01k
                continue;
980
981
441k
            if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
982
441k
                                    methodF, level))
983
0
                return -1;
984
441k
        }
985
64.0k
    }
986
987
64.0k
    return 0;
988
64.0k
}
989
990
/*
991
 * Allocates a block associated with the cram codec associated with
992
 * data series ds_id or the internal codec_id (depending on codec
993
 * type).
994
 *
995
 * The ds_ids are what end up written to disk as an external block.
996
 * The c_ids are internal and used when daisy-chaining transforms
997
 * such as MAP and RLE.  These blocks are also allocated, but
998
 * are ephemeral in nature.  (The codecs themselves cannot allocate
999
 * these as the same codec pointer may be operating on multiple slices
1000
 * if we're using a multi-slice container.)
1001
 *
1002
 * Returns 0 on success
1003
 *        -1 on failure
1004
 */
1005
1.85M
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
1006
1.85M
    if (!codec)
1007
600k
        return 0;
1008
1009
1.25M
    switch(codec->codec) {
1010
    // Codecs which are hard-coded to use the CORE block
1011
0
    case E_GOLOMB:
1012
835k
    case E_HUFFMAN:
1013
836k
    case E_BETA:
1014
836k
    case E_SUBEXP:
1015
836k
    case E_GOLOMB_RICE:
1016
836k
    case E_GAMMA:
1017
836k
        codec->out = s->block[0];
1018
836k
        break;
1019
1020
    // Codecs which don't use external blocks
1021
0
    case E_CONST_BYTE:
1022
0
    case E_CONST_INT:
1023
0
       codec->out = NULL;
1024
0
       break;
1025
1026
    // Codecs that emit directly to external blocks
1027
226k
    case E_EXTERNAL:
1028
226k
    case E_VARINT_UNSIGNED:
1029
226k
    case E_VARINT_SIGNED:
1030
226k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1031
0
            return -1;
1032
226k
        codec->u.external.content_id = ds_id;
1033
226k
        codec->out = s->block[ds_id];
1034
226k
        break;
1035
1036
128k
    case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1037
128k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1038
0
            return -1;
1039
128k
        codec->u.byte_array_stop.content_id = ds_id;
1040
128k
        codec->out = s->block[ds_id];
1041
128k
        break;
1042
1043
1044
    // Codecs that contain sub-codecs which may in turn emit to external blocks
1045
64.0k
    case E_BYTE_ARRAY_LEN: {
1046
64.0k
        cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1047
64.0k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1048
0
            return -1;
1049
64.0k
        bal = codec->u.e_byte_array_len.val_codec;
1050
64.0k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1051
0
            return -1;
1052
1053
64.0k
        break;
1054
64.0k
    }
1055
1056
64.0k
    case E_XRLE:
1057
0
        if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1058
                                //ds_id == DS_QS ? DS_QS_len : ds_id))
1059
0
            return -1;
1060
0
        if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1061
0
            return -1;
1062
1063
0
        break;
1064
1065
0
    case E_XPACK:
1066
0
        if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1067
0
            return -1;
1068
0
        codec->out = cram_new_block(0, 0); // ephemeral
1069
0
        if (!codec->out)
1070
0
            return -1;
1071
1072
0
        break;
1073
1074
0
    case E_XDELTA:
1075
0
        if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1076
0
            return -1;
1077
0
        codec->out = cram_new_block(0, 0); // ephemeral
1078
0
        if (!codec->out)
1079
0
            return -1;
1080
1081
0
        break;
1082
1083
0
    default:
1084
0
        break;
1085
1.25M
    }
1086
1087
1.25M
    return 0;
1088
1.25M
}
1089
1090
/*
1091
 * Encodes a single slice from a container
1092
 *
1093
 * Returns 0 on success
1094
 *        -1 on failure
1095
 */
1096
static int cram_encode_slice(cram_fd *fd, cram_container *c,
1097
                             cram_block_compression_hdr *h, cram_slice *s,
1098
64.0k
                             int embed_ref) {
1099
64.0k
    int rec, r = 0;
1100
64.0k
    int64_t last_pos;
1101
64.0k
    enum cram_DS_ID id;
1102
1103
    /*
1104
     * Slice external blocks:
1105
     * ID 0 => base calls (insertions, soft-clip)
1106
     * ID 1 => qualities
1107
     * ID 2 => names
1108
     * ID 3 => TS (insert size), NP (next frag)
1109
     * ID 4 => tag values
1110
     * ID 6 => tag IDs (TN), if CRAM_V1.0
1111
     * ID 7 => TD tag dictionary, if !CRAM_V1.0
1112
     */
1113
1114
    /* Create cram slice header */
1115
64.0k
    s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
1116
64.0k
        ? DS_ref
1117
64.0k
        : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1118
64.0k
    s->hdr->record_counter = c->num_records + c->record_counter;
1119
64.0k
    c->num_records += s->hdr->num_records;
1120
1121
64.0k
    int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1122
64.0k
    s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1123
64.0k
    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1124
64.0k
    if (!s->block || !s->hdr->block_content_ids)
1125
0
        return -1;
1126
1127
    // Create first fixed blocks, always external.
1128
    // CORE
1129
64.0k
    if (!(s->block[0] = cram_new_block(CORE, 0)))
1130
0
        return -1;
1131
1132
    // TN block for CRAM v1
1133
64.0k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
1134
0
        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1135
0
            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1136
0
            h->codecs[DS_TN]->u.external.content_id = DS_TN;
1137
0
        } else {
1138
0
            s->block[DS_TN] = s->block[0];
1139
0
        }
1140
0
    }
1141
1142
    // Embedded reference
1143
64.0k
    if (embed_ref>0) {
1144
30.5k
        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1145
0
            return -1;
1146
30.5k
        s->ref_id = DS_ref; // needed?
1147
30.5k
        BLOCK_APPEND(s->block[DS_ref],
1148
30.5k
                     c->ref + s->hdr->ref_seq_start - c->ref_start,
1149
30.5k
                     s->hdr->ref_seq_span);
1150
30.5k
    }
1151
1152
    /*
1153
     * All the data-series blocks if appropriate.
1154
     */
1155
1.79M
    for (id = DS_QS; id < DS_TN; id++) {
1156
1.72M
        if (cram_allocate_block(h->codecs[id], s, id) < 0)
1157
0
            return -1;
1158
1.72M
    }
1159
1160
    /*
1161
     * Add in the external tag blocks too.
1162
     */
1163
64.0k
    if (c->tags_used) {
1164
64.0k
        int n;
1165
64.0k
        s->hdr->num_blocks = DS_END;
1166
235k
        for (n = 0; n < s->naux_block; n++) {
1167
171k
            s->block[s->hdr->num_blocks++] = s->aux_block[n];
1168
171k
            s->aux_block[n] = NULL;
1169
171k
        }
1170
64.0k
    }
1171
1172
    /* Encode reads */
1173
64.0k
    last_pos = s->hdr->ref_seq_start;
1174
3.49M
    for (rec = 0; rec < s->hdr->num_records; rec++) {
1175
3.43M
        cram_record *cr = &s->crecs[rec];
1176
3.43M
        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1177
0
            return -1;
1178
3.43M
    }
1179
1180
64.0k
    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1181
64.0k
    s->block[0]->comp_size = s->block[0]->uncomp_size;
1182
1183
    // Make sure the fixed blocks point to the correct sources
1184
64.0k
    if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1185
64.0k
    s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1186
64.0k
    if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1187
64.0k
    s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1188
64.0k
    if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1189
64.0k
    s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1190
64.0k
    if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1191
64.0k
    s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1192
1193
    // Finalise any data transforms.
1194
1.79M
    for (id = DS_QS; id < DS_TN; id++) {
1195
1.72M
       if (h->codecs[id] && h->codecs[id]->flush)
1196
0
           h->codecs[id]->flush(h->codecs[id]);
1197
1.72M
    }
1198
1199
    // Ensure block sizes are up to date.
1200
3.17M
    for (id = 1; id < s->hdr->num_blocks; id++) {
1201
3.11M
        if (!s->block[id] || s->block[id] == s->block[0])
1202
2.49M
            continue;
1203
1204
620k
        if (s->block[id]->uncomp_size == 0)
1205
620k
            BLOCK_UPLEN(s->block[id]);
1206
620k
    }
1207
1208
    // Compress it all
1209
64.0k
    if (cram_compress_slice(fd, c, s) == -1)
1210
0
        return -1;
1211
1212
    // Collapse empty blocks and create hdr_block
1213
64.0k
    {
1214
64.0k
        int i, j;
1215
1216
64.0k
        s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1217
64.0k
                                            s->hdr->num_blocks * sizeof(int32_t));
1218
64.0k
        if (!s->hdr->block_content_ids)
1219
0
            return -1;
1220
1221
3.17M
        for (i = j = 1; i < s->hdr->num_blocks; i++) {
1222
3.11M
            if (!s->block[i] || s->block[i] == s->block[0])
1223
2.49M
                continue;
1224
620k
            if (s->block[i]->uncomp_size == 0) {
1225
314k
                cram_free_block(s->block[i]);
1226
314k
                s->block[i] = NULL;
1227
314k
                continue;
1228
314k
            }
1229
305k
            s->block[j] = s->block[i];
1230
305k
            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1231
305k
            j++;
1232
305k
        }
1233
64.0k
        s->hdr->num_content_ids = j-1;
1234
64.0k
        s->hdr->num_blocks = j;
1235
1236
64.0k
        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1237
15
            return -1;
1238
64.0k
    }
1239
1240
63.9k
    return r ? -1 : 0;
1241
1242
0
 block_err:
1243
0
    return -1;
1244
64.0k
}
1245
1246
3.43M
static inline const char *bam_data_end(bam1_t *b) {
1247
3.43M
    return (const char *)b->data + b->l_data;
1248
3.43M
}
1249
1250
/*
1251
 * A bounds checking version of bam_aux2i.
1252
 */
1253
0
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
1254
0
    int type = *aux++;
1255
0
    switch (type) {
1256
0
        case 'c':
1257
0
            if (aux_end - aux < 1) {
1258
0
                errno = EINVAL;
1259
0
                return 0;
1260
0
            }
1261
0
            return *(int8_t *)aux;
1262
0
        case 'C':
1263
0
            if (aux_end - aux < 1) {
1264
0
                errno = EINVAL;
1265
0
                return 0;
1266
0
            }
1267
0
            return *aux;
1268
0
        case 's':
1269
0
            if (aux_end - aux < 2) {
1270
0
                errno = EINVAL;
1271
0
                return 0;
1272
0
            }
1273
0
            return le_to_i16(aux);
1274
0
        case 'S':
1275
0
            if (aux_end - aux < 2) {
1276
0
                errno = EINVAL;
1277
0
                return 0;
1278
0
            }
1279
0
            return le_to_u16(aux);
1280
0
        case 'i':
1281
0
            if (aux_end - aux < 4) {
1282
0
                errno = EINVAL;
1283
0
                return 0;
1284
0
            }
1285
0
            return le_to_i32(aux);
1286
0
        case 'I':
1287
0
            if (aux_end - aux < 4) {
1288
0
                errno = EINVAL;
1289
0
                return 0;
1290
0
            }
1291
0
            return le_to_u32(aux);
1292
0
        default:
1293
0
            errno = EINVAL;
1294
0
    }
1295
0
    return 0;
1296
0
}
1297
1298
/*
1299
 * Returns the number of expected read names for this record.
1300
 */
1301
0
static int expected_template_count(bam_seq_t *b) {
1302
0
    int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1303
1304
0
    uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1305
0
    if (TC) {
1306
0
        int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
1307
0
        if (expected < n)
1308
0
            expected = n;
1309
0
    }
1310
1311
0
    if (!TC && bam_aux_get(b, "SA")) {
1312
        // We could count the semicolons, but we'd have to do this for
1313
        // read1, read2 and read(not-1-or-2) combining the results
1314
        // together.  This is a cheap and safe alternative for now.
1315
0
        expected = INT_MAX;
1316
0
    }
1317
1318
0
    return expected;
1319
0
}
1320
1321
/*
1322
 * Lossily reject read names.
1323
 *
1324
 * The rule here is that if all reads for this template reside in the
1325
 * same slice then we can lose the name.  Otherwise we keep them as we
1326
 * do not know when (or if) the other reads will turn up.
1327
 *
1328
 * Note there may be only 1 read (non-paired library) or more than 2
1329
 * reads (paired library with supplementary reads), or other weird
1330
 * setups.  We need to know how many are expected.  Ways to guess:
1331
 *
1332
 * - Flags (0x1 - has > 1 read)
1333
 * - TC aux field (not mandatory)
1334
 * - SA tags (count semicolons, NB per fragment so sum - hard)
1335
 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1336
 *
1337
 * Returns 0 on success
1338
 *        -1 on failure
1339
 */
1340
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1341
65.2k
                            int bam_start) {
1342
65.2k
    int r1, r2, ret = -1;
1343
1344
    // Initialise cram_flags
1345
3.50M
    for (r2 = 0; r2 < s->hdr->num_records; r2++)
1346
3.43M
        s->crecs[r2].cram_flags = 0;
1347
1348
65.2k
    if (!fd->lossy_read_names)
1349
65.2k
        return 0;
1350
1351
0
    khash_t(m_s2u64) *names = kh_init(m_s2u64);
1352
0
    if (!names)
1353
0
        goto fail;
1354
1355
    // 1: Iterate through names to count frequency
1356
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1357
        //cram_record *cr = &s->crecs[r2];
1358
0
        bam_seq_t *b = c->bams[r1];
1359
0
        khint_t k;
1360
0
        int n;
1361
0
        uint64_t e;
1362
0
        union {
1363
0
            uint64_t i64;
1364
0
            struct {
1365
0
                int32_t e,c; // expected & observed counts.
1366
0
            } counts;
1367
0
        } u;
1368
1369
0
        e = expected_template_count(b);
1370
0
        u.counts.e = e; u.counts.c = 1;
1371
1372
0
        k = kh_put(m_s2u64, names, bam_name(b), &n);
1373
0
        if (n == -1)
1374
0
            goto fail;
1375
1376
0
        if (n == 0) {
1377
            // not a new name
1378
0
            u.i64 = kh_val(names, k);
1379
0
            if (u.counts.e != e) {
1380
                // different expectation or already hit the max
1381
                //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1382
0
                kh_val(names, k) = 0;
1383
0
            } else {
1384
0
                u.counts.c++;
1385
0
                if (u.counts.e == u.counts.c) {
1386
                    // Reached expected count.
1387
0
                    kh_val(names, k) = -1;
1388
0
                } else {
1389
0
                    kh_val(names, k) = u.i64;
1390
0
                }
1391
0
            }
1392
0
        } else {
1393
            // new name
1394
0
            kh_val(names, k) = u.i64;
1395
0
        }
1396
0
    }
1397
1398
    // 2: Remove names if all present (hd.i == -1)
1399
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1400
0
        cram_record *cr = &s->crecs[r2];
1401
0
        bam_seq_t *b = c->bams[r1];
1402
0
        khint_t k;
1403
1404
0
        k = kh_get(m_s2u64, names, bam_name(b));
1405
1406
0
        if (k == kh_end(names))
1407
0
            goto fail;
1408
1409
0
        if (kh_val(names, k) == -1)
1410
0
            cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1411
0
    }
1412
1413
0
    ret = 0;
1414
0
 fail: // ret==-1
1415
1416
0
    if (names)
1417
0
        kh_destroy(m_s2u64, names);
1418
1419
0
    return ret;
1420
0
}
1421
1422
/*
1423
 * Adds the reading names.  We do this here as a separate pass rather
1424
 * than per record in the process_one_read calls as that function can
1425
 * go back and change the CRAM_FLAG_DETACHED status of a previously
1426
 * processed read if it subsequently determines the TLEN field is
1427
 * incorrect.  Given DETACHED reads always try to decode read names,
1428
 * we need to know their status before generating the read-name block.
1429
 *
1430
 * Output is an update s->name_blk, and cr->name / cr->name_len
1431
 * fields.
1432
 */
1433
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1434
64.0k
                          int bam_start) {
1435
64.0k
    int r1, r2;
1436
64.0k
    int keep_names = !fd->lossy_read_names;
1437
1438
64.0k
    for (r1 = bam_start, r2 = 0;
1439
3.49M
         r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1440
3.43M
         r1++, r2++) {
1441
3.43M
        cram_record *cr = &s->crecs[r2];
1442
3.43M
        bam_seq_t *b = c->bams[r1];
1443
1444
3.43M
        cr->name        = BLOCK_SIZE(s->name_blk);
1445
3.43M
        if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1446
3.43M
            if (CRAM_MAJOR_VERS(fd->version) >= 4
1447
3.43M
                && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1448
3.43M
                && cr->mate_line) {
1449
                // Dedup read names in V4
1450
0
                BLOCK_APPEND(s->name_blk, "\0", 1);
1451
0
                cr->name_len    = 1;
1452
3.43M
            } else {
1453
3.43M
                BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1454
3.43M
                cr->name_len    = bam_name_len(b);
1455
3.43M
            }
1456
3.43M
        } else {
1457
            // Can only discard duplicate names if not detached
1458
0
            cr->name_len = 0;
1459
0
        }
1460
1461
3.43M
        if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1462
0
            goto block_err;
1463
3.43M
    }
1464
1465
64.0k
    return 0;
1466
1467
0
 block_err:
1468
0
    return -1;
1469
64.0k
}
1470
1471
// CRAM version >= 3.1
1472
95.6k
#define CRAM_ge31(v) ((v) >= 0x301)
1473
1474
// Returns the next cigar op code: one of the BAM_C* codes,
1475
// or -1 if no more are present.
1476
static inline
1477
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
1478
1.26k
                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
1479
2.00k
    for(;;) {
1480
5.13k
        while (*cig_len == 0) {
1481
3.12k
            if (*cig_ind < ncigar) {
1482
3.12k
                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
1483
3.12k
                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
1484
3.12k
                (*cig_ind)++;
1485
3.12k
            } else {
1486
1
                return -1;
1487
1
            }
1488
3.12k
        }
1489
1490
2.00k
        if (skip[*cig_op]) {
1491
742
            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
1492
742
            *cig_len = 0;
1493
742
            continue;
1494
742
        }
1495
1496
1.26k
        (*cig_len)--;
1497
1.26k
        break;
1498
2.00k
    }
1499
1500
1.26k
    return *cig_op;
1501
1.26k
}
1502
1503
// Ensure ref and hist are large enough.
1504
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
1505
2.81M
                             hts_pos_t ref_start, hts_pos_t *ref_end) {
1506
2.81M
    if (pos < ref_start)
1507
262
        return -1;
1508
2.81M
    if (pos < *ref_end)
1509
2.77M
        return 0;
1510
1511
    // realloc
1512
34.9k
    if (pos - ref_start > UINT_MAX)
1513
30
        return -2; // protect overflow in new_end calculation
1514
1515
34.9k
    hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
1516
34.9k
    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
1517
1518
    // Refuse to work on excessively large blocks.
1519
    // We'll just switch to referenceless encoding, which is probably better
1520
    // here as this must be very sparse data anyway.
1521
34.9k
    if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
1522
886
        return -2;
1523
1524
34.0k
    char *tmp = realloc(*ref, new_end-ref_start+1);
1525
34.0k
    if (!tmp)
1526
0
        return -1;
1527
34.0k
    *ref = tmp;
1528
1529
34.0k
    uint32_t (*tmp5)[5] = realloc(**hist,
1530
34.0k
                                  (new_end - ref_start)*sizeof(**hist));
1531
34.0k
    if (!tmp5)
1532
0
        return -1;
1533
34.0k
    *hist = tmp5;
1534
34.0k
    *ref_end = new_end;
1535
1536
    // initialise
1537
34.0k
    old_end -= ref_start;
1538
34.0k
    new_end -= ref_start;
1539
34.0k
    memset(&(*ref)[old_end],  0,  new_end-old_end);
1540
34.0k
    memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
1541
1542
34.0k
    return 0;
1543
34.0k
}
1544
1545
// Walk through MD + seq to generate ref
1546
// Returns 1 on success, <0 on failure
1547
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
1548
                              hts_pos_t ref_start, hts_pos_t *ref_end,
1549
48.5k
                              const uint8_t *MD) {
1550
48.5k
    uint8_t *seq = bam_get_seq(b);
1551
48.5k
    uint32_t *cigar = bam_get_cigar(b);
1552
48.5k
    uint32_t ncigar = b->core.n_cigar;
1553
48.5k
    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
1554
1555
48.5k
    int iseq = 0, next_op;
1556
48.5k
    hts_pos_t iref = b->core.pos - ref_start;
1557
1558
    // Skip INS, REF_SKIP, *CLIP, PAD. and BACK.
1559
48.5k
    static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
1560
49.7k
    while (iseq < b->core.l_qseq && *MD) {
1561
2.10k
        if (isdigit(*MD)) {
1562
            // match
1563
852
            int overflow = 0;
1564
852
            int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
1565
852
            if (overflow ||
1566
852
                extend_ref(ref, hist, iref+ref_start + len,
1567
702
                           ref_start, ref_end) < 0)
1568
299
                return -1;
1569
554
            while (iseq < b->core.l_qseq && len) {
1570
                // rewrite to have internal loops?
1571
461
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1572
461
                                             &iseq, &cig_ind, &cig_op,
1573
461
                                             &cig_len)) < 0)
1574
0
                    return -1;
1575
1576
461
                if (next_op != BAM_CMATCH &&
1577
461
                    next_op != BAM_CEQUAL) {
1578
460
                    hts_log_info("MD:Z and CIGAR are incompatible for "
1579
460
                                 "record %s", bam_get_qname(b));
1580
460
                    return -1;
1581
460
                }
1582
1583
                // Short-cut loop over same cigar op for efficiency
1584
1
                cig_len++;
1585
1
                do {
1586
1
                    cig_len--;
1587
1
                    (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
1588
1
                    iseq++;
1589
1
                    len--;
1590
1
                } while (cig_len && iseq < b->core.l_qseq && len);
1591
1
            }
1592
93
            if (len > 0)
1593
1
                return -1; // MD is longer than seq
1594
1.25k
        } else if (*MD == '^') {
1595
            // deletion
1596
452
            MD++;
1597
452
            while (isalpha(*MD)) {
1598
2
                if (extend_ref(ref, hist, iref+ref_start, ref_start,
1599
2
                               ref_end) < 0)
1600
0
                    return -1;
1601
2
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1602
2
                                             &iseq, &cig_ind, &cig_op,
1603
2
                                             &cig_len)) < 0)
1604
0
                    return -1;
1605
1606
2
                if (next_op != BAM_CDEL) {
1607
0
                    hts_log_info("MD:Z and CIGAR are incompatible");
1608
0
                    return -1;
1609
0
                }
1610
1611
2
                (*ref)[iref++] = *MD++ & ~0x20;
1612
2
            }
1613
803
        } else {
1614
            // substitution
1615
803
            if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
1616
0
                return -1;
1617
803
            if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1618
803
                                         &iseq, &cig_ind, &cig_op,
1619
803
                                         &cig_len)) < 0)
1620
1
                return -1;
1621
1622
802
            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
1623
203
                hts_log_info("MD:Z and CIGAR are incompatible");
1624
203
                return -1;
1625
203
            }
1626
1627
599
            (*ref)[iref++] = *MD++ & ~0x20;
1628
599
            iseq++;
1629
599
        }
1630
2.10k
    }
1631
1632
47.6k
    return 1;
1633
48.5k
}
1634
1635
// Append a sequence to a ref/consensus structure.
1636
// We maintain both an absolute refefence (ACGTN where MD:Z is
1637
// present) and a 5-way frequency array for when no MD:Z is known.
1638
// We then subsequently convert the 5-way frequencies to a consensus
1639
// ref in a second pass.
1640
//
1641
// Returns >=0 on success,
1642
//         -1 on failure (eg inconsistent data)
1643
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
1644
87.4k
                           hts_pos_t ref_start, hts_pos_t *ref_end) {
1645
87.4k
    const uint8_t *MD = bam_aux_get(b, "MD");
1646
87.4k
    int ret = 0;
1647
87.4k
    if (MD && *MD == 'Z') {
1648
        // We can use MD to directly compute the reference
1649
48.5k
        int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
1650
1651
48.5k
        if (ret > 0)
1652
47.6k
            return ret;
1653
48.5k
    }
1654
1655
    // Otherwise we just use SEQ+CIGAR and build a consensus which we later
1656
    // turn into a fake reference
1657
39.8k
    uint32_t *cigar = bam_get_cigar(b);
1658
39.8k
    uint32_t ncigar = b->core.n_cigar;
1659
39.8k
    uint32_t i, j;
1660
39.8k
    hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
1661
39.8k
    uint8_t *seq = bam_get_seq(b);
1662
3.08M
    for (i = 0; i < ncigar; i++) {
1663
3.04M
        switch (bam_cigar_op(cigar[i])) {
1664
109k
        case BAM_CSOFT_CLIP:
1665
166k
        case BAM_CINS:
1666
166k
            iseq += bam_cigar_oplen(cigar[i]);
1667
166k
            break;
1668
1669
2.71M
        case BAM_CMATCH:
1670
2.78M
        case BAM_CEQUAL:
1671
2.78M
        case BAM_CDIFF: {
1672
2.78M
            int len = bam_cigar_oplen(cigar[i]);
1673
            // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
1674
2.78M
            static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
1675
1676
2.78M
            if (extend_ref(ref, hist, iref+ref_start + len,
1677
2.78M
                           ref_start, ref_end) < 0)
1678
719
                return -1;
1679
2.78M
            if (iseq + len <= b->core.l_qseq) {
1680
                // Nullify failed MD:Z if appropriate
1681
11.1k
                if (ret < 0)
1682
0
                    memset(&(*ref)[iref], 0, len);
1683
1684
15.0k
                for (j = 0; j < len; j++, iref++, iseq++)
1685
3.90k
                    (*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
1686
2.76M
            } else {
1687
                // Probably a 2ndary read with seq "*"
1688
2.76M
                iseq += len;
1689
2.76M
                iref += len;
1690
2.76M
            }
1691
2.78M
            break;
1692
2.78M
        }
1693
1694
14.8k
        case BAM_CDEL:
1695
18.0k
        case BAM_CREF_SKIP:
1696
18.0k
            iref += bam_cigar_oplen(cigar[i]);
1697
3.04M
        }
1698
3.04M
    }
1699
1700
39.1k
    return 1;
1701
39.8k
}
1702
1703
// Automatically generates the reference and stashed it in c->ref, also
1704
// setting c->ref_start and c->ref_end.
1705
//
1706
// If we have MD:Z tags then we use them to directly infer the reference,
1707
// along with SEQ + CIGAR.  Otherwise we use SEQ/CIGAR only to build up
1708
// a consensus and then assume the reference as the majority rule.
1709
//
1710
// In this latter scenario we need to be wary of auto-generating MD and NM
1711
// during decode, but that's handled elsewhere via an additional aux tag.
1712
//
1713
// Returns 0 on success,
1714
//        -1 on failure
1715
31.6k
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
1716
    // TODO: if we can find an external reference then use it, even if the
1717
    // user told us to do embed_ref=2.
1718
31.6k
    char *ref = NULL;
1719
31.6k
    uint32_t (*hist)[5] = NULL;
1720
31.6k
    hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
1721
31.6k
    if (ref_start < 0)
1722
2
        return -1; // cannot build consensus from unmapped data
1723
1724
    // initial allocation
1725
31.6k
    if (extend_ref(&ref, &hist,
1726
31.6k
                   c->bams[r1 + s->hdr->num_records-1]->core.pos +
1727
31.6k
                   c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
1728
31.6k
                   ref_start, &ref_end) < 0)
1729
310
        return -1;
1730
1731
    // Add each bam file to the reference/consensus arrays
1732
31.3k
    int r2;
1733
31.3k
    hts_pos_t last_pos = -1;
1734
118k
    for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1735
87.5k
        if (c->bams[r1]->core.pos < last_pos) {
1736
57
            hts_log_error("Cannot build reference with unsorted data");
1737
57
            goto err;
1738
57
        }
1739
87.4k
        last_pos = c->bams[r1]->core.pos;
1740
87.4k
        if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
1741
719
            goto err;
1742
87.4k
    }
1743
1744
    // Compute the consensus
1745
30.6k
    hts_pos_t i;
1746
1.81G
    for (i = 0; i < ref_end-ref_start; i++) {
1747
1.81G
        if (!ref[i]) {
1748
1.81G
            int max_v = 0, max_j = 4, j;
1749
9.05G
            for (j = 0; j < 4; j++)
1750
                // don't call N (j==4) unless no coverage
1751
7.24G
                if (max_v < hist[i][j])
1752
1.09k
                    max_v = hist[i][j], max_j = j;
1753
1.81G
            ref[i] = "ACGTN"[max_j];
1754
1.81G
        }
1755
1.81G
    }
1756
30.6k
    free(hist);
1757
1758
    // Put the reference in place so it appears to be an external
1759
    // ref file.
1760
30.6k
    c->ref       = ref;
1761
30.6k
    c->ref_start = ref_start+1;
1762
30.6k
    c->ref_end   = ref_end+1;
1763
30.6k
    c->ref_free  = 1;
1764
30.6k
    return 0;
1765
1766
776
 err:
1767
776
    free(ref);
1768
776
    free(hist);
1769
776
    return -1;
1770
31.3k
}
1771
1772
// Check if the SQ M5 tag matches the reference we've loaded.
1773
0
static int validate_md5(cram_fd *fd, int ref_id) {
1774
0
    if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
1775
0
        return 0;
1776
1777
    // Have we already checked this ref?
1778
0
    if (fd->refs->ref_id[ref_id]->validated_md5)
1779
0
        return 0;
1780
1781
    // Check if we have the MD5 known.
1782
    // We should, but maybe we're using embedded references?
1783
0
    sam_hrecs_t *hrecs = fd->header->hrecs;
1784
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
1785
0
                                                 hrecs->ref[ref_id].name);
1786
0
    if (!ty)
1787
0
        return 0;
1788
1789
0
    sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
1790
0
    if (!m5tag)
1791
0
        return 0;
1792
1793
    // It's known, so compute md5 on the loaded reference sequence.
1794
0
    char *ref = fd->refs->ref_id[ref_id]->seq;
1795
0
    int64_t len = fd->refs->ref_id[ref_id]->length;
1796
0
    hts_md5_context *md5;
1797
0
    char unsigned buf[16];
1798
0
    char buf2[33];
1799
1800
0
    if (!(md5 = hts_md5_init()))
1801
0
        return -1;
1802
0
    hts_md5_update(md5, ref, len);
1803
0
    hts_md5_final(buf, md5);
1804
0
    hts_md5_destroy(md5);
1805
0
    hts_md5_hex(buf2, buf);
1806
1807
    // Compare it to header @SQ M5 tag
1808
0
    if (strcmp(m5tag->str+3, buf2)) {
1809
0
        hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
1810
0
                      hrecs->ref[ref_id].name);
1811
0
        hts_log_error("Please use the correct reference, or "
1812
0
                      "consider using embed_ref=2");
1813
0
        return -1;
1814
0
    }
1815
0
    fd->refs->ref_id[ref_id]->validated_md5 = 1;
1816
1817
0
    return 0;
1818
0
}
1819
1820
/*
1821
 * Encodes all slices in a container into blocks.
1822
 * Returns 0 on success
1823
 *        -1 on failure
1824
 */
1825
64.2k
int cram_encode_container(cram_fd *fd, cram_container *c) {
1826
64.2k
    int i, j, slice_offset;
1827
64.2k
    cram_block_compression_hdr *h = c->comp_hdr;
1828
64.2k
    cram_block *c_hdr;
1829
64.2k
    int multi_ref = 0;
1830
64.2k
    int r1, r2, sn, nref, embed_ref, no_ref;
1831
64.2k
    spare_bams *spares;
1832
1833
64.2k
    if (!c->bams)
1834
0
        goto err;
1835
1836
64.2k
    if (CRAM_MAJOR_VERS(fd->version) == 1)
1837
0
        goto err;
1838
1839
//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1840
64.2k
#define goto_err goto err
1841
1842
    // Don't try embed ref if we repeatedly fail
1843
64.2k
    pthread_mutex_lock(&fd->ref_lock);
1844
64.2k
    int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
1845
64.2k
    if (!failed_embed && c->embed_ref == -2 && c->ref_id >= 0) {
1846
803
        hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
1847
803
        fd->no_ref = c->no_ref = 0;
1848
803
        fd->embed_ref = c->embed_ref = 2;
1849
63.4k
    } else if (failed_embed && c->embed_ref == -2) {
1850
        // We've tried several times, so this time give up for good
1851
32
        hts_log_warning("Keeping non-ref mode from now on");
1852
32
        fd->embed_ref = c->embed_ref = 0;
1853
32
    }
1854
64.2k
    pthread_mutex_unlock(&fd->ref_lock);
1855
1856
65.3k
 restart:
1857
    /* Cache references up-front if we have unsorted access patterns */
1858
65.3k
    pthread_mutex_lock(&fd->ref_lock);
1859
65.3k
    nref = fd->refs->nref;
1860
65.3k
    pthread_mutex_unlock(&fd->ref_lock);
1861
65.3k
    embed_ref = c->embed_ref;
1862
65.3k
    no_ref = c->no_ref;
1863
1864
    /* To create M5 strings */
1865
    /* Fetch reference sequence */
1866
65.3k
    if (!no_ref) {
1867
63.6k
        if (!c->bams || !c->curr_c_rec || !c->bams[0])
1868
10
            goto_err;
1869
63.6k
        bam_seq_t *b = c->bams[0];
1870
1871
63.6k
        if (embed_ref <= 1) {
1872
1.28k
            char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1873
1.28k
            if (!ref && bam_ref(b) >= 0) {
1874
13
                if (!c->pos_sorted) {
1875
                    // TODO: maybe also check fd->no_ref?
1876
1
                    hts_log_warning("Failed to load reference #%d",
1877
1
                                    bam_ref(b));
1878
1
                    hts_log_warning("Switching to non-ref mode");
1879
1880
1
                    pthread_mutex_lock(&fd->ref_lock);
1881
1
                    c->embed_ref = fd->embed_ref = 0;
1882
1
                    c->no_ref = fd->no_ref = 1;
1883
1
                    pthread_mutex_unlock(&fd->ref_lock);
1884
1
                    goto restart;
1885
1
                }
1886
1887
12
                if (c->multi_seq || embed_ref == 0) {
1888
0
                    hts_log_error("Failed to load reference #%d", bam_ref(b));
1889
0
                    return -1;
1890
0
                }
1891
12
                hts_log_warning("Failed to load reference #%d", bam_ref(b));
1892
12
                hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
1893
12
                                " reference");
1894
12
                if (embed_ref <= 0)
1895
12
                    hts_log_warning("NOTE: the CRAM file will be bigger than"
1896
12
                                    " using an external reference");
1897
12
                pthread_mutex_lock(&fd->ref_lock);
1898
12
                embed_ref = c->embed_ref = fd->embed_ref = 2;
1899
12
                pthread_mutex_unlock(&fd->ref_lock);
1900
12
                goto auto_ref;
1901
1.26k
            } else if (ref) {
1902
0
                if (validate_md5(fd, c->ref_seq_id) < 0)
1903
0
                    goto_err;
1904
0
            }
1905
1.26k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1906
0
                c->ref_seq_id = c->ref_id;
1907
0
                c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1908
0
                c->ref_start = 1;
1909
0
                c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1910
0
            }
1911
62.3k
        } else {
1912
62.3k
        auto_ref:
1913
            // Auto-embed ref.
1914
            // This starts as 'N' and is amended on-the-fly as we go
1915
            // based on MD:Z tags.
1916
62.3k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1917
31.6k
                c->ref = NULL;
1918
                // c->ref_free is boolean; whether to free c->ref.  In this
1919
                // case c->ref will be our auto-embedded sequence instead of
1920
                // a "global" portion of reference from fd->refs.
1921
                // Do not confuse with fd->ref_free which is a pointer to a
1922
                // reference string to free.
1923
31.6k
                c->ref_free = 1;
1924
31.6k
            } else {
1925
                // Double check for broken input.  We shouldn't have
1926
                // embedded references enabled for unmapped data, but our
1927
                // data could be broken.
1928
30.6k
                embed_ref = 0;
1929
30.6k
                no_ref = c->no_ref = 1;
1930
30.6k
            }
1931
62.3k
        }
1932
63.6k
        c->ref_seq_id = c->ref_id;
1933
63.6k
    } else {
1934
1.66k
        c->ref_id = bam_ref(c->bams[0]);
1935
1.66k
        cram_ref_incr(fd->refs, c->ref_id);
1936
1.66k
        c->ref_seq_id = c->ref_id;
1937
1.66k
    }
1938
1939
65.2k
    if (!no_ref && c->refs_used) {
1940
1.14k
        for (i = 0; i < nref; i++) {
1941
717
            if (c->refs_used[i]) {
1942
411
                if (cram_get_ref(fd, i, 1, 0)) {
1943
0
                    if (validate_md5(fd, i) < 0)
1944
0
                        goto_err;
1945
411
                } else {
1946
411
                    hts_log_warning("Failed to find reference, "
1947
411
                                    "switching to non-ref mode");
1948
411
                    no_ref = c->no_ref = 1;
1949
411
                }
1950
411
            }
1951
717
        }
1952
431
    }
1953
1954
    /* Turn bams into cram_records and gather basic stats */
1955
129k
    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1956
65.2k
        cram_slice *s = c->slices[sn];
1957
65.2k
        int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1958
1959
65.2k
        int r1_start = r1;
1960
1961
65.2k
        assert(sn < c->curr_slice);
1962
1963
        // Discover which read names *may* be safely removed.
1964
        // Ie which ones have all their records in this slice.
1965
65.2k
        if (lossy_read_names(fd, c, s, r1_start) != 0)
1966
0
            return -1;
1967
1968
        // Tracking of MD tags so we can spot when the auto-generated values
1969
        // will differ from the current stored ones.  The kstring here is
1970
        // simply to avoid excessive malloc and free calls.  All initialisation
1971
        // is done within process_one_read().
1972
65.2k
        kstring_t MD = {0};
1973
1974
        // Embed consensus / MD-generated ref
1975
65.2k
        if (embed_ref == 2) {
1976
31.6k
            if (c->ref_id < 0 || cram_generate_reference(c, s, r1) < 0) {
1977
                // Should this be a permanent thing via fd->no_ref?
1978
                // Doing so means we cannot easily switch back again should
1979
                // things fix themselves later on.  This is likely not a
1980
                // concern though as failure to generate a reference implies
1981
                // unsorted data which is rarely recovered from.
1982
1983
                // Only if sn == 0.  We're hosed if we're on the 2nd slice and
1984
                // the first worked, as no-ref is a container global param.
1985
1.08k
                if (sn > 0) {
1986
0
                    hts_log_error("Failed to build reference, "
1987
0
                                  "switching to non-ref mode");
1988
0
                    return -1;
1989
1.08k
                } else {
1990
1.08k
                    hts_log_warning("Failed to build reference, "
1991
1.08k
                                    "switching to non-ref mode");
1992
1.08k
                }
1993
1.08k
                pthread_mutex_lock(&fd->ref_lock);
1994
1.08k
                c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
1995
1.08k
                c->no_ref = fd->no_ref = 1;
1996
1.08k
                fd->no_ref_counter++; // more likely to keep permanent action
1997
1.08k
                pthread_mutex_unlock(&fd->ref_lock);
1998
1.08k
                failed_embed = 1;
1999
1.08k
                goto restart;
2000
30.6k
            } else {
2001
30.6k
                pthread_mutex_lock(&fd->ref_lock);
2002
30.6k
                fd->no_ref_counter -= (fd->no_ref_counter > 0);
2003
30.6k
                pthread_mutex_unlock(&fd->ref_lock);
2004
30.6k
            }
2005
2006
30.6k
            if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
2007
23.3k
                c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
2008
30.6k
        }
2009
2010
        // Iterate through records creating the cram blocks for some
2011
        // fields and just gathering stats for others.
2012
3.49M
        for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
2013
3.43M
            cram_record *cr = &s->crecs[r2];
2014
3.43M
            bam_seq_t *b = c->bams[r1];
2015
2016
            /* If multi-ref we need to cope with changing reference per seq */
2017
3.43M
            if (c->multi_seq && !no_ref) {
2018
847
                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
2019
0
                    if (c->ref_seq_id >= 0)
2020
0
                        cram_ref_decr(fd->refs, c->ref_seq_id);
2021
2022
0
                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
2023
0
                        hts_log_error("Failed to load reference #%d", bam_ref(b));
2024
0
                        free(MD.s);
2025
0
                        return -1;
2026
0
                    }
2027
0
                    if (validate_md5(fd, bam_ref(b)) < 0)
2028
0
                        return -1;
2029
2030
0
                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
2031
0
                    if (!fd->refs->ref_id[c->ref_seq_id]->seq)
2032
0
                        return -1;
2033
0
                    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
2034
0
                    c->ref_start = 1;
2035
0
                    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
2036
0
                }
2037
847
            }
2038
2039
3.43M
            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
2040
3.43M
                                 no_ref) != 0) {
2041
168
                free(MD.s);
2042
168
                return -1;
2043
168
            }
2044
2045
3.43M
            if (first_base > cr->apos)
2046
64.5k
                first_base = cr->apos;
2047
2048
3.43M
            if (last_base < cr->aend)
2049
69.6k
                last_base = cr->aend;
2050
3.43M
        }
2051
2052
64.0k
        free(MD.s);
2053
2054
        // Process_one_read doesn't add read names as it can change
2055
        // its mind during the loop on the CRAM_FLAG_DETACHED setting
2056
        // of earlier records (if it detects the auto-generation of
2057
        // TLEN is incorrect).  This affects which read-names can be
2058
        // lossily compressed, so we do these in another pass.
2059
64.0k
        if (add_read_names(fd, c, s, r1_start) < 0)
2060
0
            return -1;
2061
2062
64.0k
        if (c->multi_seq) {
2063
845
            s->hdr->ref_seq_id    = -2;
2064
845
            s->hdr->ref_seq_start = 0;
2065
845
            s->hdr->ref_seq_span  = 0;
2066
63.1k
        } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
2067
            // Spec states span=0, but it broke our range queries.
2068
            // See commit message for this and prior.
2069
31.8k
            s->hdr->ref_seq_id    = -1;
2070
31.8k
            s->hdr->ref_seq_start = 0;
2071
31.8k
            s->hdr->ref_seq_span  = 0;
2072
31.8k
        } else {
2073
31.3k
            s->hdr->ref_seq_id    = c->ref_id;
2074
31.3k
            s->hdr->ref_seq_start = first_base;
2075
31.3k
            s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
2076
31.3k
        }
2077
64.0k
        s->hdr->num_records = r2;
2078
2079
        // Processed a slice, now stash the aux blocks so the next
2080
        // slice can start aggregating them from the start again.
2081
64.0k
        if (c->tags_used->n_occupied) {
2082
58.4k
            int ntags = c->tags_used->n_occupied;
2083
58.4k
            s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
2084
58.4k
            if (!s->aux_block)
2085
0
                return -1;
2086
2087
58.4k
            khint_t k;
2088
2089
58.4k
            s->naux_block = 0;
2090
444k
            for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2091
385k
                if (!kh_exist(c->tags_used, k))
2092
214k
                    continue;
2093
2094
171k
                cram_tag_map *tm = kh_val(c->tags_used, k);
2095
171k
                if (!tm) goto_err;
2096
171k
                if (!tm->blk) continue;
2097
171k
                s->aux_block[s->naux_block++] = tm->blk;
2098
171k
                tm->blk = NULL;
2099
171k
                if (!tm->blk2) continue;
2100
0
                s->aux_block[s->naux_block++] = tm->blk2;
2101
0
                tm->blk2 = NULL;
2102
0
            }
2103
58.4k
            assert(s->naux_block <= 2*c->tags_used->n_occupied);
2104
58.4k
        }
2105
64.0k
    }
2106
2107
64.0k
    if (c->multi_seq && !no_ref) {
2108
20
        if (c->ref_seq_id >= 0)
2109
0
            cram_ref_decr(fd->refs, c->ref_seq_id);
2110
20
    }
2111
2112
    /* Link our bams[] array onto the spare bam list for reuse */
2113
64.0k
    spares = malloc(sizeof(*spares));
2114
64.0k
    if (!spares) goto_err;
2115
64.0k
    pthread_mutex_lock(&fd->bam_list_lock);
2116
64.0k
    spares->bams = c->bams;
2117
64.0k
    spares->next = fd->bl;
2118
64.0k
    fd->bl = spares;
2119
64.0k
    pthread_mutex_unlock(&fd->bam_list_lock);
2120
64.0k
    c->bams = NULL;
2121
2122
    /* Detect if a multi-seq container */
2123
64.0k
    cram_stats_encoding(fd, c->stats[DS_RI]);
2124
64.0k
    multi_ref = c->stats[DS_RI]->nvals > 1;
2125
64.0k
    pthread_mutex_lock(&fd->metrics_lock);
2126
64.0k
    fd->last_RI_count = c->stats[DS_RI]->nvals;
2127
64.0k
    pthread_mutex_unlock(&fd->metrics_lock);
2128
2129
2130
64.0k
    if (multi_ref) {
2131
353
        hts_log_info("Multi-ref container");
2132
353
        c->ref_seq_id = -2;
2133
353
        c->ref_seq_start = 0;
2134
353
        c->ref_seq_span = 0;
2135
353
    }
2136
2137
2138
    /* Compute MD5s */
2139
64.0k
    no_ref = c->no_ref;
2140
64.0k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
2141
2142
128k
    for (i = 0; i < c->curr_slice; i++) {
2143
64.0k
        cram_slice *s = c->slices[i];
2144
2145
64.0k
        if (CRAM_MAJOR_VERS(fd->version) != 1) {
2146
64.0k
            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
2147
30.5k
                hts_md5_context *md5 = hts_md5_init();
2148
30.5k
                if (!md5)
2149
0
                    return -1;
2150
30.5k
                hts_md5_update(md5,
2151
30.5k
                               c->ref + s->hdr->ref_seq_start - c->ref_start,
2152
30.5k
                               s->hdr->ref_seq_span);
2153
30.5k
                hts_md5_final(s->hdr->md5, md5);
2154
30.5k
                hts_md5_destroy(md5);
2155
33.5k
            } else {
2156
33.5k
                memset(s->hdr->md5, 0, 16);
2157
33.5k
            }
2158
64.0k
        }
2159
64.0k
    }
2160
2161
64.0k
    c->num_records = 0;
2162
64.0k
    c->num_blocks = 1; // cram_block_compression_hdr
2163
64.0k
    c->length = 0;
2164
2165
    //fprintf(stderr, "=== BF ===\n");
2166
64.0k
    h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
2167
64.0k
                                         c->stats[DS_BF], E_INT, NULL,
2168
64.0k
                                         fd->version, &fd->vv);
2169
64.0k
    if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
2170
2171
    //fprintf(stderr, "=== CF ===\n");
2172
64.0k
    h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
2173
64.0k
                                         c->stats[DS_CF], E_INT, NULL,
2174
64.0k
                                         fd->version, &fd->vv);
2175
64.0k
    if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
2176
2177
    //fprintf(stderr, "=== RN ===\n");
2178
    //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
2179
    //                                     c->stats[DS_RN], E_BYTE_ARRAY, NULL,
2180
    //                                     fd->version);
2181
2182
    //fprintf(stderr, "=== AP ===\n");
2183
64.0k
    if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
2184
62.9k
        if (c->pos_sorted)
2185
62.9k
            h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
2186
62.9k
                                                 c->stats[DS_AP],
2187
62.9k
                                                 is_v4 ? E_LONG : E_INT,
2188
62.9k
                                                 NULL, fd->version, &fd->vv);
2189
0
        else
2190
            // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
2191
0
            h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
2192
0
                                                       : E_EXTERNAL,
2193
0
                                                 NULL,
2194
0
                                                 is_v4 ? E_LONG : E_INT,
2195
0
                                                 NULL, fd->version, &fd->vv);
2196
62.9k
    } else {
2197
        // Removed BETA in v4.0.
2198
        // Should we consider dropping use of it for 3.0 too?
2199
1.10k
        hts_pos_t p[2] = {0, c->max_apos};
2200
1.10k
        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
2201
1.10k
                                             is_v4 ? E_LONG : E_INT,
2202
1.10k
                                             p, fd->version, &fd->vv);
2203
//      cram_xdelta_encoder e;
2204
//      e.word_size = is_v4 ? 8 : 4;
2205
//      e.sub_encoding = E_EXTERNAL;
2206
//      e.sub_codec_dat = (void *)DS_AP;
2207
//
2208
//      h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
2209
//                                           is_v4 ? E_LONG : E_INT,
2210
//                                           &e, fd->version, &fd->vv);
2211
1.10k
    }
2212
64.0k
    if (!h->codecs[DS_AP]) goto_err;
2213
2214
    //fprintf(stderr, "=== RG ===\n");
2215
64.0k
    h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
2216
64.0k
                                         c->stats[DS_RG],
2217
64.0k
                                         E_INT,
2218
64.0k
                                         NULL,
2219
64.0k
                                         fd->version, &fd->vv);
2220
64.0k
    if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
2221
2222
    //fprintf(stderr, "=== MQ ===\n");
2223
64.0k
    h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
2224
64.0k
                                         c->stats[DS_MQ], E_INT, NULL,
2225
64.0k
                                         fd->version, &fd->vv);
2226
64.0k
    if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
2227
2228
    //fprintf(stderr, "=== NS ===\n");
2229
64.0k
    h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
2230
64.0k
                                         c->stats[DS_NS], E_INT, NULL,
2231
64.0k
                                         fd->version, &fd->vv);
2232
64.0k
    if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
2233
2234
    //fprintf(stderr, "=== MF ===\n");
2235
64.0k
    h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
2236
64.0k
                                         c->stats[DS_MF], E_INT, NULL,
2237
64.0k
                                         fd->version, &fd->vv);
2238
64.0k
    if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
2239
2240
    //fprintf(stderr, "=== TS ===\n");
2241
64.0k
    h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
2242
64.0k
                                         c->stats[DS_TS],
2243
64.0k
                                         is_v4 ? E_LONG : E_INT,
2244
64.0k
                                         NULL, fd->version, &fd->vv);
2245
64.0k
    if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
2246
2247
    //fprintf(stderr, "=== NP ===\n");
2248
64.0k
    h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
2249
64.0k
                                         c->stats[DS_NP],
2250
64.0k
                                         is_v4 ? E_LONG : E_INT,
2251
64.0k
                                         NULL, fd->version, &fd->vv);
2252
64.0k
    if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
2253
2254
    //fprintf(stderr, "=== NF ===\n");
2255
64.0k
    h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
2256
64.0k
                                         c->stats[DS_NF], E_INT, NULL,
2257
64.0k
                                         fd->version, &fd->vv);
2258
64.0k
    if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
2259
2260
    //fprintf(stderr, "=== RL ===\n");
2261
64.0k
    h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
2262
64.0k
                                         c->stats[DS_RL], E_INT, NULL,
2263
64.0k
                                         fd->version, &fd->vv);
2264
64.0k
    if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
2265
2266
    //fprintf(stderr, "=== FN ===\n");
2267
64.0k
    h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
2268
64.0k
                                         c->stats[DS_FN], E_INT, NULL,
2269
64.0k
                                         fd->version, &fd->vv);
2270
64.0k
    if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
2271
2272
    //fprintf(stderr, "=== FC ===\n");
2273
64.0k
    h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
2274
64.0k
                                         c->stats[DS_FC], E_BYTE, NULL,
2275
64.0k
                                         fd->version, &fd->vv);
2276
64.0k
    if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
2277
2278
    //fprintf(stderr, "=== FP ===\n");
2279
64.0k
    h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
2280
64.0k
                                         c->stats[DS_FP], E_INT, NULL,
2281
64.0k
                                         fd->version, &fd->vv);
2282
64.0k
    if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
2283
2284
    //fprintf(stderr, "=== DL ===\n");
2285
64.0k
    h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
2286
64.0k
                                         c->stats[DS_DL], E_INT, NULL,
2287
64.0k
                                         fd->version, &fd->vv);
2288
64.0k
    if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
2289
2290
    //fprintf(stderr, "=== BA ===\n");
2291
64.0k
    h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
2292
64.0k
                                         c->stats[DS_BA], E_BYTE, NULL,
2293
64.0k
                                         fd->version, &fd->vv);
2294
64.0k
    if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
2295
2296
64.0k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2297
64.0k
        cram_byte_array_len_encoder e;
2298
2299
64.0k
        e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2300
64.0k
            ? E_VARINT_UNSIGNED
2301
64.0k
            : E_EXTERNAL;
2302
64.0k
        e.len_dat = (void *)DS_BB_len;
2303
        //e.len_dat = (void *)DS_BB;
2304
2305
64.0k
        e.val_encoding = E_EXTERNAL;
2306
64.0k
        e.val_dat = (void *)DS_BB;
2307
2308
64.0k
        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2309
64.0k
                                             E_BYTE_ARRAY, (void *)&e,
2310
64.0k
                                             fd->version, &fd->vv);
2311
64.0k
        if (!h->codecs[DS_BB]) goto_err;
2312
64.0k
    } else {
2313
0
        h->codecs[DS_BB] = NULL;
2314
0
    }
2315
2316
    //fprintf(stderr, "=== BS ===\n");
2317
64.0k
    h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
2318
64.0k
                                         c->stats[DS_BS], E_BYTE, NULL,
2319
64.0k
                                         fd->version, &fd->vv);
2320
64.0k
    if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
2321
2322
64.0k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
2323
0
        h->codecs[DS_TL] = NULL;
2324
0
        h->codecs[DS_RI] = NULL;
2325
0
        h->codecs[DS_RS] = NULL;
2326
0
        h->codecs[DS_PD] = NULL;
2327
0
        h->codecs[DS_HC] = NULL;
2328
0
        h->codecs[DS_SC] = NULL;
2329
2330
        //fprintf(stderr, "=== TC ===\n");
2331
0
        h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
2332
0
                                             c->stats[DS_TC], E_BYTE, NULL,
2333
0
                                             fd->version, &fd->vv);
2334
0
        if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
2335
2336
        //fprintf(stderr, "=== TN ===\n");
2337
0
        h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
2338
0
                                             c->stats[DS_TN], E_INT, NULL,
2339
0
                                             fd->version, &fd->vv);
2340
0
        if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
2341
64.0k
    } else {
2342
64.0k
        h->codecs[DS_TC] = NULL;
2343
64.0k
        h->codecs[DS_TN] = NULL;
2344
2345
        //fprintf(stderr, "=== TL ===\n");
2346
64.0k
        h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
2347
64.0k
                                             c->stats[DS_TL], E_INT, NULL,
2348
64.0k
                                             fd->version, &fd->vv);
2349
64.0k
        if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
2350
2351
2352
        //fprintf(stderr, "=== RI ===\n");
2353
64.0k
        h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
2354
64.0k
                                             c->stats[DS_RI], E_INT, NULL,
2355
64.0k
                                             fd->version, &fd->vv);
2356
64.0k
        if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
2357
2358
        //fprintf(stderr, "=== RS ===\n");
2359
64.0k
        h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
2360
64.0k
                                             c->stats[DS_RS], E_INT, NULL,
2361
64.0k
                                             fd->version, &fd->vv);
2362
64.0k
        if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
2363
2364
        //fprintf(stderr, "=== PD ===\n");
2365
64.0k
        h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
2366
64.0k
                                             c->stats[DS_PD], E_INT, NULL,
2367
64.0k
                                             fd->version, &fd->vv);
2368
64.0k
        if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
2369
2370
        //fprintf(stderr, "=== HC ===\n");
2371
64.0k
        h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
2372
64.0k
                                             c->stats[DS_HC], E_INT, NULL,
2373
64.0k
                                             fd->version, &fd->vv);
2374
64.0k
        if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
2375
2376
        //fprintf(stderr, "=== SC ===\n");
2377
64.0k
        if (1) {
2378
64.0k
            int i2[2] = {0, DS_SC};
2379
2380
64.0k
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2381
64.0k
                                                 E_BYTE_ARRAY, (void *)i2,
2382
64.0k
                                                 fd->version, &fd->vv);
2383
64.0k
        } else {
2384
            // Appears to be no practical benefit to using this method,
2385
            // but it may work better if we start mixing SC, IN and BB
2386
            // elements into the same external block.
2387
0
            cram_byte_array_len_encoder e;
2388
2389
0
            e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2390
0
                ? E_VARINT_UNSIGNED
2391
0
                : E_EXTERNAL;
2392
0
            e.len_dat = (void *)DS_SC_len;
2393
2394
0
            e.val_encoding = E_EXTERNAL;
2395
0
            e.val_dat = (void *)DS_SC;
2396
2397
0
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2398
0
                                                 E_BYTE_ARRAY, (void *)&e,
2399
0
                                                 fd->version, &fd->vv);
2400
0
        }
2401
64.0k
        if (!h->codecs[DS_SC]) goto_err;
2402
64.0k
    }
2403
2404
    //fprintf(stderr, "=== IN ===\n");
2405
64.0k
    {
2406
64.0k
        int i2[2] = {0, DS_IN};
2407
64.0k
        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2408
64.0k
                                             E_BYTE_ARRAY, (void *)i2,
2409
64.0k
                                             fd->version, &fd->vv);
2410
64.0k
        if (!h->codecs[DS_IN]) goto_err;
2411
64.0k
    }
2412
2413
64.0k
    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
2414
64.0k
                                         (void *)DS_QS,
2415
64.0k
                                         fd->version, &fd->vv);
2416
64.0k
    if (!h->codecs[DS_QS]) goto_err;
2417
64.0k
    {
2418
64.0k
        int i2[2] = {0, DS_RN};
2419
64.0k
        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2420
64.0k
                                             E_BYTE_ARRAY, (void *)i2,
2421
64.0k
                                             fd->version, &fd->vv);
2422
64.0k
        if (!h->codecs[DS_RN]) goto_err;
2423
64.0k
    }
2424
2425
2426
    /* Encode slices */
2427
127k
    for (i = 0; i < c->curr_slice; i++) {
2428
64.0k
        hts_log_info("Encode slice %d", i);
2429
2430
64.0k
        int local_embed_ref =
2431
64.0k
            embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
2432
64.0k
        if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
2433
15
            return -1;
2434
64.0k
    }
2435
2436
    /* Create compression header */
2437
63.9k
    {
2438
63.9k
        h->ref_seq_id    = c->ref_seq_id;
2439
63.9k
        h->ref_seq_start = c->ref_seq_start;
2440
63.9k
        h->ref_seq_span  = c->ref_seq_span;
2441
63.9k
        h->num_records   = c->num_records;
2442
63.9k
        h->qs_seq_orient = c->qs_seq_orient;
2443
        // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
2444
63.9k
        h->AP_delta      = c->pos_sorted;
2445
63.9k
        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
2446
2447
63.9k
        if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
2448
0
            return -1;
2449
63.9k
    }
2450
2451
    /* Compute landmarks */
2452
    /* Fill out slice landmarks */
2453
63.9k
    c->num_landmarks = c->curr_slice;
2454
63.9k
    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
2455
63.9k
    if (!c->landmark)
2456
0
        return -1;
2457
2458
    /*
2459
     * Slice offset starts after the first block, so we need to simulate
2460
     * writing it to work out the correct offset
2461
     */
2462
63.9k
    {
2463
63.9k
        slice_offset = c_hdr->method == RAW
2464
63.9k
            ? c_hdr->uncomp_size
2465
63.9k
            : c_hdr->comp_size;
2466
63.9k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2467
63.9k
            fd->vv.varint_size(c_hdr->content_id) +
2468
63.9k
            fd->vv.varint_size(c_hdr->comp_size) +
2469
63.9k
            fd->vv.varint_size(c_hdr->uncomp_size);
2470
63.9k
    }
2471
2472
63.9k
    c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
2473
63.9k
    if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
2474
        // Spec states span=0, but it broke our range queries.
2475
        // See commit message for this and prior.
2476
31.8k
        c->ref_seq_start = 0;
2477
31.8k
        c->ref_seq_span  = 0;
2478
32.1k
    } else {
2479
32.1k
        c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
2480
32.1k
        c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
2481
32.1k
    }
2482
127k
    for (i = 0; i < c->curr_slice; i++) {
2483
63.9k
        cram_slice *s = c->slices[i];
2484
2485
63.9k
        c->num_blocks += s->hdr->num_blocks + 1; // slice header
2486
63.9k
        c->landmark[i] = slice_offset;
2487
2488
63.9k
        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
2489
63.9k
            c->ref_seq_start + c->ref_seq_span) {
2490
0
            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
2491
0
                - c->ref_seq_start;
2492
0
        }
2493
2494
63.9k
        slice_offset += s->hdr_block->method == RAW
2495
63.9k
            ? s->hdr_block->uncomp_size
2496
63.9k
            : s->hdr_block->comp_size;
2497
2498
63.9k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2499
63.9k
            fd->vv.varint_size(s->hdr_block->content_id) +
2500
63.9k
            fd->vv.varint_size(s->hdr_block->comp_size) +
2501
63.9k
            fd->vv.varint_size(s->hdr_block->uncomp_size);
2502
2503
433k
        for (j = 0; j < s->hdr->num_blocks; j++) {
2504
369k
            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2505
369k
                fd->vv.varint_size(s->block[j]->content_id) +
2506
369k
                fd->vv.varint_size(s->block[j]->comp_size) +
2507
369k
                fd->vv.varint_size(s->block[j]->uncomp_size);
2508
2509
369k
            slice_offset += s->block[j]->method == RAW
2510
369k
                ? s->block[j]->uncomp_size
2511
369k
                : s->block[j]->comp_size;
2512
369k
        }
2513
63.9k
    }
2514
63.9k
    c->length += slice_offset; // just past the final slice
2515
2516
63.9k
    c->comp_hdr_block = c_hdr;
2517
2518
63.9k
    if (c->ref_seq_id >= 0) {
2519
31.2k
        if (c->ref_free) {
2520
31.1k
            free(c->ref);
2521
31.1k
            c->ref = NULL;
2522
31.1k
        } else {
2523
146
            cram_ref_decr(fd->refs, c->ref_seq_id);
2524
146
        }
2525
31.2k
    }
2526
2527
    /* Cache references up-front if we have unsorted access patterns */
2528
63.9k
    if (!no_ref && c->refs_used) {
2529
45
        for (i = 0; i < fd->refs->nref; i++) {
2530
25
            if (c->refs_used[i])
2531
0
                cram_ref_decr(fd->refs, i);
2532
25
        }
2533
20
    }
2534
2535
63.9k
    return 0;
2536
2537
48
 err:
2538
48
    return -1;
2539
63.9k
}
2540
2541
2542
/*
2543
 * Adds a feature code to a read within a slice. For purposes of minimising
2544
 * memory allocations and fragmentation we have one array of features for all
2545
 * reads within the slice. We return the index into this array for this new
2546
 * feature.
2547
 *
2548
 * Returns feature index on success
2549
 *         -1 on failure.
2550
 */
2551
static int cram_add_feature(cram_container *c, cram_slice *s,
2552
185k
                            cram_record *r, cram_feature *f) {
2553
185k
    if (s->nfeatures >= s->afeatures) {
2554
29.6k
        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
2555
29.6k
        s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
2556
29.6k
        if (!s->features)
2557
0
            return -1;
2558
29.6k
    }
2559
2560
185k
    if (!r->nfeature++) {
2561
84.7k
        r->feature = s->nfeatures;
2562
84.7k
        if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2563
0
            return -1;
2564
101k
    } else {
2565
101k
        if (cram_stats_add(c->stats[DS_FP],
2566
101k
                           f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2567
0
            return -1;
2568
2569
101k
    }
2570
185k
    if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2571
0
        return -1;
2572
2573
185k
    s->features[s->nfeatures++] = *f;
2574
2575
185k
    return 0;
2576
185k
}
2577
2578
static int cram_add_substitution(cram_fd *fd, cram_container *c,
2579
                                 cram_slice *s, cram_record *r,
2580
992
                                 int pos, char base, char qual, char ref) {
2581
992
    cram_feature f;
2582
2583
    // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2584
992
    if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2585
438
        f.X.pos = pos+1;
2586
438
        f.X.code = 'X';
2587
438
        f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2588
438
        if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2589
0
            return -1;
2590
554
    } else {
2591
554
        f.B.pos = pos+1;
2592
554
        f.B.code = 'B';
2593
554
        f.B.base = base;
2594
554
        f.B.qual = qual;
2595
554
        if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2596
554
        if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2597
554
        BLOCK_APPEND_CHAR(s->qual_blk, qual);
2598
554
    }
2599
992
    return cram_add_feature(c, s, r, &f);
2600
2601
0
 block_err:
2602
0
    return -1;
2603
992
}
2604
2605
static int cram_add_bases(cram_fd *fd, cram_container *c,
2606
                          cram_slice *s, cram_record *r,
2607
229
                          int pos, int len, char *base) {
2608
229
    cram_feature f;
2609
2610
229
    f.b.pos = pos+1;
2611
229
    f.b.code = 'b';
2612
229
    f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2613
229
    f.b.len = len;
2614
2615
229
    return cram_add_feature(c, s, r, &f);
2616
229
}
2617
2618
static int cram_add_base(cram_fd *fd, cram_container *c,
2619
                         cram_slice *s, cram_record *r,
2620
1.22k
                         int pos, char base, char qual) {
2621
1.22k
    cram_feature f;
2622
1.22k
    f.B.pos = pos+1;
2623
1.22k
    f.B.code = 'B';
2624
1.22k
    f.B.base = base;
2625
1.22k
    f.B.qual = qual;
2626
1.22k
    if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2627
1.22k
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2628
1.22k
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2629
1.22k
    return cram_add_feature(c, s, r, &f);
2630
2631
0
 block_err:
2632
0
    return -1;
2633
1.22k
}
2634
2635
static int cram_add_quality(cram_fd *fd, cram_container *c,
2636
                            cram_slice *s, cram_record *r,
2637
29
                            int pos, char qual) {
2638
29
    cram_feature f;
2639
29
    f.Q.pos = pos+1;
2640
29
    f.Q.code = 'Q';
2641
29
    f.Q.qual = qual;
2642
29
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2643
29
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2644
29
    return cram_add_feature(c, s, r, &f);
2645
2646
0
 block_err:
2647
0
    return -1;
2648
29
}
2649
2650
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2651
61.9k
                             int pos, int len, char *base) {
2652
61.9k
    cram_feature f;
2653
61.9k
    f.D.pos = pos+1;
2654
61.9k
    f.D.code = 'D';
2655
61.9k
    f.D.len = len;
2656
61.9k
    if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2657
61.9k
    return cram_add_feature(c, s, r, &f);
2658
61.9k
}
2659
2660
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2661
78.7k
                             int pos, int len, char *base, int version) {
2662
78.7k
    cram_feature f;
2663
78.7k
    f.S.pos = pos+1;
2664
78.7k
    f.S.code = 'S';
2665
78.7k
    f.S.len = len;
2666
78.7k
    switch (CRAM_MAJOR_VERS(version)) {
2667
0
    case 1:
2668
0
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2669
0
        BLOCK_APPEND(s->base_blk, base, len);
2670
0
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2671
0
        break;
2672
2673
0
    case 2:
2674
78.7k
    default:
2675
78.7k
        f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2676
78.7k
        if (base) {
2677
1.00k
            BLOCK_APPEND(s->soft_blk, base, len);
2678
77.7k
        } else {
2679
77.7k
            int i;
2680
201k
            for (i = 0; i < len; i++)
2681
123k
                BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2682
77.7k
        }
2683
78.7k
        BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2684
78.7k
        break;
2685
2686
        //default:
2687
        //    // v3.0 onwards uses BB data-series
2688
        //    f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2689
78.7k
    }
2690
78.7k
    return cram_add_feature(c, s, r, &f);
2691
2692
0
 block_err:
2693
0
    return -1;
2694
78.7k
}
2695
2696
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2697
17.9k
                             int pos, int len, char *base) {
2698
17.9k
    cram_feature f;
2699
17.9k
    f.S.pos = pos+1;
2700
17.9k
    f.S.code = 'H';
2701
17.9k
    f.S.len = len;
2702
17.9k
    if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2703
17.9k
    return cram_add_feature(c, s, r, &f);
2704
17.9k
}
2705
2706
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2707
849
                         int pos, int len, char *base) {
2708
849
    cram_feature f;
2709
849
    f.S.pos = pos+1;
2710
849
    f.S.code = 'N';
2711
849
    f.S.len = len;
2712
849
    if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2713
849
    return cram_add_feature(c, s, r, &f);
2714
849
}
2715
2716
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2717
187
                        int pos, int len, char *base) {
2718
187
    cram_feature f;
2719
187
    f.S.pos = pos+1;
2720
187
    f.S.code = 'P';
2721
187
    f.S.len = len;
2722
187
    if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2723
187
    return cram_add_feature(c, s, r, &f);
2724
187
}
2725
2726
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2727
23.5k
                              int pos, int len, char *base) {
2728
23.5k
    cram_feature f;
2729
23.5k
    f.I.pos = pos+1;
2730
23.5k
    if (len == 1) {
2731
22.7k
        char b = base ? *base : 'N';
2732
22.7k
        f.i.code = 'i';
2733
22.7k
        f.i.base = b;
2734
22.7k
        if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2735
22.7k
    } else {
2736
832
        f.I.code = 'I';
2737
832
        f.I.len = len;
2738
832
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2739
832
        if (base) {
2740
98
            BLOCK_APPEND(s->base_blk, base, len);
2741
734
        } else {
2742
734
            int i;
2743
14.6M
            for (i = 0; i < len; i++)
2744
14.6M
                BLOCK_APPEND_CHAR(s->base_blk, 'N');
2745
734
        }
2746
832
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2747
832
    }
2748
23.5k
    return cram_add_feature(c, s, r, &f);
2749
2750
0
 block_err:
2751
0
    return -1;
2752
23.5k
}
2753
2754
/*
2755
 * Encodes auxiliary data. Largely duplicated from above, but done so to
2756
 * keep it simple and avoid a myriad of version ifs.
2757
 *
2758
 * Returns the RG header line pointed to by the BAM aux fields on success,
2759
 *         NULL on failure or no rg present, also sets "*err" to non-zero
2760
 */
2761
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
2762
                                      cram_container *c,
2763
                                      cram_slice *s, cram_record *cr,
2764
                                      int verbatim_NM, int verbatim_MD,
2765
                                      int NM, kstring_t *MD, int cf_tag,
2766
3.43M
                                      int no_ref, int *err) {
2767
3.43M
    char *aux, *orig;
2768
3.43M
    sam_hrec_rg_t *brg = NULL;
2769
3.43M
    int aux_size = bam_get_l_aux(b);
2770
3.43M
    const char *aux_end = bam_data_end(b);
2771
3.43M
    cram_block *td_b = c->comp_hdr->TD_blk;
2772
3.43M
    int TD_blk_size = BLOCK_SIZE(td_b), new;
2773
3.43M
    char *key;
2774
3.43M
    khint_t k;
2775
2776
3.43M
    if (err) *err = 1;
2777
2778
3.43M
    orig = aux = (char *)bam_aux(b);
2779
2780
2781
    // cF:i  => Extra CRAM bit flags.
2782
    // 1:  Don't auto-decode MD (may be invalid)
2783
    // 2:  Don't auto-decode NM (may be invalid)
2784
3.43M
    if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
2785
        // Temporary copy of aux so we can ammend it.
2786
86.4k
        aux = malloc(aux_size+4);
2787
86.4k
        if (!aux)
2788
0
            return NULL;
2789
2790
86.4k
        memcpy(aux, orig, aux_size);
2791
86.4k
        aux[aux_size++] = 'c';
2792
86.4k
        aux[aux_size++] = 'F';
2793
86.4k
        aux[aux_size++] = 'C';
2794
86.4k
        aux[aux_size++] = cf_tag;
2795
86.4k
        orig = aux;
2796
86.4k
        aux_end = aux + aux_size;
2797
86.4k
    }
2798
2799
    // Copy aux keys to td_b and aux values to slice aux blocks
2800
4.06M
    while (aux_end - aux >= 1 && aux[0] != 0) {
2801
630k
        int r;
2802
2803
        // Room for code + type + at least 1 byte of data
2804
630k
        if (aux - orig >= aux_size - 3)
2805
1
            goto err;
2806
2807
        // RG:Z
2808
630k
        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2809
706
            char *rg = &aux[3];
2810
706
            aux = rg;
2811
151k
            while (aux < aux_end && *aux++);
2812
706
            if (aux == aux_end && aux[-1] != '\0') {
2813
0
                hts_log_error("Unterminated RG:Z tag for read \"%s\"",
2814
0
                              bam_get_qname(b));
2815
0
                goto err;
2816
0
            }
2817
706
            brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
2818
706
            if (brg) {
2819
117
                if (CRAM_MAJOR_VERS(fd->version) >= 4)
2820
0
                    BLOCK_APPEND(td_b, "RG*", 3);
2821
117
                continue;
2822
589
            } else {
2823
                // RG:Z tag will be stored verbatim
2824
589
                hts_log_warning("Missing @RG header for RG \"%s\"", rg);
2825
589
                aux = rg - 3;
2826
589
            }
2827
706
        }
2828
2829
        // MD:Z
2830
630k
        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2831
70.2k
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2832
1.35k
                if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2833
18
                    while (aux < aux_end && *aux++);
2834
3
                    if (aux == aux_end && aux[-1] != '\0') {
2835
0
                        hts_log_error("Unterminated MD:Z tag for read \"%s\"",
2836
0
                                      bam_get_qname(b));
2837
0
                        goto err;
2838
0
                    }
2839
3
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2840
0
                        BLOCK_APPEND(td_b, "MD*", 3);
2841
3
                    continue;
2842
3
                }
2843
1.35k
            }
2844
70.2k
        }
2845
2846
        // NM:i
2847
630k
        if (aux[0] == 'N' && aux[1] == 'M') {
2848
180
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2849
0
                int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
2850
0
                if (NM_ == NM) {
2851
0
                    switch(aux[2]) {
2852
0
                    case 'A': case 'C': case 'c': aux+=4; break;
2853
0
                    case 'S': case 's':           aux+=5; break;
2854
0
                    case 'I': case 'i': case 'f': aux+=7; break;
2855
0
                    default:
2856
0
                        hts_log_error("Unhandled type code for NM tag");
2857
0
                        goto err;
2858
0
                    }
2859
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2860
0
                        BLOCK_APPEND(td_b, "NM*", 3);
2861
0
                    continue;
2862
0
                }
2863
0
            }
2864
180
        }
2865
2866
630k
        BLOCK_APPEND(td_b, aux, 3);
2867
2868
        // Container level tags_used, for TD series
2869
        // Maps integer key ('X0i') to cram_tag_map struct.
2870
630k
        int key = (((unsigned char *) aux)[0]<<16 |
2871
630k
                   ((unsigned char *) aux)[1]<<8  |
2872
630k
                   ((unsigned char *) aux)[2]);
2873
630k
        k = kh_put(m_tagmap, c->tags_used, key, &r);
2874
630k
        if (-1 == r)
2875
0
            goto err;
2876
630k
        else if (r != 0)
2877
171k
            kh_val(c->tags_used, k) = NULL;
2878
2879
630k
        if (r == 1) {
2880
171k
            khint_t k_global;
2881
2882
            // Global tags_used for cram_metrics support
2883
171k
            pthread_mutex_lock(&fd->metrics_lock);
2884
171k
            k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2885
171k
            if (-1 == r) {
2886
0
                pthread_mutex_unlock(&fd->metrics_lock);
2887
0
                goto err;
2888
0
            }
2889
171k
            if (r >= 1) {
2890
8.61k
                kh_val(fd->tags_used, k_global) = cram_new_metrics();
2891
8.61k
                if (!kh_val(fd->tags_used, k_global)) {
2892
0
                    kh_del(m_metrics, fd->tags_used, k_global);
2893
0
                    pthread_mutex_unlock(&fd->metrics_lock);
2894
0
                    goto err;
2895
0
                }
2896
8.61k
            }
2897
2898
171k
            pthread_mutex_unlock(&fd->metrics_lock);
2899
2900
171k
            int i2[2] = {'\t',key};
2901
171k
            size_t sk = key;
2902
171k
            cram_tag_map *m = calloc(1, sizeof(*m));
2903
171k
            if (!m)
2904
0
                goto_err;
2905
171k
            kh_val(c->tags_used, k) = m;
2906
2907
171k
            cram_codec *c;
2908
2909
            // Use a block content id based on the tag id.
2910
            // Codec type depends on tag data type.
2911
171k
            switch(aux[2]) {
2912
62.1k
            case 'Z': case 'H':
2913
                // string as byte_array_stop
2914
62.1k
                c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2915
62.1k
                                      E_BYTE_ARRAY, (void *)i2,
2916
62.1k
                                      fd->version, &fd->vv);
2917
62.1k
                break;
2918
2919
95.0k
            case 'A': case 'c': case 'C': {
2920
                // byte array len, 1 byte
2921
95.0k
                cram_byte_array_len_encoder e;
2922
95.0k
                cram_stats st;
2923
2924
95.0k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2925
95.0k
                    e.len_encoding = E_HUFFMAN;
2926
95.0k
                    e.len_dat = NULL; // will get codes from st
2927
95.0k
                } else {
2928
0
                    e.len_encoding = E_CONST_INT;
2929
0
                    e.len_dat = NULL; // will get codes from st
2930
0
                }
2931
95.0k
                memset(&st, 0, sizeof(st));
2932
95.0k
                if (cram_stats_add(&st, 1) < 0) goto block_err;
2933
95.0k
                cram_stats_encoding(fd, &st);
2934
2935
95.0k
                e.val_encoding = E_EXTERNAL;
2936
95.0k
                e.val_dat = (void *)sk;
2937
2938
95.0k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2939
95.0k
                                      E_BYTE_ARRAY, (void *)&e,
2940
95.0k
                                      fd->version, &fd->vv);
2941
95.0k
                break;
2942
95.0k
            }
2943
2944
3.96k
            case 's': case 'S': {
2945
                // byte array len, 2 byte
2946
3.96k
                cram_byte_array_len_encoder e;
2947
3.96k
                cram_stats st;
2948
2949
3.96k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2950
3.96k
                    e.len_encoding = E_HUFFMAN;
2951
3.96k
                    e.len_dat = NULL; // will get codes from st
2952
3.96k
                } else {
2953
0
                    e.len_encoding = E_CONST_INT;
2954
0
                    e.len_dat = NULL; // will get codes from st
2955
0
                }
2956
3.96k
                memset(&st, 0, sizeof(st));
2957
3.96k
                if (cram_stats_add(&st, 2) < 0) goto block_err;
2958
3.96k
                cram_stats_encoding(fd, &st);
2959
2960
3.96k
                e.val_encoding = E_EXTERNAL;
2961
3.96k
                e.val_dat = (void *)sk;
2962
2963
3.96k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2964
3.96k
                                      E_BYTE_ARRAY, (void *)&e,
2965
3.96k
                                      fd->version, &fd->vv);
2966
3.96k
                break;
2967
3.96k
            }
2968
9.42k
            case 'i': case 'I': case 'f': {
2969
                // byte array len, 4 byte
2970
9.42k
                cram_byte_array_len_encoder e;
2971
9.42k
                cram_stats st;
2972
2973
9.42k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2974
9.42k
                    e.len_encoding = E_HUFFMAN;
2975
9.42k
                    e.len_dat = NULL; // will get codes from st
2976
9.42k
                } else {
2977
0
                    e.len_encoding = E_CONST_INT;
2978
0
                    e.len_dat = NULL; // will get codes from st
2979
0
                }
2980
9.42k
                memset(&st, 0, sizeof(st));
2981
9.42k
                if (cram_stats_add(&st, 4) < 0) goto block_err;
2982
9.42k
                cram_stats_encoding(fd, &st);
2983
2984
9.42k
                e.val_encoding = E_EXTERNAL;
2985
9.42k
                e.val_dat = (void *)sk;
2986
2987
9.42k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2988
9.42k
                                      E_BYTE_ARRAY, (void *)&e,
2989
9.42k
                                      fd->version, &fd->vv);
2990
9.42k
                break;
2991
9.42k
            }
2992
2993
1.16k
            case 'B': {
2994
                // Byte array of variable size, but we generate our tag
2995
                // byte stream at the wrong stage (during reading and not
2996
                // after slice header construction). So we use
2997
                // BYTE_ARRAY_LEN with the length codec being external
2998
                // too.
2999
1.16k
                cram_byte_array_len_encoder e;
3000
3001
1.16k
                e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
3002
1.16k
                    ? E_VARINT_UNSIGNED
3003
1.16k
                    : E_EXTERNAL;
3004
1.16k
                e.len_dat = (void *)sk; // or key+128 for len?
3005
3006
1.16k
                e.val_encoding = E_EXTERNAL;
3007
1.16k
                e.val_dat = (void *)sk;
3008
3009
1.16k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
3010
1.16k
                                      E_BYTE_ARRAY, (void *)&e,
3011
1.16k
                                      fd->version, &fd->vv);
3012
1.16k
                break;
3013
9.42k
            }
3014
3015
62
            default:
3016
62
                hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
3017
62
                c = NULL;
3018
171k
            }
3019
3020
171k
            if (!c)
3021
62
                goto_err;
3022
3023
171k
            m->codec = c;
3024
3025
            // Link to fd-global tag metrics
3026
171k
            pthread_mutex_lock(&fd->metrics_lock);
3027
171k
            m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
3028
171k
            pthread_mutex_unlock(&fd->metrics_lock);
3029
171k
        }
3030
3031
630k
        cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3032
630k
        if (!tm) goto_err;
3033
630k
        cram_codec *codec = tm->codec;
3034
630k
        if (!tm->codec) goto_err;
3035
3036
630k
        switch(aux[2]) {
3037
315k
        case 'A': case 'C': case 'c':
3038
315k
            if (aux_end - aux < 3+1)
3039
0
                goto err;
3040
3041
315k
            if (!tm->blk) {
3042
95.0k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3043
0
                    goto err;
3044
95.0k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3045
95.0k
            }
3046
3047
315k
            aux+=3;
3048
            //codec->encode(s, codec, aux, 1);
3049
            // Functionally equivalent, but less code.
3050
315k
            BLOCK_APPEND_CHAR(tm->blk, *aux);
3051
315k
            aux++;
3052
315k
            break;
3053
3054
18.5k
        case 'S': case 's':
3055
18.5k
            if (aux_end - aux < 3+2)
3056
0
                goto err;
3057
3058
18.5k
            if (!tm->blk) {
3059
3.96k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3060
0
                    goto err;
3061
3.96k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3062
3.96k
            }
3063
3064
18.5k
            aux+=3;
3065
            //codec->encode(s, codec, aux, 2);
3066
18.5k
            BLOCK_APPEND(tm->blk, aux, 2);
3067
18.5k
            aux+=2;
3068
18.5k
            break;
3069
3070
66.8k
        case 'I': case 'i': case 'f':
3071
66.8k
            if (aux_end - aux < 3+4)
3072
0
                goto err;
3073
3074
66.8k
            if (!tm->blk) {
3075
9.42k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3076
0
                    goto err;
3077
9.42k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3078
9.42k
            }
3079
3080
66.8k
            aux+=3;
3081
            //codec->encode(s, codec, aux, 4);
3082
66.8k
            BLOCK_APPEND(tm->blk, aux, 4);
3083
66.8k
            aux+=4;
3084
66.8k
            break;
3085
3086
0
        case 'd':
3087
0
            if (aux_end - aux < 3+8)
3088
0
                goto err;
3089
3090
0
            if (!tm->blk) {
3091
0
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3092
0
                    goto err;
3093
0
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3094
0
            }
3095
3096
0
            aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
3097
            //codec->encode(s, codec, aux, 8);
3098
0
            BLOCK_APPEND(tm->blk, aux, 8);
3099
0
            aux+=8;
3100
0
            break;
3101
3102
189k
        case 'Z': case 'H': {
3103
189k
            if (aux_end - aux < 3)
3104
0
                goto err;
3105
3106
189k
            if (!tm->blk) {
3107
62.1k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3108
0
                    goto err;
3109
62.1k
                codec->out = tm->blk;
3110
62.1k
            }
3111
3112
189k
            char *aux_s;
3113
189k
            aux += 3;
3114
189k
            aux_s = aux;
3115
20.4M
            while (aux < aux_end && *aux++);
3116
189k
            if (aux == aux_end && aux[-1] != '\0') {
3117
0
                hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
3118
0
                              aux_s[-3], aux_s[-2], aux_s[-1],
3119
0
                              bam_get_qname(b));
3120
0
                goto err;
3121
0
            }
3122
189k
            if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
3123
0
                goto err;
3124
189k
            break;
3125
189k
        }
3126
3127
189k
        case 'B': {
3128
40.2k
            if (aux_end - aux < 4+4)
3129
0
                goto err;
3130
3131
40.2k
            int type = aux[3];
3132
40.2k
            uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
3133
40.2k
                              ((uint64_t)((unsigned char *)aux)[5]) << 8 |
3134
40.2k
                              ((uint64_t)((unsigned char *)aux)[6]) <<16 |
3135
40.2k
                              ((uint64_t)((unsigned char *)aux)[7]) <<24);
3136
40.2k
            uint64_t blen;
3137
40.2k
            if (!tm->blk) {
3138
1.16k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3139
0
                    goto err;
3140
1.16k
                if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
3141
0
                    if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
3142
0
                        goto err;
3143
0
                    codec->u.e_byte_array_len.len_codec->out = tm->blk2;
3144
0
                    codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
3145
1.16k
                } else {
3146
1.16k
                    codec->u.e_byte_array_len.len_codec->out = tm->blk;
3147
1.16k
                    codec->u.e_byte_array_len.val_codec->out = tm->blk;
3148
1.16k
                }
3149
1.16k
            }
3150
3151
            // skip TN field
3152
40.2k
            aux+=3;
3153
3154
            // We use BYTE_ARRAY_LEN with external length, so store that first
3155
40.2k
            switch (type) {
3156
10.6k
            case 'c': case 'C':
3157
10.6k
                blen = count;
3158
10.6k
                break;
3159
4.98k
            case 's': case 'S':
3160
4.98k
                blen = 2*count;
3161
4.98k
                break;
3162
24.6k
            case 'i': case 'I': case 'f':
3163
24.6k
                blen = 4*count;
3164
24.6k
                break;
3165
11
            default:
3166
11
                hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
3167
11
                goto err;
3168
40.2k
            }
3169
3170
40.2k
            blen += 5; // sub-type & length
3171
40.2k
            if (aux_end - aux < blen || blen > INT_MAX)
3172
0
                goto err;
3173
3174
40.2k
            if (codec->encode(s, codec, aux, (int) blen) < 0)
3175
0
                goto err;
3176
40.2k
            aux += blen;
3177
40.2k
            break;
3178
40.2k
        }
3179
0
        default:
3180
0
            hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
3181
0
            goto err;
3182
630k
        }
3183
630k
        tm->blk->m = tm->m;
3184
630k
    }
3185
3186
    // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
3187
3188
    // And and increment TD hash entry
3189
3.43M
    BLOCK_APPEND_CHAR(td_b, 0);
3190
3191
    // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
3192
3.43M
    key = string_ndup(c->comp_hdr->TD_keys,
3193
3.43M
                      (char *)BLOCK_DATA(td_b) + TD_blk_size,
3194
3.43M
                      BLOCK_SIZE(td_b) - TD_blk_size);
3195
3.43M
    if (!key)
3196
0
        goto block_err;
3197
3.43M
    k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
3198
3.43M
    if (new < 0) {
3199
0
        goto err;
3200
3.43M
    } else if (new == 0) {
3201
3.35M
        BLOCK_SIZE(td_b) = TD_blk_size;
3202
3.35M
    } else {
3203
77.5k
        kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
3204
77.5k
        c->comp_hdr->nTL++;
3205
77.5k
    }
3206
3207
3.43M
    cr->TL = kh_val(c->comp_hdr->TD_hash, k);
3208
3.43M
    if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
3209
0
        goto block_err;
3210
3211
3.43M
    if (orig != (char *)bam_aux(b))
3212
86.3k
        free(orig);
3213
3214
3.43M
    if (err) *err = 0;
3215
3216
3.43M
    return brg;
3217
3218
74
 err:
3219
74
 block_err:
3220
74
    if (orig != (char *)bam_aux(b))
3221
9
        free(orig);
3222
74
    return NULL;
3223
74
}
3224
3225
/*
3226
 * During cram_next_container or before the final flush at end of
3227
 * file, we update the current slice headers and increment the slice
3228
 * number to the next slice.
3229
 *
3230
 * See cram_next_container() and cram_close().
3231
 */
3232
64.2k
void cram_update_curr_slice(cram_container *c, int version) {
3233
64.2k
    cram_slice *s = c->slice;
3234
64.2k
    if (c->multi_seq) {
3235
849
        s->hdr->ref_seq_id    = -2;
3236
849
        s->hdr->ref_seq_start = 0;
3237
849
        s->hdr->ref_seq_span  = 0;
3238
63.3k
    } else if (c->curr_ref == -1 && CRAM_ge31(version)) {
3239
        // Spec states span=0, but it broke our range queries.
3240
        // See commit message for this and prior.
3241
31.9k
        s->hdr->ref_seq_id    = -1;
3242
31.9k
        s->hdr->ref_seq_start = 0;
3243
31.9k
        s->hdr->ref_seq_span  = 0;
3244
31.9k
    } else {
3245
31.4k
        s->hdr->ref_seq_id    = c->curr_ref;
3246
31.4k
        s->hdr->ref_seq_start = c->first_base;
3247
31.4k
        s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
3248
31.4k
    }
3249
64.2k
    s->hdr->num_records   = c->curr_rec;
3250
3251
64.2k
    if (c->curr_slice == 0) {
3252
64.2k
        if (c->ref_seq_id != s->hdr->ref_seq_id)
3253
39.5k
            c->ref_seq_id  = s->hdr->ref_seq_id;
3254
64.2k
        c->ref_seq_start = c->first_base;
3255
64.2k
    }
3256
3257
64.2k
    c->curr_slice++;
3258
64.2k
}
3259
3260
/*
3261
 * Handles creation of a new container or new slice, flushing any
3262
 * existing containers when appropriate.
3263
 *
3264
 * Really this is next slice, which may or may not lead to a new container.
3265
 *
3266
 * Returns cram_container pointer on success
3267
 *         NULL on failure.
3268
 */
3269
64.2k
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
3270
64.2k
    cram_container *c = fd->ctr;
3271
64.2k
    int i;
3272
3273
    /* First occurrence */
3274
64.2k
    if (c->curr_ref == -2)
3275
2.24k
        c->curr_ref = bam_ref(b);
3276
3277
64.2k
    if (c->slice)
3278
62.0k
        cram_update_curr_slice(c, fd->version);
3279
3280
    /* Flush container */
3281
64.2k
    if (c->curr_slice == c->max_slice ||
3282
64.2k
        (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
3283
62.0k
        c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
3284
62.0k
        hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
3285
62.0k
                     c->ref_seq_id, c->ref_seq_start,
3286
62.0k
                     c->ref_seq_start + c->ref_seq_span -1);
3287
3288
        /* Encode slices */
3289
62.0k
        if (-1 == cram_flush_container_mt(fd, c))
3290
21
            return NULL;
3291
61.9k
        if (!fd->pool) {
3292
            // Move to sep func, as we need cram_flush_container for
3293
            // the closing phase to flush the partial container.
3294
123k
            for (i = 0; i < c->max_slice; i++) {
3295
61.9k
                cram_free_slice(c->slices[i]);
3296
61.9k
                c->slices[i] = NULL;
3297
61.9k
            }
3298
3299
61.9k
            c->slice = NULL;
3300
61.9k
            c->curr_slice = 0;
3301
3302
            /* Easy approach for purposes of freeing stats */
3303
61.9k
            cram_free_container(c);
3304
61.9k
        }
3305
3306
61.9k
        c = fd->ctr = cram_new_container(fd->seqs_per_slice,
3307
61.9k
                                         fd->slices_per_container);
3308
61.9k
        if (!c)
3309
0
            return NULL;
3310
3311
61.9k
        pthread_mutex_lock(&fd->ref_lock);
3312
61.9k
        c->no_ref = fd->no_ref;
3313
61.9k
        c->embed_ref = fd->embed_ref;
3314
61.9k
        c->record_counter = fd->record_counter;
3315
61.9k
        pthread_mutex_unlock(&fd->ref_lock);
3316
61.9k
        c->curr_ref = bam_ref(b);
3317
61.9k
    }
3318
3319
64.2k
    c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
3320
3321
    /* New slice */
3322
64.2k
    c->slice = c->slices[c->curr_slice] =
3323
64.2k
        cram_new_slice(MAPPED_SLICE, c->max_rec);
3324
64.2k
    if (!c->slice)
3325
0
        return NULL;
3326
3327
64.2k
    if (c->multi_seq) {
3328
0
        c->slice->hdr->ref_seq_id = -2;
3329
0
        c->slice->hdr->ref_seq_start = 0;
3330
0
        c->slice->last_apos = 1;
3331
64.2k
    } else {
3332
64.2k
        c->slice->hdr->ref_seq_id = bam_ref(b);
3333
        // wrong for unsorted data, will fix during encoding.
3334
64.2k
        c->slice->hdr->ref_seq_start = bam_pos(b)+1;
3335
64.2k
        c->slice->last_apos = bam_pos(b)+1;
3336
64.2k
    }
3337
3338
64.2k
    c->curr_rec = 0;
3339
64.2k
    c->s_num_bases = 0;
3340
64.2k
    c->n_mapped = 0;
3341
3342
    // QO field: 0 implies original orientation, 1 implies sequence orientation
3343
    // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
3344
    // Conversely other data sets it's often better than 1% saving for 0.
3345
    // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
3346
64.2k
    c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
3347
3348
64.2k
    return c;
3349
64.2k
}
3350
3351
3352
/*
3353
 * Converts a single bam record into a cram record.
3354
 * Possibly used within a thread.
3355
 *
3356
 * Returns 0 on success;
3357
 *        -1 on failure
3358
 */
3359
static int process_one_read(cram_fd *fd, cram_container *c,
3360
                            cram_slice *s, cram_record *cr,
3361
                            bam_seq_t *b, int rnum, kstring_t *MD,
3362
3.43M
                            int embed_ref, int no_ref) {
3363
3.43M
    int i, fake_qual = -1, NM = 0;
3364
3.43M
    char *cp;
3365
3.43M
    char *ref, *seq, *qual;
3366
3367
    // Any places with N in seq and/or reference can lead to ambiguous
3368
    // interpretation of the SAM NM:i tag.  So we store these verbatim
3369
    // to ensure valid data round-trips the same regardless of who
3370
    // defines it as valid.
3371
    // Similarly when alignments go beyond end of the reference.
3372
3.43M
    int verbatim_NM = fd->store_nm;
3373
3.43M
    int verbatim_MD = fd->store_md;
3374
3375
    // FIXME: multi-ref containers
3376
3377
3.43M
    cr->flags       = bam_flag(b);
3378
3.43M
    cr->len         = bam_seq_len(b);
3379
3.43M
    uint8_t *md;
3380
3.43M
    if (!(md = bam_aux_get(b, "MD")))
3381
3.36M
        MD = NULL;
3382
69.6k
    else
3383
69.6k
        MD->l = 0;
3384
3385
3.43M
    int cf_tag = 0;
3386
3387
3.43M
    if (embed_ref == 2) {
3388
86.4k
        cf_tag  = MD ? 0 : 1;                   // No MD
3389
86.4k
        cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
3390
86.4k
    }
3391
3392
    //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
3393
3394
3.43M
    ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
3395
3.43M
    cr->ref_id      = bam_ref(b);
3396
3.43M
    if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
3397
0
        goto block_err;
3398
3.43M
    if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
3399
0
        goto block_err;
3400
3401
    // Non reference based encoding means storing the bases verbatim as features, which in
3402
    // turn means every base also has a quality already stored.
3403
3.43M
    if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
3404
3.43M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3405
3406
3.43M
    if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
3407
3.12M
        cr->cram_flags |= CRAM_FLAG_NO_SEQ;
3408
    //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3409
3410
3.43M
    c->num_bases   += cr->len;
3411
3.43M
    cr->apos        = bam_pos(b)+1;
3412
3.43M
    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
3413
2
        goto err;
3414
3.43M
    if (c->pos_sorted) {
3415
3.41M
        if (cr->apos < s->last_apos && !fd->ap_delta) {
3416
266
            c->pos_sorted = 0;
3417
3.41M
        } else {
3418
3.41M
            if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
3419
0
                goto block_err;
3420
3.41M
            s->last_apos = cr->apos;
3421
3.41M
        }
3422
3.41M
    } else {
3423
        //cram_stats_add(c->stats[DS_AP], cr->apos);
3424
17.8k
    }
3425
3.43M
    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
3426
3427
    /*
3428
     * This seqs_ds is largely pointless and it could reuse the same memory
3429
     * over and over.
3430
     * s->base_blk is what we need for encoding.
3431
     */
3432
3.43M
    cr->seq         = BLOCK_SIZE(s->seqs_blk);
3433
3.43M
    cr->qual        = BLOCK_SIZE(s->qual_blk);
3434
3.43M
    BLOCK_GROW(s->seqs_blk, cr->len+1);
3435
3.43M
    BLOCK_GROW(s->qual_blk, cr->len);
3436
3437
    // Convert BAM nibble encoded sequence to string of base pairs
3438
3.43M
    seq = cp = (char *)BLOCK_END(s->seqs_blk);
3439
3.43M
    *seq = 0;
3440
3.43M
    nibble2base(bam_seq(b), cp, cr->len);
3441
3.43M
    BLOCK_SIZE(s->seqs_blk) += cr->len;
3442
3443
3.43M
    qual = cp = (char *)bam_qual(b);
3444
3445
3446
    /* Copy and parse */
3447
3.43M
    if (!(cr->flags & BAM_FUNMAP)) {
3448
85.0k
        uint32_t *cig_to, *cig_from;
3449
85.0k
        int64_t apos = cr->apos-1, spos = 0;
3450
85.0k
        int64_t MD_last = apos; // last position of edit in MD tag
3451
3452
85.0k
        if (apos < 0) {
3453
0
            hts_log_error("Mapped read with position <= 0 is disallowed");
3454
0
            return -1;
3455
0
        }
3456
3457
85.0k
        cr->cigar       = s->ncigar;
3458
85.0k
        cr->ncigar      = bam_cigar_len(b);
3459
85.3k
        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
3460
341
            s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
3461
341
            s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
3462
341
            if (!s->cigar)
3463
0
                return -1;
3464
341
        }
3465
3466
85.0k
        cig_to = (uint32_t *)s->cigar;
3467
85.0k
        cig_from = (uint32_t *)bam_cigar(b);
3468
3469
85.0k
        cr->feature = 0;
3470
85.0k
        cr->nfeature = 0;
3471
869k
        for (i = 0; i < cr->ncigar; i++) {
3472
784k
            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
3473
784k
            uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
3474
784k
            cig_to[i] = cig_from[i];
3475
3476
            /* Can also generate events from here for CRAM diffs */
3477
3478
784k
            switch (cig_op) {
3479
0
                int l;
3480
3481
                // Don't trust = and X ops to be correct.
3482
565k
            case BAM_CMATCH:
3483
599k
            case BAM_CBASE_MATCH:
3484
600k
            case BAM_CBASE_MISMATCH:
3485
                //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
3486
                //      cig_len, &ref[apos], cig_len, &seq[spos]);
3487
600k
                l = 0;
3488
600k
                if (!no_ref && cr->len) {
3489
6.33k
                    int end = cig_len+apos < c->ref_end
3490
6.33k
                        ? cig_len : c->ref_end - apos;
3491
6.33k
                    char *sp = &seq[spos];
3492
6.33k
                    char *rp = &ref[apos];
3493
6.33k
                    char *qp = &qual[spos];
3494
6.33k
                    if (end > cr->len) {
3495
0
                        hts_log_error("CIGAR and query sequence are of different length");
3496
0
                        return -1;
3497
0
                    }
3498
9.28k
                    for (l = 0; l < end; l++) {
3499
                        // This case is just too disputed and different tools
3500
                        // interpret these in different ways.  We give up and
3501
                        // store verbatim.
3502
2.94k
                        if (rp[l] == 'N' && sp[l] == 'N')
3503
146
                            verbatim_NM = verbatim_MD = 1;
3504
2.94k
                        if (rp[l] != sp[l]) {
3505
                            // Build our own MD tag if one is on the sequence, so
3506
                            // we can ensure it matches and thus can be discarded.
3507
992
                            if (MD && ref) {
3508
430
                                if (kputuw(apos+l - MD_last, MD) < 0) goto err;
3509
430
                                if (kputc(rp[l], MD) < 0) goto err;
3510
430
                                MD_last = apos+l+1;
3511
430
                            }
3512
992
                            NM++;
3513
992
                            if (!sp[l])
3514
0
                                break;
3515
992
                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
3516
#if 0
3517
                                // Disabled for the time being as it doesn't
3518
                                // seem to gain us much.
3519
                                int ol=l;
3520
                                while (l<end && rp[l] != sp[l])
3521
                                    l++;
3522
                                if (l-ol > 1) {
3523
                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
3524
                                                       l-ol, &seq[spos+ol]))
3525
                                        return -1;
3526
                                    l--;
3527
                                } else {
3528
                                    l = ol;
3529
                                    if (cram_add_substitution(fd, c, s, cr,
3530
                                                              spos+l, sp[l],
3531
                                                              qp[l], rp[l]))
3532
                                        return -1;
3533
                                }
3534
#else
3535
                                // With urmap pushed to the limit and lots
3536
                                // of unaligned data (should be soft-clipped)
3537
                                // this saves ~2-7%. Worth it?
3538
0
                                int nl = l;
3539
0
                                int max_end = nl, max_score = 0, score = 0;
3540
0
                                while (nl < end) {
3541
0
                                    if (rp[nl] != sp[nl]) {
3542
0
                                        score += 3;
3543
0
                                        if (max_score < score) {
3544
0
                                            max_score = score;
3545
0
                                            max_end = nl;
3546
0
                                        }
3547
0
                                    } else {
3548
0
                                        score--;
3549
0
                                        if (score < -2 ||
3550
0
                                            max_score - score > 7)
3551
0
                                            break;
3552
0
                                    }
3553
0
                                    nl++;
3554
0
                                }
3555
0
                                if (max_score > 20) {
3556
0
                                    cram_add_bases(fd, c, s, cr, spos+l,
3557
0
                                                   max_end-l, &seq[spos+l]);
3558
0
                                    l = max_end-1;
3559
0
                                } else {
3560
0
                                    while (l < nl) {
3561
0
                                        if (rp[l] != sp[l])
3562
0
                                            cram_add_substitution(fd, c, s,
3563
0
                                                                  cr, spos+l,
3564
0
                                                                  sp[l], qp[l],
3565
0
                                                                  rp[l]);
3566
0
                                        l++;
3567
0
                                    }
3568
0
                                    l--;
3569
0
                                }
3570
0
#endif
3571
992
                            } else {
3572
992
                                if (cram_add_substitution(fd, c, s, cr, spos+l,
3573
992
                                                          sp[l], qp[l], rp[l]))
3574
0
                                    return -1;
3575
992
                            }
3576
992
                        }
3577
2.94k
                    }
3578
6.33k
                    spos += l;
3579
6.33k
                    apos += l;
3580
6.33k
                }
3581
3582
600k
                if (l < cig_len && cr->len) {
3583
1.44k
                    if (no_ref) {
3584
229
                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
3585
229
                            if (cram_add_bases(fd, c, s, cr, spos,
3586
229
                                               cig_len-l, &seq[spos]))
3587
0
                                return -1;
3588
229
                            spos += cig_len-l;
3589
229
                        } else {
3590
0
                            for (; l < cig_len && seq[spos]; l++, spos++) {
3591
0
                                if (cram_add_base(fd, c, s, cr, spos,
3592
0
                                                  seq[spos], qual[spos]))
3593
0
                                    return -1;
3594
0
                            }
3595
0
                        }
3596
1.21k
                    } else {
3597
                        /* off end of sequence or non-ref based output */
3598
1.21k
                        verbatim_NM = verbatim_MD = 1;
3599
2.44k
                        for (; l < cig_len && seq[spos]; l++, spos++) {
3600
1.22k
                            if (cram_add_base(fd, c, s, cr, spos,
3601
1.22k
                                              seq[spos], qual[spos]))
3602
0
                                return -1;
3603
1.22k
                        }
3604
1.21k
                    }
3605
1.44k
                    apos += cig_len;
3606
599k
                } else if (!cr->len) {
3607
                    /* Seq "*" */
3608
594k
                    verbatim_NM = verbatim_MD = 1;
3609
594k
                    apos += cig_len;
3610
594k
                    spos += cig_len;
3611
594k
                }
3612
600k
                break;
3613
3614
600k
            case BAM_CDEL:
3615
61.9k
                if (MD && ref) {
3616
36.3k
                    if (kputuw(apos - MD_last, MD) < 0) goto err;
3617
36.3k
                    if (apos < c->ref_end) {
3618
34.7k
                        if (kputc_('^', MD) < 0) goto err;
3619
34.7k
                        if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
3620
0
                            goto err;
3621
34.7k
                    }
3622
36.3k
                }
3623
61.9k
                NM += cig_len;
3624
3625
61.9k
                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
3626
0
                    return -1;
3627
61.9k
                apos += cig_len;
3628
61.9k
                MD_last = apos;
3629
61.9k
                break;
3630
3631
849
            case BAM_CREF_SKIP:
3632
849
                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
3633
0
                    return -1;
3634
849
                apos += cig_len;
3635
849
                MD_last += cig_len;
3636
849
                break;
3637
3638
23.5k
            case BAM_CINS:
3639
23.5k
                if (cram_add_insertion(c, s, cr, spos, cig_len,
3640
23.5k
                                       cr->len ? &seq[spos] : NULL))
3641
0
                    return -1;
3642
23.5k
                if (no_ref && cr->len) {
3643
99
                    for (l = 0; l < cig_len; l++, spos++) {
3644
29
                        cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3645
29
                    }
3646
23.5k
                } else {
3647
23.5k
                    spos += cig_len;
3648
23.5k
                }
3649
23.5k
                NM += cig_len;
3650
23.5k
                break;
3651
3652
78.7k
            case BAM_CSOFT_CLIP:
3653
78.7k
                if (cram_add_softclip(c, s, cr, spos, cig_len,
3654
78.7k
                                      cr->len ? &seq[spos] : NULL,
3655
78.7k
                                      fd->version))
3656
0
                    return -1;
3657
3658
78.7k
                if (no_ref &&
3659
78.7k
                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
3660
0
                    if (cr->len) {
3661
0
                        for (l = 0; l < cig_len; l++, spos++) {
3662
0
                            cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3663
0
                        }
3664
0
                    } else {
3665
0
                        for (l = 0; l < cig_len; l++, spos++) {
3666
0
                            cram_add_quality(fd, c, s, cr, spos, -1);
3667
0
                        }
3668
0
                    }
3669
78.7k
                } else {
3670
78.7k
                    spos += cig_len;
3671
78.7k
                }
3672
78.7k
                break;
3673
3674
17.9k
            case BAM_CHARD_CLIP:
3675
17.9k
                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3676
0
                    return -1;
3677
17.9k
                break;
3678
3679
17.9k
            case BAM_CPAD:
3680
187
                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3681
0
                    return -1;
3682
187
                break;
3683
3684
187
            default:
3685
92
                hts_log_error("Unknown CIGAR op code %d", cig_op);
3686
92
                return -1;
3687
784k
            }
3688
784k
        }
3689
84.9k
        if (cr->len && spos != cr->len) {
3690
0
            hts_log_error("CIGAR and query sequence are of different length");
3691
0
            return -1;
3692
0
        }
3693
84.9k
        fake_qual = spos;
3694
        // Protect against negative length refs (fuzz 382922241)
3695
84.9k
        cr->aend = no_ref ? apos : MIN(apos, MAX(0, c->ref_end));
3696
84.9k
        if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3697
0
            goto block_err;
3698
3699
84.9k
        if (MD && ref)
3700
38.7k
            if (kputuw(apos - MD_last, MD) < 0) goto err;
3701
3.34M
    } else {
3702
        // Unmapped
3703
3.34M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3704
3.34M
        cr->cigar  = 0;
3705
3.34M
        cr->ncigar = 0;
3706
3.34M
        cr->nfeature = 0;
3707
3.34M
        cr->aend = MIN(cr->apos, c->ref_end);
3708
440M
        for (i = 0; i < cr->len; i++)
3709
437M
            if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3710
0
                goto block_err;
3711
3.34M
        fake_qual = 0;
3712
3.34M
    }
3713
3714
3.43M
    cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3715
3.43M
    int err = 0;
3716
3.43M
    sam_hrec_rg_t *brg =
3717
3.43M
        cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
3718
3.43M
                        cf_tag, no_ref, &err);
3719
3.43M
    if (err)
3720
74
        goto block_err;
3721
3722
    /* Read group, identified earlier */
3723
3.43M
    if (brg) {
3724
117
        cr->rg = brg->id;
3725
3.43M
    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3726
0
        sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3727
0
        if (!brg) goto block_err;
3728
0
        cr->rg = brg->id;
3729
3.43M
    } else {
3730
3.43M
        cr->rg = -1;
3731
3.43M
    }
3732
3.43M
    if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3733
0
        goto block_err;
3734
3735
    /*
3736
     * Append to the qual block now. We do this here as
3737
     * cram_add_substitution() can generate BA/QS events which need to
3738
     * be in the qual block before we append the rest of the data.
3739
     */
3740
3.43M
    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3741
        /* Special case of seq "*" */
3742
3.43M
        if (cr->len == 0) {
3743
3.12M
            cr->len = fake_qual;
3744
3.12M
            BLOCK_GROW(s->qual_blk, cr->len);
3745
3.12M
            cp = (char *)BLOCK_END(s->qual_blk);
3746
3.12M
            memset(cp, 255, cr->len);
3747
3.12M
        } else {
3748
307k
            BLOCK_GROW(s->qual_blk, cr->len);
3749
307k
            cp = (char *)BLOCK_END(s->qual_blk);
3750
307k
            char *from = (char *)&bam_qual(b)[0];
3751
307k
            char *to = &cp[0];
3752
307k
            memcpy(to, from, cr->len);
3753
3754
            // Store quality in original orientation for better compression.
3755
307k
            if (!c->qs_seq_orient) {
3756
0
                if (cr->flags & BAM_FREVERSE) {
3757
0
                    int i, j;
3758
0
                    for (i = 0, j = cr->len-1; i < j; i++, j--) {
3759
0
                        unsigned char c;
3760
0
                        c = to[i];
3761
0
                        to[i] = to[j];
3762
0
                        to[j] = c;
3763
0
                    }
3764
0
                }
3765
0
            }
3766
307k
        }
3767
3.43M
        BLOCK_SIZE(s->qual_blk) += cr->len;
3768
3.43M
    } else {
3769
0
        if (cr->len == 0)
3770
0
            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3771
0
    }
3772
3773
3.43M
    if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3774
0
        goto block_err;
3775
3776
    /* Now we know apos and aend both, update mate-pair information */
3777
3.43M
    {
3778
3.43M
        int new;
3779
3.43M
        khint_t k;
3780
3.43M
        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3781
3782
        //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3783
        //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3784
3.43M
        if (cr->flags & BAM_FPAIRED) {
3785
98.5k
            char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3786
98.5k
            if (!key)
3787
0
                return -1;
3788
3789
98.5k
            k = kh_put(m_s2i, s->pair[sec], key, &new);
3790
98.5k
            if (-1 == new)
3791
0
                return -1;
3792
98.5k
            else if (new > 0)
3793
11.2k
                kh_val(s->pair[sec], k) = rnum;
3794
3.33M
        } else {
3795
3.33M
            new = 1;
3796
3.33M
            k = 0; // Prevents false-positive warning from gcc -Og
3797
3.33M
        }
3798
3799
3.43M
        if (new == 0) {
3800
87.3k
            cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
3801
87.3k
            int64_t aleft, aright;
3802
87.3k
            int sign;
3803
3804
87.3k
            aleft = MIN(cr->apos, p->apos);
3805
87.3k
            aright = MAX(cr->aend, p->aend);
3806
87.3k
            if (cr->apos < p->apos) {
3807
1.16k
                sign = 1;
3808
86.1k
            } else if (cr->apos > p->apos) {
3809
1.36k
                sign = -1;
3810
84.8k
            } else if (cr->flags & BAM_FREAD1) {
3811
56.3k
                sign = 1;
3812
56.3k
            } else {
3813
28.5k
                sign = -1;
3814
28.5k
            }
3815
3816
            // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3817
            // a small amount, if appropriate options set.
3818
87.3k
            if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3819
87.3k
                !(fd->tlen_zero && bam_mate_pos(b) == 0))
3820
30.0k
                goto detached;
3821
3822
57.2k
            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3823
57.2k
                ((p->flags & BAM_FUNMAP) != 0))
3824
264
                goto detached;
3825
3826
57.0k
            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3827
57.0k
                ((p->flags & BAM_FREVERSE) != 0))
3828
0
                goto detached;
3829
3830
3831
            // p vs this: tlen, matepos, flags
3832
57.0k
            if (p->ref_id != cr->ref_id &&
3833
57.0k
                !(fd->tlen_zero && p->ref_id == -1))
3834
85
                goto detached;
3835
3836
56.9k
            if (p->mate_pos != cr->apos &&
3837
56.9k
                !(fd->tlen_zero && p->mate_pos == 0))
3838
36
                goto detached;
3839
3840
56.8k
            if (((p->flags & BAM_FMUNMAP) != 0) !=
3841
56.8k
                ((p->mate_flags & CRAM_M_UNMAP) != 0))
3842
0
                goto detached;
3843
3844
56.8k
            if (((p->flags & BAM_FMREVERSE) != 0) !=
3845
56.8k
                ((p->mate_flags & CRAM_M_REVERSE) != 0))
3846
0
                goto detached;
3847
3848
            // Supplementary reads are just too ill defined
3849
56.8k
            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3850
56.8k
                (p->flags & BAM_FSUPPLEMENTARY))
3851
0
                goto detached;
3852
3853
            // When in lossy name mode, if a read isn't detached we
3854
            // cannot store the name.  The corollary is that when we
3855
            // must store the name, it must be detached (inefficient).
3856
56.8k
            if (fd->lossy_read_names &&
3857
56.8k
                (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3858
0
                 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3859
0
                goto detached;
3860
3861
            // Now check TLEN.  We do this last as sometimes it's the
3862
            // only thing that differs.  In CRAM4 we have a better way
3863
            // of handling this that doesn't break detached status
3864
56.8k
            int explicit_tlen = 0;
3865
56.8k
            int tflag1 = ((bam_ins_size(b) &&
3866
56.8k
                           llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3867
22
                           > fd->tlen_approx)
3868
56.8k
                          || (!bam_ins_size(b) && !fd->tlen_zero));
3869
3870
56.8k
            int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3871
22
                           > fd->tlen_approx)
3872
56.8k
                          || (!p->tlen && !fd->tlen_zero));
3873
3874
56.8k
            if (tflag1 || tflag2) {
3875
56.8k
                if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3876
0
                    explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3877
56.8k
                } else {
3878
                    // Stil do detached for unmapped data in CRAM4 as this
3879
                    // also impacts RNEXT calculation.
3880
56.8k
                    goto detached;
3881
56.8k
                }
3882
56.8k
            }
3883
3884
            /*
3885
             * The fields below are unused when encoding this read as it is
3886
             * no longer detached.  In theory they may get referred to when
3887
             * processing a 3rd or 4th read in this template?, so we set them
3888
             * here just to be sure.
3889
             *
3890
             * They do not need cram_stats_add() calls those as they are
3891
             * not emitted.
3892
             */
3893
0
            cr->mate_pos = p->apos;
3894
0
            cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3895
0
            cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3896
0
            cram_stats_add(c->stats[DS_TS], cr->tlen);
3897
0
            cr->mate_flags =
3898
0
                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
3899
0
                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3900
3901
            // Decrement statistics aggregated earlier
3902
0
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3903
0
                cram_stats_del(c->stats[DS_NP], p->mate_pos);
3904
0
                cram_stats_del(c->stats[DS_MF], p->mate_flags);
3905
0
                if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3906
0
                    cram_stats_del(c->stats[DS_TS], p->tlen);
3907
0
                cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3908
0
            }
3909
3910
            /* Similarly we could correct the p-> values too, but these will no
3911
             * longer have any code that refers back to them as the new 'p'
3912
             * for this template is our current 'cr'.
3913
             */
3914
            //p->mate_pos = cr->apos;
3915
            //p->mate_flags =
3916
            //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
3917
            //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3918
            //p->tlen = p->apos - cr->aend;
3919
3920
            // Clear detached from cr flags
3921
0
            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3922
0
            cr->cram_flags |= explicit_tlen;
3923
0
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3924
0
                goto block_err;
3925
3926
            // Clear detached from p flags and set downstream
3927
0
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3928
0
                cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3929
0
                p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3930
0
            }
3931
3932
0
            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3933
0
            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3934
0
            if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3935
0
                goto block_err;
3936
3937
0
            p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
3938
0
            if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3939
0
                goto block_err;
3940
3941
0
            kh_val(s->pair[sec], k) = rnum;
3942
3.34M
        } else {
3943
3.43M
        detached:
3944
            //fprintf(stderr, "unpaired\n");
3945
3946
            /* Derive mate flags from this flag */
3947
3.43M
            cr->mate_flags = 0;
3948
3.43M
            if (bam_flag(b) & BAM_FMUNMAP)
3949
59.2k
                cr->mate_flags |= CRAM_M_UNMAP;
3950
3.43M
            if (bam_flag(b) & BAM_FMREVERSE)
3951
36
                cr->mate_flags |= CRAM_M_REVERSE;
3952
3953
3.43M
            if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3954
0
                goto block_err;
3955
3956
3.43M
            cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3957
3.43M
            if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3958
0
                goto block_err;
3959
3960
3.43M
            cr->tlen        = bam_ins_size(b);
3961
3.43M
            if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3962
0
                goto block_err;
3963
3964
3.43M
            cr->cram_flags |= CRAM_FLAG_DETACHED;
3965
3.43M
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3966
0
                goto block_err;
3967
3.43M
            if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3968
0
                goto block_err;
3969
3970
3.43M
            cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3971
3.43M
        }
3972
3.43M
    }
3973
3974
3.43M
    cr->mqual       = bam_map_qual(b);
3975
3.43M
    if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3976
0
        goto block_err;
3977
3978
3.43M
    cr->mate_ref_id = bam_mate_ref(b);
3979
3980
3.43M
    if (!(bam_flag(b) & BAM_FUNMAP)) {
3981
84.9k
        if (c->first_base > cr->apos)
3982
128
            c->first_base = cr->apos;
3983
3984
84.9k
        if (c->last_base < cr->aend)
3985
1.86k
            c->last_base = cr->aend;
3986
84.9k
    }
3987
3988
3.43M
    return 0;
3989
3990
74
 block_err:
3991
76
 err:
3992
76
    return -1;
3993
74
}
3994
3995
/*
3996
 * Write iterator: put BAM format sequences into a CRAM file.
3997
 * We buffer up a containers worth of data at a time.
3998
 *
3999
 * Returns 0 on success
4000
 *        -1 on failure
4001
 */
4002
3.43M
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
4003
3.43M
    cram_container *c;
4004
4005
3.43M
    if (!fd->ctr) {
4006
2.24k
        fd->ctr = cram_new_container(fd->seqs_per_slice,
4007
2.24k
                                     fd->slices_per_container);
4008
2.24k
        if (!fd->ctr)
4009
0
            return -1;
4010
2.24k
        fd->ctr->record_counter = fd->record_counter;
4011
4012
2.24k
        pthread_mutex_lock(&fd->ref_lock);
4013
2.24k
        fd->ctr->no_ref = fd->no_ref;
4014
2.24k
        fd->ctr->embed_ref = fd->embed_ref;
4015
2.24k
        pthread_mutex_unlock(&fd->ref_lock);
4016
2.24k
    }
4017
3.43M
    c = fd->ctr;
4018
4019
3.43M
    int embed_ref = c->embed_ref;
4020
4021
3.43M
    if (!c->slice || c->curr_rec == c->max_rec ||
4022
3.43M
        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
4023
3.43M
        (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
4024
68.3k
        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
4025
68.3k
        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
4026
4027
        /*
4028
         * Start packing slices when we routinely have under 1/4tr full.
4029
         *
4030
         * This option isn't available if we choose to embed references
4031
         * since we can only have one per slice.
4032
         *
4033
         * The multi_seq var here refers to our intention for the next slice.
4034
         * This slice has already been encoded so we output as-is.
4035
         */
4036
68.3k
        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
4037
68.3k
            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
4038
68.3k
            embed_ref<=0) {
4039
743
            if (!c->multi_seq)
4040
540
                hts_log_info("Multi-ref enabled for next container");
4041
743
            multi_seq = 1;
4042
67.6k
        } else if (fd->multi_seq == 1) {
4043
4.48k
            pthread_mutex_lock(&fd->metrics_lock);
4044
4.48k
            if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
4045
503
                multi_seq = 0;
4046
503
                hts_log_info("Multi-ref disabled for next container");
4047
503
            }
4048
4.48k
            pthread_mutex_unlock(&fd->metrics_lock);
4049
4.48k
        }
4050
4051
68.3k
        slice_rec = c->slice_rec;
4052
68.3k
        curr_rec  = c->curr_rec;
4053
4054
68.3k
        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
4055
68.3k
            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
4056
68.3k
            c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
4057
64.2k
            if (NULL == (c = cram_next_container(fd, b))) {
4058
21
                if (fd->ctr) {
4059
                    // prevent cram_close attempting to flush
4060
21
                    fd->ctr_mt = fd->ctr; // delay free when threading
4061
21
                    fd->ctr = NULL;
4062
21
                }
4063
21
                return -1;
4064
21
            }
4065
64.2k
        }
4066
4067
        /*
4068
         * Due to our processing order, some things we've already done we
4069
         * cannot easily undo. So when we first notice we should be packing
4070
         * multiple sequences per container we emit the small partial
4071
         * container as-is and then start a fresh one in a different mode.
4072
         */
4073
68.3k
        if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
4074
            // User selected auto-mode, we're currently using multi-seq, but
4075
            // have detected we don't need to.  Switch back to auto.
4076
503
            fd->multi_seq = -1;
4077
67.8k
        } else if (multi_seq) {
4078
            // We detected we need multi-seq
4079
4.72k
            fd->multi_seq = 1;
4080
4.72k
            c->multi_seq = 1;
4081
4.72k
            c->pos_sorted = 0;
4082
4083
            // Cram_next_container may end up flushing an existing one and
4084
            // triggering fd->embed_ref=2 if no reference is found.
4085
            // Embedded refs are incompatible with multi-seq, so we bail
4086
            // out and switch to no_ref in this scenario.  We do this
4087
            // within the container only, as multi_seq may be temporary
4088
            // and we switch back away from it again.
4089
4.72k
            pthread_mutex_lock(&fd->ref_lock);
4090
4.72k
            if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
4091
94
                hts_log_warning("Changing from embed_ref to no_ref mode");
4092
                // Should we update fd->embed_ref and no_ref here too?
4093
                // Doing so means if we go into multi-seq and back out
4094
                // again, eg due a cluster of tiny refs in the middle of
4095
                // much larger ones, then we bake in no-ref mode.
4096
                //
4097
                // However for unsorted data we're realistically not
4098
                // going to switch back.
4099
94
                c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
4100
94
                c->no_ref = fd->no_ref = 1;
4101
94
            }
4102
4.72k
            pthread_mutex_unlock(&fd->ref_lock);
4103
4104
4.72k
            if (!c->refs_used) {
4105
849
                pthread_mutex_lock(&fd->ref_lock);
4106
849
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4107
849
                pthread_mutex_unlock(&fd->ref_lock);
4108
849
                if (!c->refs_used)
4109
0
                    return -1;
4110
849
            }
4111
4.72k
        }
4112
4113
68.3k
        fd->last_slice = curr_rec - slice_rec;
4114
68.3k
        c->slice_rec = c->curr_rec;
4115
4116
        // Have we seen this reference before?
4117
68.3k
        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
4118
68.3k
            embed_ref<=0 && !fd->unsorted && multi_seq) {
4119
4120
22
            if (!c->refs_used) {
4121
0
                pthread_mutex_lock(&fd->ref_lock);
4122
0
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4123
0
                pthread_mutex_unlock(&fd->ref_lock);
4124
0
                if (!c->refs_used)
4125
0
                    return -1;
4126
22
            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
4127
7
                pthread_mutex_lock(&fd->ref_lock);
4128
7
                fd->unsorted = 1;
4129
7
                fd->multi_seq = 1;
4130
7
                pthread_mutex_unlock(&fd->ref_lock);
4131
7
            }
4132
22
        }
4133
4134
68.3k
        c->curr_ref = bam_ref(b);
4135
68.3k
        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
4136
68.3k
    }
4137
4138
3.43M
    if (!c->bams) {
4139
        /* First time through, allocate a set of bam pointers */
4140
64.2k
        pthread_mutex_lock(&fd->bam_list_lock);
4141
64.2k
        if (fd->bl) {
4142
61.9k
            spare_bams *spare = fd->bl;
4143
61.9k
            c->bams = spare->bams;
4144
61.9k
            fd->bl = spare->next;
4145
61.9k
            free(spare);
4146
61.9k
        } else {
4147
2.24k
            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
4148
2.24k
            if (!c->bams) {
4149
0
                pthread_mutex_unlock(&fd->bam_list_lock);
4150
0
                return -1;
4151
0
            }
4152
2.24k
        }
4153
64.2k
        pthread_mutex_unlock(&fd->bam_list_lock);
4154
64.2k
    }
4155
4156
    /* Copy or alloc+copy the bam record, for later encoding */
4157
3.43M
    if (c->bams[c->curr_c_rec]) {
4158
2.40M
        if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
4159
0
            return -1;
4160
2.40M
    } else {
4161
1.03M
        c->bams[c->curr_c_rec] = bam_dup1(b);
4162
1.03M
        if (c->bams[c->curr_c_rec] == NULL)
4163
0
            return -1;
4164
1.03M
    }
4165
3.43M
    if (bam_seq_len(b)) {
4166
307k
        c->s_num_bases += bam_seq_len(b);
4167
3.12M
    } else {
4168
        // No sequence in BAM record.  CRAM doesn't directly support this
4169
        // case, it ends up being stored as a string of N's for each query
4170
        // consuming CIGAR operation.  As this can become very inefficient
4171
        // in time and memory, data where the query length is excessively
4172
        // long are rejected.
4173
3.12M
        hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
4174
3.12M
        if (qlen > 100000000) {
4175
27
            hts_log_error("CIGAR query length %"PRIhts_pos
4176
27
                          " for read \"%s\" is too long",
4177
27
                          qlen, bam_get_qname(b));
4178
27
            return -1;
4179
27
        }
4180
3.12M
        c->s_num_bases += qlen;
4181
3.12M
    }
4182
3.43M
    c->curr_rec++;
4183
3.43M
    c->curr_c_rec++;
4184
3.43M
    c->s_aux_bytes += bam_get_l_aux(b);
4185
3.43M
    c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
4186
3.43M
    fd->record_counter++;
4187
4188
3.43M
    return 0;
4189
3.43M
}