Coverage Report

Created: 2025-08-10 06:30

/src/htslib/cram/cram_encode.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <strings.h>
40
#include <zlib.h>
41
#include <sys/types.h>
42
#include <sys/stat.h>
43
#include <math.h>
44
#include <inttypes.h>
45
#include <ctype.h>
46
47
#include "cram.h"
48
#include "os.h"
49
#include "../sam_internal.h" // for nibble2base
50
#include "../htslib/hts.h"
51
#include "../htslib/hts_endian.h"
52
#include "../textutils_internal.h"
53
54
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
55
56
#define Z_CRAM_STRAT Z_FILTERED
57
//#define Z_CRAM_STRAT Z_RLE
58
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
59
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
60
61
static int process_one_read(cram_fd *fd, cram_container *c,
62
                            cram_slice *s, cram_record *cr,
63
                            bam_seq_t *b, int rnum, kstring_t *MD,
64
                            int embed_ref, int no_ref);
65
66
/*
67
 * Returns index of val into key.
68
 * Basically strchr(key, val)-key;
69
 */
70
1.25M
static int sub_idx(char *key, char val) {
71
1.25M
    int i;
72
73
3.14M
    for (i = 0; i < 4 && *key++ != val; i++);
74
1.25M
    return i;
75
1.25M
}
76
77
/*
78
 * Encodes a compression header block into a generic cram_block structure.
79
 *
80
 * Returns cram_block ptr on success
81
 *         NULL on failure
82
 */
83
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
84
                                           cram_block_compression_hdr *h,
85
74.8k
                                           int embed_ref) {
86
74.8k
    cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
87
74.8k
    cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
88
74.8k
    int i, mc, r = 0;
89
90
74.8k
    int no_ref = c->no_ref;
91
92
74.8k
    if (!cb || !map)
93
0
        return NULL;
94
95
    /*
96
     * This is a concatenation of several blocks of data:
97
     * header + landmarks, preservation map, read encoding map, and the tag
98
     * encoding map.
99
     * All 4 are variable sized and we need to know how large these are
100
     * before creating the compression header itself as this starts with
101
     * the total size (stored as a variable length string).
102
     */
103
104
    // Duplicated from container itself, and removed in 1.1
105
74.8k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
106
0
        r |= itf8_put_blk(cb, h->ref_seq_id);
107
0
        r |= itf8_put_blk(cb, h->ref_seq_start);
108
0
        r |= itf8_put_blk(cb, h->ref_seq_span);
109
0
        r |= itf8_put_blk(cb, h->num_records);
110
0
        r |= itf8_put_blk(cb, h->num_landmarks);
111
0
        for (i = 0; i < h->num_landmarks; i++) {
112
0
            r |= itf8_put_blk(cb, h->landmark[i]);
113
0
        }
114
0
    }
115
116
74.8k
    if (h->preservation_map) {
117
0
        kh_destroy(map, h->preservation_map);
118
0
        h->preservation_map = NULL;
119
0
    }
120
121
    /* Create in-memory preservation map */
122
    /* FIXME: should create this when we create the container */
123
74.8k
    if (c->num_records > 0) {
124
62.8k
        khint_t k;
125
62.8k
        int r;
126
127
62.8k
        if (!(h->preservation_map = kh_init(map)))
128
0
            return NULL;
129
130
62.8k
        k = kh_put(map, h->preservation_map, "RN", &r);
131
62.8k
        if (-1 == r) return NULL;
132
62.8k
        kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
133
134
62.8k
        if (CRAM_MAJOR_VERS(fd->version) == 1) {
135
0
            k = kh_put(map, h->preservation_map, "PI", &r);
136
0
            if (-1 == r) return NULL;
137
0
            kh_val(h->preservation_map, k).i = 0;
138
139
0
            k = kh_put(map, h->preservation_map, "UI", &r);
140
0
            if (-1 == r) return NULL;
141
0
            kh_val(h->preservation_map, k).i = 1;
142
143
0
            k = kh_put(map, h->preservation_map, "MI", &r);
144
0
            if (-1 == r) return NULL;
145
0
            kh_val(h->preservation_map, k).i = 1;
146
147
62.8k
        } else {
148
            // Technically SM was in 1.0, but wasn't in Java impl.
149
62.8k
            k = kh_put(map, h->preservation_map, "SM", &r);
150
62.8k
            if (-1 == r) return NULL;
151
62.8k
            kh_val(h->preservation_map, k).i = 0;
152
153
62.8k
            k = kh_put(map, h->preservation_map, "TD", &r);
154
62.8k
            if (-1 == r) return NULL;
155
62.8k
            kh_val(h->preservation_map, k).i = 0;
156
157
62.8k
            k = kh_put(map, h->preservation_map, "AP", &r);
158
62.8k
            if (-1 == r) return NULL;
159
62.8k
            kh_val(h->preservation_map, k).i = h->AP_delta;
160
161
62.8k
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
162
0
                k = kh_put(map, h->preservation_map, "QO", &r);
163
0
                if (-1 == r) return NULL;
164
0
                kh_val(h->preservation_map, k).i = h->qs_seq_orient;
165
0
            }
166
167
62.8k
            if (no_ref || embed_ref>0) {
168
                // Reference Required == No
169
58.9k
                k = kh_put(map, h->preservation_map, "RR", &r);
170
58.9k
                if (-1 == r) return NULL;
171
58.9k
                kh_val(h->preservation_map, k).i = 0;
172
58.9k
            }
173
62.8k
        }
174
62.8k
    }
175
176
    /* Encode preservation map; could collapse this and above into one */
177
74.8k
    mc = 0;
178
74.8k
    BLOCK_SIZE(map) = 0;
179
74.8k
    if (h->preservation_map) {
180
62.8k
        khint_t k;
181
182
62.8k
        for (k = kh_begin(h->preservation_map);
183
565k
             k != kh_end(h->preservation_map);
184
502k
             k++) {
185
502k
            const char *key;
186
502k
            khash_t(map) *pmap = h->preservation_map;
187
188
189
502k
            if (!kh_exist(pmap, k))
190
192k
                continue;
191
192
310k
            key = kh_key(pmap, k);
193
310k
            BLOCK_APPEND(map, key, 2);
194
195
310k
            switch(CRAM_KEY(key[0], key[1])) {
196
0
            case CRAM_KEY('M','I'):
197
0
            case CRAM_KEY('U','I'):
198
0
            case CRAM_KEY('P','I'):
199
62.8k
            case CRAM_KEY('A','P'):
200
125k
            case CRAM_KEY('R','N'):
201
184k
            case CRAM_KEY('R','R'):
202
184k
            case CRAM_KEY('Q','O'):
203
184k
                BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
204
184k
                break;
205
206
184k
            case CRAM_KEY('S','M'): {
207
62.8k
                char smat[5], *mp = smat;
208
                // Output format is for order ACGTN (minus ref base)
209
                // to store the code value 0-3 for each symbol.
210
                //
211
                // Note this is different to storing the symbols in order
212
                // that the codes occur from 0-3, which is what we used to
213
                // do.  (It didn't matter as we always had a fixed table in
214
                // the order.)
215
62.8k
                *mp++ =
216
62.8k
                    (sub_idx(h->substitution_matrix[0], 'C') << 6) |
217
62.8k
                    (sub_idx(h->substitution_matrix[0], 'G') << 4) |
218
62.8k
                    (sub_idx(h->substitution_matrix[0], 'T') << 2) |
219
62.8k
                    (sub_idx(h->substitution_matrix[0], 'N') << 0);
220
62.8k
                *mp++ =
221
62.8k
                    (sub_idx(h->substitution_matrix[1], 'A') << 6) |
222
62.8k
                    (sub_idx(h->substitution_matrix[1], 'G') << 4) |
223
62.8k
                    (sub_idx(h->substitution_matrix[1], 'T') << 2) |
224
62.8k
                    (sub_idx(h->substitution_matrix[1], 'N') << 0);
225
62.8k
                *mp++ =
226
62.8k
                    (sub_idx(h->substitution_matrix[2], 'A') << 6) |
227
62.8k
                    (sub_idx(h->substitution_matrix[2], 'C') << 4) |
228
62.8k
                    (sub_idx(h->substitution_matrix[2], 'T') << 2) |
229
62.8k
                    (sub_idx(h->substitution_matrix[2], 'N') << 0);
230
62.8k
                *mp++ =
231
62.8k
                    (sub_idx(h->substitution_matrix[3], 'A') << 6) |
232
62.8k
                    (sub_idx(h->substitution_matrix[3], 'C') << 4) |
233
62.8k
                    (sub_idx(h->substitution_matrix[3], 'G') << 2) |
234
62.8k
                    (sub_idx(h->substitution_matrix[3], 'N') << 0);
235
62.8k
                *mp++ =
236
62.8k
                    (sub_idx(h->substitution_matrix[4], 'A') << 6) |
237
62.8k
                    (sub_idx(h->substitution_matrix[4], 'C') << 4) |
238
62.8k
                    (sub_idx(h->substitution_matrix[4], 'G') << 2) |
239
62.8k
                    (sub_idx(h->substitution_matrix[4], 'T') << 0);
240
62.8k
                BLOCK_APPEND(map, smat, 5);
241
62.8k
                break;
242
62.8k
            }
243
244
62.8k
            case CRAM_KEY('T','D'): {
245
62.8k
                r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
246
62.8k
                BLOCK_APPEND(map,
247
62.8k
                             BLOCK_DATA(h->TD_blk),
248
62.8k
                             BLOCK_SIZE(h->TD_blk));
249
62.8k
                break;
250
62.8k
            }
251
252
62.8k
            default:
253
0
                hts_log_warning("Unknown preservation key '%.2s'", key);
254
0
                break;
255
310k
            }
256
257
310k
            mc++;
258
310k
        }
259
62.8k
    }
260
74.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
261
74.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
262
74.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
263
264
    /* rec encoding map */
265
74.8k
    mc = 0;
266
74.8k
    BLOCK_SIZE(map) = 0;
267
74.8k
    if (h->codecs[DS_BF]) {
268
62.8k
        if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
269
62.8k
                                          fd->version))
270
0
            return NULL;
271
62.8k
        mc++;
272
62.8k
    }
273
74.8k
    if (h->codecs[DS_CF]) {
274
62.8k
        if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
275
62.8k
                                          fd->version))
276
0
            return NULL;
277
62.8k
        mc++;
278
62.8k
    }
279
74.8k
    if (h->codecs[DS_RL]) {
280
62.8k
        if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
281
62.8k
                                          fd->version))
282
0
            return NULL;
283
62.8k
        mc++;
284
62.8k
    }
285
74.8k
    if (h->codecs[DS_AP]) {
286
62.8k
        if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
287
62.8k
                                          fd->version))
288
0
            return NULL;
289
62.8k
        mc++;
290
62.8k
    }
291
74.8k
    if (h->codecs[DS_RG]) {
292
62.8k
        if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
293
62.8k
                                          fd->version))
294
0
            return NULL;
295
62.8k
        mc++;
296
62.8k
    }
297
74.8k
    if (h->codecs[DS_MF]) {
298
62.8k
        if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
299
62.8k
                                          fd->version))
300
0
            return NULL;
301
62.8k
        mc++;
302
62.8k
    }
303
74.8k
    if (h->codecs[DS_NS]) {
304
62.8k
        if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
305
62.8k
                                          fd->version))
306
0
            return NULL;
307
62.8k
        mc++;
308
62.8k
    }
309
74.8k
    if (h->codecs[DS_NP]) {
310
62.8k
        if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
311
62.8k
                                          fd->version))
312
0
            return NULL;
313
62.8k
        mc++;
314
62.8k
    }
315
74.8k
    if (h->codecs[DS_TS]) {
316
62.8k
        if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
317
62.8k
                                          fd->version))
318
0
            return NULL;
319
62.8k
        mc++;
320
62.8k
    }
321
74.8k
    if (h->codecs[DS_NF]) {
322
11
        if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
323
11
                                          fd->version))
324
0
            return NULL;
325
11
        mc++;
326
11
    }
327
74.8k
    if (h->codecs[DS_TC]) {
328
0
        if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
329
0
                                          fd->version))
330
0
            return NULL;
331
0
        mc++;
332
0
    }
333
74.8k
    if (h->codecs[DS_TN]) {
334
0
        if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
335
0
                                          fd->version))
336
0
            return NULL;
337
0
        mc++;
338
0
    }
339
74.8k
    if (h->codecs[DS_TL]) {
340
62.8k
        if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
341
62.8k
                                          fd->version))
342
0
            return NULL;
343
62.8k
        mc++;
344
62.8k
    }
345
74.8k
    if (h->codecs[DS_FN]) {
346
28.1k
        if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
347
28.1k
                                          fd->version))
348
0
            return NULL;
349
28.1k
        mc++;
350
28.1k
    }
351
74.8k
    if (h->codecs[DS_FC]) {
352
28.0k
        if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
353
28.0k
                                          fd->version))
354
0
            return NULL;
355
28.0k
        mc++;
356
28.0k
    }
357
74.8k
    if (h->codecs[DS_FP]) {
358
28.0k
        if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
359
28.0k
                                          fd->version))
360
0
            return NULL;
361
28.0k
        mc++;
362
28.0k
    }
363
74.8k
    if (h->codecs[DS_BS]) {
364
710
        if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
365
710
                                          fd->version))
366
0
            return NULL;
367
710
        mc++;
368
710
    }
369
74.8k
    if (h->codecs[DS_IN]) {
370
62.8k
        if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
371
62.8k
                                          fd->version))
372
0
            return NULL;
373
62.8k
        mc++;
374
62.8k
    }
375
74.8k
    if (h->codecs[DS_DL]) {
376
5.96k
        if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
377
5.96k
                                          fd->version))
378
0
            return NULL;
379
5.96k
        mc++;
380
5.96k
    }
381
74.8k
    if (h->codecs[DS_BA]) {
382
4.17k
        if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
383
4.17k
                                          fd->version))
384
0
            return NULL;
385
4.17k
        mc++;
386
4.17k
    }
387
74.8k
    if (h->codecs[DS_BB]) {
388
62.8k
        if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
389
62.8k
                                          fd->version))
390
0
            return NULL;
391
62.8k
        mc++;
392
62.8k
    }
393
74.8k
    if (h->codecs[DS_MQ]) {
394
62.8k
        if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
395
62.8k
                                          fd->version))
396
0
            return NULL;
397
62.8k
        mc++;
398
62.8k
    }
399
74.8k
    if (h->codecs[DS_RN]) {
400
62.8k
        if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
401
62.8k
                                          fd->version))
402
0
            return NULL;
403
62.8k
        mc++;
404
62.8k
    }
405
74.8k
    if (h->codecs[DS_QS]) {
406
62.8k
        if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
407
62.8k
                                          fd->version))
408
0
            return NULL;
409
62.8k
        mc++;
410
62.8k
    }
411
74.8k
    if (h->codecs[DS_QQ]) {
412
0
        if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
413
0
                                          fd->version))
414
0
            return NULL;
415
0
        mc++;
416
0
    }
417
74.8k
    if (h->codecs[DS_RI]) {
418
62.8k
        if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
419
62.8k
                                          fd->version))
420
0
            return NULL;
421
62.8k
        mc++;
422
62.8k
    }
423
74.8k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
424
74.8k
        if (h->codecs[DS_SC]) {
425
62.8k
            if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
426
62.8k
                                              fd->version))
427
0
                return NULL;
428
62.8k
            mc++;
429
62.8k
        }
430
74.8k
        if (h->codecs[DS_RS]) {
431
315
            if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
432
315
                                              fd->version))
433
0
                return NULL;
434
315
            mc++;
435
315
        }
436
74.8k
        if (h->codecs[DS_PD]) {
437
67
            if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
438
67
                                              fd->version))
439
0
                return NULL;
440
67
            mc++;
441
67
        }
442
74.8k
        if (h->codecs[DS_HC]) {
443
3.88k
            if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
444
3.88k
                                              fd->version))
445
0
                return NULL;
446
3.88k
            mc++;
447
3.88k
        }
448
74.8k
    }
449
74.8k
    if (h->codecs[DS_TM]) {
450
0
        if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
451
0
                                          fd->version))
452
0
            return NULL;
453
0
        mc++;
454
0
    }
455
74.8k
    if (h->codecs[DS_TV]) {
456
0
        if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
457
0
                                          fd->version))
458
0
            return NULL;
459
0
        mc++;
460
0
    }
461
74.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
462
74.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
463
74.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
464
465
    /* tag encoding map */
466
74.8k
    mc = 0;
467
74.8k
    BLOCK_SIZE(map) = 0;
468
74.8k
    if (c->tags_used) {
469
62.8k
        khint_t k;
470
471
474k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
472
412k
            int key;
473
412k
            if (!kh_exist(c->tags_used, k))
474
222k
                continue;
475
476
189k
            key = kh_key(c->tags_used, k);
477
189k
            cram_codec *cd = kh_val(c->tags_used, k)->codec;
478
479
189k
            r |= (fd->vv.varint_put32_blk(map, key) <= 0);
480
189k
            if (-1 == cd->store(cd, map, NULL, fd->version))
481
0
                return NULL;
482
483
189k
            mc++;
484
189k
        }
485
62.8k
    }
486
487
74.8k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
488
74.8k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
489
74.8k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
490
491
74.8k
    hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
492
493
74.8k
    BLOCK_UPLEN(cb);
494
495
74.8k
    cram_free_block(map);
496
497
74.8k
    if (r >= 0)
498
74.8k
        return cb;
499
500
0
 block_err:
501
0
    return NULL;
502
74.8k
}
503
504
505
/*
506
 * Encodes a slice compression header.
507
 *
508
 * Returns cram_block on success
509
 *         NULL on failure
510
 */
511
62.9k
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
512
62.9k
    char *buf;
513
62.9k
    char *cp;
514
62.9k
    cram_block *b = cram_new_block(MAPPED_SLICE, 0);
515
62.9k
    int j;
516
517
62.9k
    if (!b)
518
0
        return NULL;
519
520
62.9k
    cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
521
62.9k
    if (NULL == buf) {
522
0
        cram_free_block(b);
523
0
        return NULL;
524
0
    }
525
526
62.9k
    cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
527
62.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
528
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
529
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
530
62.9k
    } else {
531
62.9k
        if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
532
121
            hts_log_error("Reference position too large for CRAM 3");
533
121
            cram_free_block(b);
534
121
            free(buf);
535
121
            return NULL;
536
121
        }
537
62.8k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
538
62.8k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
539
62.8k
    }
540
62.8k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
541
62.8k
    if (CRAM_MAJOR_VERS(fd->version) == 2)
542
0
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
543
62.8k
    else if (CRAM_MAJOR_VERS(fd->version) >= 3)
544
62.8k
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
545
62.8k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
546
62.8k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
547
396k
    for (j = 0; j < s->hdr->num_content_ids; j++) {
548
333k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
549
333k
    }
550
62.8k
    if (s->hdr->content_type == MAPPED_SLICE)
551
62.8k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
552
553
62.8k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
554
62.8k
        memcpy(cp, s->hdr->md5, 16); cp += 16;
555
62.8k
    }
556
557
62.8k
    assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
558
559
62.8k
    b->data = (unsigned char *)buf;
560
62.8k
    b->comp_size = b->uncomp_size = cp-buf;
561
562
62.8k
    return b;
563
62.8k
}
564
565
566
/*
567
 * Encodes a single read.
568
 *
569
 * Returns 0 on success
570
 *        -1 on failure
571
 */
572
static int cram_encode_slice_read(cram_fd *fd,
573
                                  cram_container *c,
574
                                  cram_block_compression_hdr *h,
575
                                  cram_slice *s,
576
                                  cram_record *cr,
577
6.94M
                                  int64_t *last_pos) {
578
6.94M
    int r = 0;
579
6.94M
    int32_t i32;
580
6.94M
    int64_t i64;
581
6.94M
    unsigned char uc;
582
583
    //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
584
585
    //printf("BF=0x%x\n", cr->flags);
586
    //      bf = cram_flag_swap[cr->flags];
587
6.94M
    i32 = fd->cram_flag_swap[cr->flags & 0xfff];
588
6.94M
    r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
589
590
6.94M
    i32 = cr->cram_flags & CRAM_FLAG_MASK;
591
6.94M
    r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
592
593
6.94M
    if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
594
12.9k
        r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
595
596
6.94M
    r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
597
598
6.94M
    if (c->pos_sorted) {
599
6.92M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
600
0
            i64 = cr->apos - *last_pos;
601
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
602
6.92M
        } else {
603
6.92M
            i32 = cr->apos - *last_pos;
604
6.92M
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
605
6.92M
        }
606
6.92M
        *last_pos = cr->apos;
607
6.92M
    } else {
608
17.7k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
609
0
            i64 = cr->apos;
610
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
611
17.7k
        } else {
612
17.7k
            i32 = cr->apos;
613
17.7k
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
614
17.7k
        }
615
17.7k
    }
616
617
6.94M
    r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
618
619
6.94M
    if (cr->cram_flags & CRAM_FLAG_DETACHED) {
620
6.94M
        i32 = cr->mate_flags;
621
6.94M
        r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
622
623
6.94M
        r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
624
6.94M
                                      (char *)&cr->mate_ref_id, 1);
625
626
6.94M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627
0
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
628
0
                                          (char *)&cr->mate_pos, 1);
629
0
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
630
0
                                          (char *)&cr->tlen, 1);
631
6.94M
        } else {
632
6.94M
            i32 = cr->mate_pos;
633
6.94M
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
634
6.94M
                                          (char *)&i32, 1);
635
6.94M
            i32 = cr->tlen;
636
6.94M
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
637
6.94M
                                          (char *)&i32, 1);
638
6.94M
        }
639
6.94M
    } else {
640
22
        if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
641
11
            r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
642
11
                                          (char *)&cr->mate_line, 1);
643
11
        }
644
22
        if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
645
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
646
0
                r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
647
0
                                              (char *)&cr->tlen, 1);
648
0
            }
649
0
        }
650
22
    }
651
652
    /* Aux tags */
653
6.94M
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
654
0
        int j;
655
0
        uc = cr->ntags;
656
0
        r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
657
658
0
        for (j = 0; j < cr->ntags; j++) {
659
0
            uint32_t i32 = s->TN[cr->TN_idx + j]; // id
660
0
            r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
661
0
        }
662
6.94M
    } else {
663
6.94M
        r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
664
6.94M
    }
665
666
    // qual
667
    // QS codec : Already stored in block[2].
668
669
    // features (diffs)
670
6.94M
    if (!(cr->flags & BAM_FUNMAP)) {
671
82.4k
        int prev_pos = 0, j;
672
673
82.4k
        r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
674
82.4k
                                      (char *)&cr->nfeature, 1);
675
259k
        for (j = 0; j < cr->nfeature; j++) {
676
176k
            cram_feature *f = &s->features[cr->feature + j];
677
678
176k
            uc = f->X.code;
679
176k
            r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
680
176k
            i32 = f->X.pos - prev_pos;
681
176k
            r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
682
176k
            prev_pos = f->X.pos;
683
684
176k
            switch(f->X.code) {
685
                //char *seq;
686
687
2.21k
            case 'X':
688
                //fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
689
690
2.21k
                uc = f->X.base;
691
2.21k
                r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
692
2.21k
                                              (char *)&uc, 1);
693
2.21k
                break;
694
73.3k
            case 'S':
695
                // Already done
696
                //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
697
                //                              BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
698
                //                              f->S.len);
699
700
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
701
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
702
                //                                  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
703
                //                                  f->S.len);
704
                //}
705
73.3k
                break;
706
624
            case 'I':
707
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
708
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
709
                //                           seq, f->S.len);
710
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
711
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
712
                //                                  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
713
                //                                  f->I.len);
714
                //}
715
624
                break;
716
9.95k
            case 'i':
717
9.95k
                uc = f->i.base;
718
9.95k
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
719
9.95k
                                              (char *)&uc, 1);
720
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
721
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
722
                //                           seq, 1);
723
9.95k
                break;
724
59.0k
            case 'D':
725
59.0k
                i32 = f->D.len;
726
59.0k
                r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
727
59.0k
                                              (char *)&i32, 1);
728
59.0k
                break;
729
730
2.45k
            case 'B':
731
                //                  // Used when we try to store a non ACGTN base or an N
732
                //                  // that aligns against a non ACGTN reference
733
734
2.45k
                uc  = f->B.base;
735
2.45k
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
736
2.45k
                                              (char *)&uc, 1);
737
738
                //                  Already added
739
                //                  uc  = f->B.qual;
740
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
741
                //                                           (char *)&uc, 1);
742
2.45k
                break;
743
744
581
            case 'b':
745
                // string of bases
746
581
                r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
747
581
                                              (char *)BLOCK_DATA(s->seqs_blk)
748
581
                                                      + f->b.seq_idx,
749
581
                                              f->b.len);
750
581
                break;
751
752
97
            case 'Q':
753
                //                  Already added
754
                //                  uc  = f->B.qual;
755
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
756
                //                                           (char *)&uc, 1);
757
97
                break;
758
759
577
            case 'N':
760
577
                i32 = f->N.len;
761
577
                r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
762
577
                                              (char *)&i32, 1);
763
577
                break;
764
765
271
            case 'P':
766
271
                i32 = f->P.len;
767
271
                r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
768
271
                                              (char *)&i32, 1);
769
271
                break;
770
771
27.8k
            case 'H':
772
27.8k
                i32 = f->H.len;
773
27.8k
                r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
774
27.8k
                                              (char *)&i32, 1);
775
27.8k
                break;
776
777
778
0
            default:
779
0
                hts_log_error("Unhandled feature code %c", f->X.code);
780
0
                return -1;
781
176k
            }
782
176k
        }
783
784
82.4k
        r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
785
82.4k
                                      (char *)&cr->mqual, 1);
786
6.85M
    } else {
787
6.85M
        char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
788
6.85M
        if (cr->len)
789
297k
            r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
790
6.85M
    }
791
792
6.94M
    return r ? -1 : 0;
793
6.94M
}
794
795
796
/*
797
 * Applies various compression methods to specific blocks, depending on
798
 * known observations of how data series compress.
799
 *
800
 * Returns 0 on success
801
 *        -1 on failure
802
 */
803
62.9k
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
804
62.9k
    int level = fd->level, i;
805
62.9k
    int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
806
62.9k
    int v31_or_above = (fd->version >= (3<<8)+1);
807
808
    /* Compress the CORE Block too, with minimal zlib level */
809
62.9k
    if (level > 5 && s->block[0]->uncomp_size > 500)
810
0
        cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
811
812
62.9k
    if (fd->use_bz2)
813
0
        method |= 1<<BZIP2;
814
815
62.9k
    int method_rans   = (1<<RANS0) | (1<<RANS1);
816
62.9k
    int method_ranspr = method_rans;
817
818
62.9k
    if (fd->use_rans) {
819
62.9k
        method_ranspr = (1<<RANS_PR0)   | (1<<RANS_PR1);
820
62.9k
        if (level > 1)
821
62.9k
            method_ranspr |=
822
62.9k
                  (1<<RANS_PR64)  | (1<<RANS_PR9)
823
62.9k
                | (1<<RANS_PR128) | (1<<RANS_PR193);
824
62.9k
        if (level > 5)
825
0
            method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
826
62.9k
    }
827
828
62.9k
    if (fd->use_rans) {
829
62.9k
        methodF |= v31_or_above ? method_ranspr : method_rans;
830
62.9k
        method  |= v31_or_above ? method_ranspr : method_rans;
831
62.9k
    }
832
833
62.9k
    int method_arith   = 0;
834
62.9k
    if (fd->use_arith) {
835
0
        method_arith = (1<<ARITH_PR0)   | (1<<ARITH_PR1);
836
0
        if (level > 1)
837
0
            method_arith |=
838
0
                  (1<<ARITH_PR64)  | (1<<ARITH_PR9)
839
0
                | (1<<ARITH_PR128) | (1<<ARITH_PR129)
840
0
                | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
841
0
    }
842
62.9k
    if (fd->use_arith && v31_or_above) {
843
0
        methodF |= method_arith;
844
0
        method  |= method_arith;
845
0
    }
846
847
62.9k
    if (fd->use_lzma)
848
0
        method |= (1<<LZMA);
849
850
    /* Faster method for data series we only need entropy encoding on */
851
62.9k
    methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
852
62.9k
    if (level >= 5) {
853
62.9k
        method |= 1<<GZIP_1;
854
62.9k
        methodF = method;
855
62.9k
    }
856
62.9k
    if (level == 1) {
857
0
        method &= ~(1<<GZIP);
858
0
        method |=   1<<GZIP_1;
859
0
        methodF = method;
860
0
    }
861
862
62.9k
    int qmethod  = method;
863
62.9k
    int qmethodF = method;
864
62.9k
    if (v31_or_above && fd->use_fqz) {
865
0
        qmethod  |= 1<<FQZ;
866
0
        qmethodF |= 1<<FQZ;
867
0
        if (fd->level > 4) {
868
0
            qmethod  |= 1<<FQZ_b;
869
0
            qmethodF |= 1<<FQZ_b;
870
0
        }
871
0
        if (fd->level > 6) {
872
0
            qmethod  |= (1<<FQZ_c) | (1<<FQZ_d);
873
0
            qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
874
0
        }
875
0
    }
876
877
62.9k
    pthread_mutex_lock(&fd->metrics_lock);
878
3.02M
    for (i = 0; i < DS_END; i++)
879
2.96M
        if (c->stats[i] && c->stats[i]->nvals > 16)
880
233
            fd->m[i]->unpackable = 1;
881
62.9k
    pthread_mutex_unlock(&fd->metrics_lock);
882
883
    /* Specific compression methods for certain block types */
884
62.9k
    if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
885
62.9k
                             method, level))
886
0
        return -1;
887
888
62.9k
    if (fd->level == 0) {
889
        /* Do nothing */
890
62.9k
    } else if (fd->level == 1) {
891
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
892
0
                                 qmethodF, 1))
893
0
            return -1;
894
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
895
0
            if (s->block[i])
896
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
897
0
                                         method, 1))
898
0
                    return -1;
899
0
        }
900
62.9k
    } else if (fd->level < 3) {
901
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
902
0
                                 qmethod, 1))
903
0
            return -1;
904
0
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
905
0
                                 method, 1))
906
0
            return -1;
907
0
        if (s->block[DS_BB])
908
0
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
909
0
                                     method, 1))
910
0
                return -1;
911
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
912
0
            if (s->block[i])
913
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
914
0
                                         method, level))
915
0
                    return -1;
916
0
        }
917
62.9k
    } else {
918
62.9k
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
919
62.9k
                                 qmethod, level))
920
0
            return -1;
921
62.9k
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
922
62.9k
                                 method, level))
923
0
            return -1;
924
62.9k
        if (s->block[DS_BB])
925
62.9k
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
926
62.9k
                                     method, level))
927
0
                return -1;
928
629k
        for (i = DS_aux; i <= DS_aux_oz; i++) {
929
566k
            if (s->block[i])
930
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
931
0
                                         method, level))
932
0
                    return -1;
933
566k
        }
934
62.9k
    }
935
936
    // NAME: best is generally xz, bzip2, zlib then rans1
937
62.9k
    int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
938
62.9k
    if (fd->version >= (3<<8)+1 && fd->use_tok)
939
62.9k
        method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
940
62.9k
    if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
941
62.9k
                             method_rn, level))
942
0
        return -1;
943
944
    // NS shows strong local correlation as rearrangements are localised
945
62.9k
    if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
946
1.16k
        if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
947
1.16k
                                 method, level))
948
0
            return -1;
949
950
951
    /*
952
     * Compress any auxiliary tags with their own per-tag metrics
953
     */
954
62.9k
    {
955
62.9k
        int i;
956
252k
        for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
957
189k
            if (!s->block[i] || s->block[i] == s->block[0])
958
0
                continue;
959
960
189k
            if (s->block[i]->method != RAW)
961
0
                continue;
962
963
189k
            if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
964
189k
                                     method, level))
965
0
                return -1;
966
189k
        }
967
62.9k
    }
968
969
    /*
970
     * Minimal compression of any block still uncompressed, bar CORE
971
     */
972
62.9k
    {
973
62.9k
        int i;
974
2.96M
        for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
975
2.89M
            if (!s->block[i] || s->block[i] == s->block[0])
976
2.44M
                continue;
977
978
451k
            if (s->block[i]->method != RAW)
979
13.9k
                continue;
980
981
437k
            if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
982
437k
                                    methodF, level))
983
0
                return -1;
984
437k
        }
985
62.9k
    }
986
987
62.9k
    return 0;
988
62.9k
}
989
990
/*
991
 * Allocates a block associated with the cram codec associated with
992
 * data series ds_id or the internal codec_id (depending on codec
993
 * type).
994
 *
995
 * The ds_ids are what end up written to disk as an external block.
996
 * The c_ids are internal and used when daisy-chaining transforms
997
 * such as MAP and RLE.  These blocks are also allocated, but
998
 * are ephemeral in nature.  (The codecs themselves cannot allocate
999
 * these as the same codec pointer may be operating on multiple slices
1000
 * if we're using a multi-slice container.)
1001
 *
1002
 * Returns 0 on success
1003
 *        -1 on failure
1004
 */
1005
1.82M
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
1006
1.82M
    if (!codec)
1007
593k
        return 0;
1008
1009
1.23M
    switch(codec->codec) {
1010
    // Codecs which are hard-coded to use the CORE block
1011
0
    case E_GOLOMB:
1012
809k
    case E_HUFFMAN:
1013
810k
    case E_BETA:
1014
810k
    case E_SUBEXP:
1015
810k
    case E_GOLOMB_RICE:
1016
810k
    case E_GAMMA:
1017
810k
        codec->out = s->block[0];
1018
810k
        break;
1019
1020
    // Codecs which don't use external blocks
1021
0
    case E_CONST_BYTE:
1022
0
    case E_CONST_INT:
1023
0
       codec->out = NULL;
1024
0
       break;
1025
1026
    // Codecs that emit directly to external blocks
1027
233k
    case E_EXTERNAL:
1028
233k
    case E_VARINT_UNSIGNED:
1029
233k
    case E_VARINT_SIGNED:
1030
233k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1031
0
            return -1;
1032
233k
        codec->u.external.content_id = ds_id;
1033
233k
        codec->out = s->block[ds_id];
1034
233k
        break;
1035
1036
125k
    case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1037
125k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1038
0
            return -1;
1039
125k
        codec->u.byte_array_stop.content_id = ds_id;
1040
125k
        codec->out = s->block[ds_id];
1041
125k
        break;
1042
1043
1044
    // Codecs that contain sub-codecs which may in turn emit to external blocks
1045
62.9k
    case E_BYTE_ARRAY_LEN: {
1046
62.9k
        cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1047
62.9k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1048
0
            return -1;
1049
62.9k
        bal = codec->u.e_byte_array_len.val_codec;
1050
62.9k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1051
0
            return -1;
1052
1053
62.9k
        break;
1054
62.9k
    }
1055
1056
62.9k
    case E_XRLE:
1057
0
        if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1058
                                //ds_id == DS_QS ? DS_QS_len : ds_id))
1059
0
            return -1;
1060
0
        if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1061
0
            return -1;
1062
1063
0
        break;
1064
1065
0
    case E_XPACK:
1066
0
        if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1067
0
            return -1;
1068
0
        codec->out = cram_new_block(0, 0); // ephemeral
1069
0
        if (!codec->out)
1070
0
            return -1;
1071
1072
0
        break;
1073
1074
0
    case E_XDELTA:
1075
0
        if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1076
0
            return -1;
1077
0
        codec->out = cram_new_block(0, 0); // ephemeral
1078
0
        if (!codec->out)
1079
0
            return -1;
1080
1081
0
        break;
1082
1083
0
    default:
1084
0
        break;
1085
1.23M
    }
1086
1087
1.23M
    return 0;
1088
1.23M
}
1089
1090
/*
1091
 * Encodes a single slice from a container
1092
 *
1093
 * Returns 0 on success
1094
 *        -1 on failure
1095
 */
1096
static int cram_encode_slice(cram_fd *fd, cram_container *c,
1097
                             cram_block_compression_hdr *h, cram_slice *s,
1098
62.9k
                             int embed_ref) {
1099
62.9k
    int rec, r = 0;
1100
62.9k
    int64_t last_pos;
1101
62.9k
    enum cram_DS_ID id;
1102
1103
    /*
1104
     * Slice external blocks:
1105
     * ID 0 => base calls (insertions, soft-clip)
1106
     * ID 1 => qualities
1107
     * ID 2 => names
1108
     * ID 3 => TS (insert size), NP (next frag)
1109
     * ID 4 => tag values
1110
     * ID 6 => tag IDs (TN), if CRAM_V1.0
1111
     * ID 7 => TD tag dictionary, if !CRAM_V1.0
1112
     */
1113
1114
    /* Create cram slice header */
1115
62.9k
    s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
1116
62.9k
        ? DS_ref
1117
62.9k
        : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1118
62.9k
    s->hdr->record_counter = c->num_records + c->record_counter;
1119
62.9k
    c->num_records += s->hdr->num_records;
1120
1121
62.9k
    int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1122
62.9k
    s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1123
62.9k
    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1124
62.9k
    if (!s->block || !s->hdr->block_content_ids)
1125
0
        return -1;
1126
1127
    // Create first fixed blocks, always external.
1128
    // CORE
1129
62.9k
    if (!(s->block[0] = cram_new_block(CORE, 0)))
1130
0
        return -1;
1131
1132
    // TN block for CRAM v1
1133
62.9k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
1134
0
        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1135
0
            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1136
0
            h->codecs[DS_TN]->u.external.content_id = DS_TN;
1137
0
        } else {
1138
0
            s->block[DS_TN] = s->block[0];
1139
0
        }
1140
0
    }
1141
1142
    // Embedded reference
1143
62.9k
    if (embed_ref>0) {
1144
28.7k
        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1145
0
            return -1;
1146
28.7k
        s->ref_id = DS_ref; // needed?
1147
28.7k
        BLOCK_APPEND(s->block[DS_ref],
1148
28.7k
                     c->ref + s->hdr->ref_seq_start - c->ref_start,
1149
28.7k
                     s->hdr->ref_seq_span);
1150
28.7k
    }
1151
1152
    /*
1153
     * All the data-series blocks if appropriate.
1154
     */
1155
1.76M
    for (id = DS_QS; id < DS_TN; id++) {
1156
1.70M
        if (cram_allocate_block(h->codecs[id], s, id) < 0)
1157
0
            return -1;
1158
1.70M
    }
1159
1160
    /*
1161
     * Add in the external tag blocks too.
1162
     */
1163
62.9k
    if (c->tags_used) {
1164
62.9k
        int n;
1165
62.9k
        s->hdr->num_blocks = DS_END;
1166
252k
        for (n = 0; n < s->naux_block; n++) {
1167
189k
            s->block[s->hdr->num_blocks++] = s->aux_block[n];
1168
189k
            s->aux_block[n] = NULL;
1169
189k
        }
1170
62.9k
    }
1171
1172
    /* Encode reads */
1173
62.9k
    last_pos = s->hdr->ref_seq_start;
1174
7.00M
    for (rec = 0; rec < s->hdr->num_records; rec++) {
1175
6.94M
        cram_record *cr = &s->crecs[rec];
1176
6.94M
        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1177
0
            return -1;
1178
6.94M
    }
1179
1180
62.9k
    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1181
62.9k
    s->block[0]->comp_size = s->block[0]->uncomp_size;
1182
1183
    // Make sure the fixed blocks point to the correct sources
1184
62.9k
    if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1185
62.9k
    s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1186
62.9k
    if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1187
62.9k
    s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1188
62.9k
    if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1189
62.9k
    s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1190
62.9k
    if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1191
62.9k
    s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1192
1193
    // Finalise any data transforms.
1194
1.76M
    for (id = DS_QS; id < DS_TN; id++) {
1195
1.70M
       if (h->codecs[id] && h->codecs[id]->flush)
1196
0
           h->codecs[id]->flush(h->codecs[id]);
1197
1.70M
    }
1198
1199
    // Ensure block sizes are up to date.
1200
3.14M
    for (id = 1; id < s->hdr->num_blocks; id++) {
1201
3.08M
        if (!s->block[id] || s->block[id] == s->block[0])
1202
2.44M
            continue;
1203
1204
640k
        if (s->block[id]->uncomp_size == 0)
1205
640k
            BLOCK_UPLEN(s->block[id]);
1206
640k
    }
1207
1208
    // Compress it all
1209
62.9k
    if (cram_compress_slice(fd, c, s) == -1)
1210
0
        return -1;
1211
1212
    // Collapse empty blocks and create hdr_block
1213
62.9k
    {
1214
62.9k
        int i, j;
1215
1216
62.9k
        s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1217
62.9k
                                            s->hdr->num_blocks * sizeof(int32_t));
1218
62.9k
        if (!s->hdr->block_content_ids)
1219
0
            return -1;
1220
1221
3.14M
        for (i = j = 1; i < s->hdr->num_blocks; i++) {
1222
3.08M
            if (!s->block[i] || s->block[i] == s->block[0])
1223
2.44M
                continue;
1224
640k
            if (s->block[i]->uncomp_size == 0) {
1225
306k
                cram_free_block(s->block[i]);
1226
306k
                s->block[i] = NULL;
1227
306k
                continue;
1228
306k
            }
1229
334k
            s->block[j] = s->block[i];
1230
334k
            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1231
334k
            j++;
1232
334k
        }
1233
62.9k
        s->hdr->num_content_ids = j-1;
1234
62.9k
        s->hdr->num_blocks = j;
1235
1236
62.9k
        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1237
121
            return -1;
1238
62.9k
    }
1239
1240
62.8k
    return r ? -1 : 0;
1241
1242
0
 block_err:
1243
0
    return -1;
1244
62.9k
}
1245
1246
6.94M
static inline const char *bam_data_end(bam1_t *b) {
1247
6.94M
    return (const char *)b->data + b->l_data;
1248
6.94M
}
1249
1250
/*
1251
 * A bounds checking version of bam_aux2i.
1252
 */
1253
4
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
1254
4
    int type = *aux++;
1255
4
    switch (type) {
1256
0
        case 'c':
1257
0
            if (aux_end - aux < 1) {
1258
0
                errno = EINVAL;
1259
0
                return 0;
1260
0
            }
1261
0
            return *(int8_t *)aux;
1262
1
        case 'C':
1263
1
            if (aux_end - aux < 1) {
1264
0
                errno = EINVAL;
1265
0
                return 0;
1266
0
            }
1267
1
            return *aux;
1268
0
        case 's':
1269
0
            if (aux_end - aux < 2) {
1270
0
                errno = EINVAL;
1271
0
                return 0;
1272
0
            }
1273
0
            return le_to_i16(aux);
1274
0
        case 'S':
1275
0
            if (aux_end - aux < 2) {
1276
0
                errno = EINVAL;
1277
0
                return 0;
1278
0
            }
1279
0
            return le_to_u16(aux);
1280
0
        case 'i':
1281
0
            if (aux_end - aux < 4) {
1282
0
                errno = EINVAL;
1283
0
                return 0;
1284
0
            }
1285
0
            return le_to_i32(aux);
1286
0
        case 'I':
1287
0
            if (aux_end - aux < 4) {
1288
0
                errno = EINVAL;
1289
0
                return 0;
1290
0
            }
1291
0
            return le_to_u32(aux);
1292
3
        default:
1293
3
            errno = EINVAL;
1294
4
    }
1295
3
    return 0;
1296
4
}
1297
1298
/*
1299
 * Returns the number of expected read names for this record.
1300
 */
1301
0
static int expected_template_count(bam_seq_t *b) {
1302
0
    int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1303
1304
0
    uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1305
0
    if (TC) {
1306
0
        int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
1307
0
        if (expected < n)
1308
0
            expected = n;
1309
0
    }
1310
1311
0
    if (!TC && bam_aux_get(b, "SA")) {
1312
        // We could count the semicolons, but we'd have to do this for
1313
        // read1, read2 and read(not-1-or-2) combining the results
1314
        // together.  This is a cheap and safe alternative for now.
1315
0
        expected = INT_MAX;
1316
0
    }
1317
1318
0
    return expected;
1319
0
}
1320
1321
/*
1322
 * Lossily reject read names.
1323
 *
1324
 * The rule here is that if all reads for this template reside in the
1325
 * same slice then we can lose the name.  Otherwise we keep them as we
1326
 * do not know when (or if) the other reads will turn up.
1327
 *
1328
 * Note there may be only 1 read (non-paired library) or more than 2
1329
 * reads (paired library with supplementary reads), or other weird
1330
 * setups.  We need to know how many are expected.  Ways to guess:
1331
 *
1332
 * - Flags (0x1 - has > 1 read)
1333
 * - TC aux field (not mandatory)
1334
 * - SA tags (count semicolons, NB per fragment so sum - hard)
1335
 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1336
 *
1337
 * Returns 0 on success
1338
 *        -1 on failure
1339
 */
1340
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1341
64.7k
                            int bam_start) {
1342
64.7k
    int r1, r2, ret = -1;
1343
1344
    // Initialise cram_flags
1345
7.01M
    for (r2 = 0; r2 < s->hdr->num_records; r2++)
1346
6.94M
        s->crecs[r2].cram_flags = 0;
1347
1348
64.7k
    if (!fd->lossy_read_names)
1349
64.7k
        return 0;
1350
1351
0
    khash_t(m_s2u64) *names = kh_init(m_s2u64);
1352
0
    if (!names)
1353
0
        goto fail;
1354
1355
    // 1: Iterate through names to count frequency
1356
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1357
        //cram_record *cr = &s->crecs[r2];
1358
0
        bam_seq_t *b = c->bams[r1];
1359
0
        khint_t k;
1360
0
        int n;
1361
0
        uint64_t e;
1362
0
        union {
1363
0
            uint64_t i64;
1364
0
            struct {
1365
0
                int32_t e,c; // expected & observed counts.
1366
0
            } counts;
1367
0
        } u;
1368
1369
0
        e = expected_template_count(b);
1370
0
        u.counts.e = e; u.counts.c = 1;
1371
1372
0
        k = kh_put(m_s2u64, names, bam_name(b), &n);
1373
0
        if (n == -1)
1374
0
            goto fail;
1375
1376
0
        if (n == 0) {
1377
            // not a new name
1378
0
            u.i64 = kh_val(names, k);
1379
0
            if (u.counts.e != e) {
1380
                // different expectation or already hit the max
1381
                //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1382
0
                kh_val(names, k) = 0;
1383
0
            } else {
1384
0
                u.counts.c++;
1385
0
                if (u.counts.e == u.counts.c) {
1386
                    // Reached expected count.
1387
0
                    kh_val(names, k) = -1;
1388
0
                } else {
1389
0
                    kh_val(names, k) = u.i64;
1390
0
                }
1391
0
            }
1392
0
        } else {
1393
            // new name
1394
0
            kh_val(names, k) = u.i64;
1395
0
        }
1396
0
    }
1397
1398
    // 2: Remove names if all present (hd.i == -1)
1399
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1400
0
        cram_record *cr = &s->crecs[r2];
1401
0
        bam_seq_t *b = c->bams[r1];
1402
0
        khint_t k;
1403
1404
0
        k = kh_get(m_s2u64, names, bam_name(b));
1405
1406
0
        if (k == kh_end(names))
1407
0
            goto fail;
1408
1409
0
        if (kh_val(names, k) == -1)
1410
0
            cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1411
0
    }
1412
1413
0
    ret = 0;
1414
0
 fail: // ret==-1
1415
1416
0
    if (names)
1417
0
        kh_destroy(m_s2u64, names);
1418
1419
0
    return ret;
1420
0
}
1421
1422
/*
1423
 * Adds the reading names.  We do this here as a separate pass rather
1424
 * than per record in the process_one_read calls as that function can
1425
 * go back and change the CRAM_FLAG_DETACHED status of a previously
1426
 * processed read if it subsequently determines the TLEN field is
1427
 * incorrect.  Given DETACHED reads always try to decode read names,
1428
 * we need to know their status before generating the read-name block.
1429
 *
1430
 * Output is an update s->name_blk, and cr->name / cr->name_len
1431
 * fields.
1432
 */
1433
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1434
63.0k
                          int bam_start) {
1435
63.0k
    int r1, r2;
1436
63.0k
    int keep_names = !fd->lossy_read_names;
1437
1438
63.0k
    for (r1 = bam_start, r2 = 0;
1439
7.00M
         r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1440
6.94M
         r1++, r2++) {
1441
6.94M
        cram_record *cr = &s->crecs[r2];
1442
6.94M
        bam_seq_t *b = c->bams[r1];
1443
1444
6.94M
        cr->name        = BLOCK_SIZE(s->name_blk);
1445
6.94M
        if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1446
6.94M
            if (CRAM_MAJOR_VERS(fd->version) >= 4
1447
6.94M
                && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1448
6.94M
                && cr->mate_line) {
1449
                // Dedup read names in V4
1450
0
                BLOCK_APPEND(s->name_blk, "\0", 1);
1451
0
                cr->name_len    = 1;
1452
6.94M
            } else {
1453
6.94M
                BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1454
6.94M
                cr->name_len    = bam_name_len(b);
1455
6.94M
            }
1456
6.94M
        } else {
1457
            // Can only discard duplicate names if not detached
1458
0
            cr->name_len = 0;
1459
0
        }
1460
1461
6.94M
        if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1462
0
            goto block_err;
1463
6.94M
    }
1464
1465
63.0k
    return 0;
1466
1467
0
 block_err:
1468
0
    return -1;
1469
63.0k
}
1470
1471
// CRAM version >= 3.1
1472
97.4k
#define CRAM_ge31(v) ((v) >= 0x301)
1473
1474
// Returns the next cigar op code: one of the BAM_C* codes,
1475
// or -1 if no more are present.
1476
static inline
1477
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
1478
9.79k
                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
1479
17.4k
    for(;;) {
1480
43.4k
        while (*cig_len == 0) {
1481
26.0k
            if (*cig_ind < ncigar) {
1482
25.9k
                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
1483
25.9k
                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
1484
25.9k
                (*cig_ind)++;
1485
25.9k
            } else {
1486
32
                return -1;
1487
32
            }
1488
26.0k
        }
1489
1490
17.4k
        if (skip[*cig_op]) {
1491
7.69k
            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
1492
7.69k
            *cig_len = 0;
1493
7.69k
            continue;
1494
7.69k
        }
1495
1496
9.76k
        (*cig_len)--;
1497
9.76k
        break;
1498
17.4k
    }
1499
1500
9.76k
    return *cig_op;
1501
9.79k
}
1502
1503
// Ensure ref and hist are large enough.
1504
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
1505
2.27M
                             hts_pos_t ref_start, hts_pos_t *ref_end) {
1506
2.27M
    if (pos < ref_start)
1507
326
        return -1;
1508
2.27M
    if (pos < *ref_end)
1509
2.23M
        return 0;
1510
1511
    // realloc
1512
34.4k
    if (pos - ref_start > UINT_MAX)
1513
88
        return -2; // protect overflow in new_end calculation
1514
1515
34.3k
    hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
1516
34.3k
    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
1517
1518
    // Refuse to work on excessively large blocks.
1519
    // We'll just switch to referenceless encoding, which is probably better
1520
    // here as this must be very sparse data anyway.
1521
34.3k
    if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
1522
906
        return -2;
1523
1524
33.4k
    char *tmp = realloc(*ref, new_end-ref_start+1);
1525
33.4k
    if (!tmp)
1526
0
        return -1;
1527
33.4k
    *ref = tmp;
1528
1529
33.4k
    uint32_t (*tmp5)[5] = realloc(**hist,
1530
33.4k
                                  (new_end - ref_start)*sizeof(**hist));
1531
33.4k
    if (!tmp5)
1532
0
        return -1;
1533
33.4k
    *hist = tmp5;
1534
33.4k
    *ref_end = new_end;
1535
1536
    // initialise
1537
33.4k
    old_end -= ref_start;
1538
33.4k
    new_end -= ref_start;
1539
33.4k
    memset(&(*ref)[old_end],  0,  new_end-old_end);
1540
33.4k
    memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
1541
1542
33.4k
    return 0;
1543
33.4k
}
1544
1545
// Walk through MD + seq to generate ref
1546
// Returns 1 on success, <0 on failure
1547
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
1548
                              hts_pos_t ref_start, hts_pos_t *ref_end,
1549
33.0k
                              const uint8_t *MD) {
1550
33.0k
    uint8_t *seq = bam_get_seq(b);
1551
33.0k
    uint32_t *cigar = bam_get_cigar(b);
1552
33.0k
    uint32_t ncigar = b->core.n_cigar;
1553
33.0k
    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
1554
1555
33.0k
    int iseq = 0, next_op;
1556
33.0k
    hts_pos_t iref = b->core.pos - ref_start;
1557
1558
    // Skip INS, REF_SKIP, *CLIP, PAD. and BACK.
1559
33.0k
    static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
1560
39.3k
    while (iseq < b->core.l_qseq && *MD) {
1561
15.4k
        if (isdigit(*MD)) {
1562
            // match
1563
10.0k
            int overflow = 0;
1564
10.0k
            int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
1565
10.0k
            if (overflow ||
1566
10.0k
                extend_ref(ref, hist, iref+ref_start + len,
1567
8.62k
                           ref_start, ref_end) < 0)
1568
1.57k
                return -1;
1569
8.54k
            while (iseq < b->core.l_qseq && len) {
1570
                // rewrite to have internal loops?
1571
4.49k
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1572
4.49k
                                             &iseq, &cig_ind, &cig_op,
1573
4.49k
                                             &cig_len)) < 0)
1574
3
                    return -1;
1575
1576
4.48k
                if (next_op != BAM_CMATCH &&
1577
4.48k
                    next_op != BAM_CEQUAL) {
1578
4.45k
                    hts_log_info("MD:Z and CIGAR are incompatible for "
1579
4.45k
                                 "record %s", bam_get_qname(b));
1580
4.45k
                    return -1;
1581
4.45k
                }
1582
1583
                // Short-cut loop over same cigar op for efficiency
1584
36
                cig_len++;
1585
36
                do {
1586
36
                    cig_len--;
1587
36
                    (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
1588
36
                    iseq++;
1589
36
                    len--;
1590
36
                } while (cig_len && iseq < b->core.l_qseq && len);
1591
36
            }
1592
4.05k
            if (len > 0)
1593
33
                return -1; // MD is longer than seq
1594
5.37k
        } else if (*MD == '^') {
1595
            // deletion
1596
679
            MD++;
1597
679
            while (isalpha(*MD)) {
1598
608
                if (extend_ref(ref, hist, iref+ref_start, ref_start,
1599
608
                               ref_end) < 0)
1600
1
                    return -1;
1601
607
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1602
607
                                             &iseq, &cig_ind, &cig_op,
1603
607
                                             &cig_len)) < 0)
1604
6
                    return -1;
1605
1606
601
                if (next_op != BAM_CDEL) {
1607
29
                    hts_log_info("MD:Z and CIGAR are incompatible");
1608
29
                    return -1;
1609
29
                }
1610
1611
572
                (*ref)[iref++] = *MD++ & ~0x20;
1612
572
            }
1613
4.69k
        } else {
1614
            // substitution
1615
4.69k
            if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
1616
3
                return -1;
1617
4.69k
            if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1618
4.69k
                                         &iseq, &cig_ind, &cig_op,
1619
4.69k
                                         &cig_len)) < 0)
1620
23
                return -1;
1621
1622
4.67k
            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
1623
3.04k
                hts_log_info("MD:Z and CIGAR are incompatible");
1624
3.04k
                return -1;
1625
3.04k
            }
1626
1627
1.63k
            (*ref)[iref++] = *MD++ & ~0x20;
1628
1.63k
            iseq++;
1629
1.63k
        }
1630
15.4k
    }
1631
1632
23.8k
    return 1;
1633
33.0k
}
1634
1635
// Append a sequence to a ref/consensus structure.
1636
// We maintain both an absolute refefence (ACGTN where MD:Z is
1637
// present) and a 5-way frequency array for when no MD:Z is known.
1638
// We then subsequently convert the 5-way frequencies to a consensus
1639
// ref in a second pass.
1640
//
1641
// Returns >=0 on success,
1642
//         -1 on failure (eg inconsistent data)
1643
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
1644
80.4k
                           hts_pos_t ref_start, hts_pos_t *ref_end) {
1645
80.4k
    const uint8_t *MD = bam_aux_get(b, "MD");
1646
80.4k
    int ret = 0;
1647
80.4k
    if (MD && *MD == 'Z') {
1648
        // We can use MD to directly compute the reference
1649
33.0k
        int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
1650
1651
33.0k
        if (ret > 0)
1652
23.8k
            return ret;
1653
33.0k
    }
1654
1655
    // Otherwise we just use SEQ+CIGAR and build a consensus which we later
1656
    // turn into a fake reference
1657
56.5k
    uint32_t *cigar = bam_get_cigar(b);
1658
56.5k
    uint32_t ncigar = b->core.n_cigar;
1659
56.5k
    uint32_t i, j;
1660
56.5k
    hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
1661
56.5k
    uint8_t *seq = bam_get_seq(b);
1662
2.54M
    for (i = 0; i < ncigar; i++) {
1663
2.49M
        switch (bam_cigar_op(cigar[i])) {
1664
98.5k
        case BAM_CSOFT_CLIP:
1665
141k
        case BAM_CINS:
1666
141k
            iseq += bam_cigar_oplen(cigar[i]);
1667
141k
            break;
1668
1669
2.18M
        case BAM_CMATCH:
1670
2.22M
        case BAM_CEQUAL:
1671
2.23M
        case BAM_CDIFF: {
1672
2.23M
            int len = bam_cigar_oplen(cigar[i]);
1673
            // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
1674
2.23M
            static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
1675
1676
2.23M
            if (extend_ref(ref, hist, iref+ref_start + len,
1677
2.23M
                           ref_start, ref_end) < 0)
1678
783
                return -1;
1679
2.22M
            if (iseq + len <= b->core.l_qseq) {
1680
                // Nullify failed MD:Z if appropriate
1681
18.2k
                if (ret < 0)
1682
0
                    memset(&(*ref)[iref], 0, len);
1683
1684
26.2k
                for (j = 0; j < len; j++, iref++, iseq++)
1685
8.06k
                    (*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
1686
2.21M
            } else {
1687
                // Probably a 2ndary read with seq "*"
1688
2.21M
                iseq += len;
1689
2.21M
                iref += len;
1690
2.21M
            }
1691
2.22M
            break;
1692
2.23M
        }
1693
1694
33.8k
        case BAM_CDEL:
1695
36.6k
        case BAM_CREF_SKIP:
1696
36.6k
            iref += bam_cigar_oplen(cigar[i]);
1697
2.49M
        }
1698
2.49M
    }
1699
1700
55.7k
    return 1;
1701
56.5k
}
1702
1703
// Automatically generates the reference and stashed it in c->ref, also
1704
// setting c->ref_start and c->ref_end.
1705
//
1706
// If we have MD:Z tags then we use them to directly infer the reference,
1707
// along with SEQ + CIGAR.  Otherwise we use SEQ/CIGAR only to build up
1708
// a consensus and then assume the reference as the majority rule.
1709
//
1710
// In this latter scenario we need to be wary of auto-generating MD and NM
1711
// during decode, but that's handled elsewhere via an additional aux tag.
1712
//
1713
// Returns 0 on success,
1714
//        -1 on failure
1715
30.1k
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
1716
    // TODO: if we can find an external reference then use it, even if the
1717
    // user told us to do embed_ref=2.
1718
30.1k
    char *ref = NULL;
1719
30.1k
    uint32_t (*hist)[5] = NULL;
1720
30.1k
    hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
1721
30.1k
    if (ref_start < 0)
1722
5
        return -1; // cannot build consensus from unmapped data
1723
1724
    // initial allocation
1725
30.1k
    if (extend_ref(&ref, &hist,
1726
30.1k
                   c->bams[r1 + s->hdr->num_records-1]->core.pos +
1727
30.1k
                   c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
1728
30.1k
                   ref_start, &ref_end) < 0)
1729
417
        return -1;
1730
1731
    // Add each bam file to the reference/consensus arrays
1732
29.7k
    int r2;
1733
29.7k
    hts_pos_t last_pos = -1;
1734
109k
    for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1735
80.5k
        if (c->bams[r1]->core.pos < last_pos) {
1736
101
            hts_log_error("Cannot build reference with unsorted data");
1737
101
            goto err;
1738
101
        }
1739
80.4k
        last_pos = c->bams[r1]->core.pos;
1740
80.4k
        if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
1741
783
            goto err;
1742
80.4k
    }
1743
1744
    // Compute the consensus
1745
28.8k
    hts_pos_t i;
1746
2.57G
    for (i = 0; i < ref_end-ref_start; i++) {
1747
2.57G
        if (!ref[i]) {
1748
2.57G
            int max_v = 0, max_j = 4, j;
1749
12.8G
            for (j = 0; j < 4; j++)
1750
                // don't call N (j==4) unless no coverage
1751
10.2G
                if (max_v < hist[i][j])
1752
1.91k
                    max_v = hist[i][j], max_j = j;
1753
2.57G
            ref[i] = "ACGTN"[max_j];
1754
2.57G
        }
1755
2.57G
    }
1756
28.8k
    free(hist);
1757
1758
    // Put the reference in place so it appears to be an external
1759
    // ref file.
1760
28.8k
    c->ref       = ref;
1761
28.8k
    c->ref_start = ref_start+1;
1762
28.8k
    c->ref_end   = ref_end+1;
1763
28.8k
    c->ref_free  = 1;
1764
28.8k
    return 0;
1765
1766
884
 err:
1767
884
    free(ref);
1768
884
    free(hist);
1769
884
    return -1;
1770
29.7k
}
1771
1772
// Check if the SQ M5 tag matches the reference we've loaded.
1773
0
static int validate_md5(cram_fd *fd, int ref_id) {
1774
0
    if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
1775
0
        return 0;
1776
1777
    // Have we already checked this ref?
1778
0
    if (fd->refs->ref_id[ref_id]->validated_md5)
1779
0
        return 0;
1780
1781
    // Check if we have the MD5 known.
1782
    // We should, but maybe we're using embedded references?
1783
0
    sam_hrecs_t *hrecs = fd->header->hrecs;
1784
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
1785
0
                                                 hrecs->ref[ref_id].name);
1786
0
    if (!ty)
1787
0
        return 0;
1788
1789
0
    sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
1790
0
    if (!m5tag)
1791
0
        return 0;
1792
1793
    // It's known, so compute md5 on the loaded reference sequence.
1794
0
    char *ref = fd->refs->ref_id[ref_id]->seq;
1795
0
    int64_t len = fd->refs->ref_id[ref_id]->length;
1796
0
    hts_md5_context *md5;
1797
0
    char unsigned buf[16];
1798
0
    char buf2[33];
1799
1800
0
    if (!(md5 = hts_md5_init()))
1801
0
        return -1;
1802
0
    hts_md5_update(md5, ref, len);
1803
0
    hts_md5_final(buf, md5);
1804
0
    hts_md5_destroy(md5);
1805
0
    hts_md5_hex(buf2, buf);
1806
1807
    // Compare it to header @SQ M5 tag
1808
0
    if (strcmp(m5tag->str+3, buf2)) {
1809
0
        hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
1810
0
                      hrecs->ref[ref_id].name);
1811
0
        hts_log_error("Please use the correct reference, or "
1812
0
                      "consider using embed_ref=2");
1813
0
        return -1;
1814
0
    }
1815
0
    fd->refs->ref_id[ref_id]->validated_md5 = 1;
1816
1817
0
    return 0;
1818
0
}
1819
1820
/*
1821
 * Encodes all slices in a container into blocks.
1822
 * Returns 0 on success
1823
 *        -1 on failure
1824
 */
1825
63.4k
int cram_encode_container(cram_fd *fd, cram_container *c) {
1826
63.4k
    int i, j, slice_offset;
1827
63.4k
    cram_block_compression_hdr *h = c->comp_hdr;
1828
63.4k
    cram_block *c_hdr;
1829
63.4k
    int multi_ref = 0;
1830
63.4k
    int r1, r2, sn, nref, embed_ref, no_ref;
1831
63.4k
    spare_bams *spares;
1832
1833
63.4k
    if (!c->bams)
1834
0
        goto err;
1835
1836
63.4k
    if (CRAM_MAJOR_VERS(fd->version) == 1)
1837
0
        goto err;
1838
1839
//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1840
63.4k
#define goto_err goto err
1841
1842
    // Don't try embed ref if we repeatedly fail
1843
63.4k
    pthread_mutex_lock(&fd->ref_lock);
1844
63.4k
    int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
1845
63.4k
    if (!failed_embed && c->embed_ref == -2 && c->ref_id >= 0) {
1846
863
        hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
1847
863
        fd->no_ref = c->no_ref = 0;
1848
863
        fd->embed_ref = c->embed_ref = 2;
1849
62.6k
    } else if (failed_embed && c->embed_ref == -2) {
1850
        // We've tried several times, so this time give up for good
1851
35
        hts_log_warning("Keeping non-ref mode from now on");
1852
35
        fd->embed_ref = c->embed_ref = 0;
1853
35
    }
1854
63.4k
    pthread_mutex_unlock(&fd->ref_lock);
1855
1856
64.8k
 restart:
1857
    /* Cache references up-front if we have unsorted access patterns */
1858
64.8k
    pthread_mutex_lock(&fd->ref_lock);
1859
64.8k
    nref = fd->refs->nref;
1860
64.8k
    pthread_mutex_unlock(&fd->ref_lock);
1861
64.8k
    embed_ref = c->embed_ref;
1862
64.8k
    no_ref = c->no_ref;
1863
1864
    /* To create M5 strings */
1865
    /* Fetch reference sequence */
1866
64.8k
    if (!no_ref) {
1867
62.8k
        if (!c->bams || !c->curr_c_rec || !c->bams[0])
1868
53
            goto_err;
1869
62.7k
        bam_seq_t *b = c->bams[0];
1870
1871
62.7k
        if (embed_ref <= 1) {
1872
4.09k
            char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1873
4.09k
            if (!ref && bam_ref(b) >= 0) {
1874
23
                if (!c->pos_sorted) {
1875
                    // TODO: maybe also check fd->no_ref?
1876
1
                    hts_log_warning("Failed to load reference #%d",
1877
1
                                    bam_ref(b));
1878
1
                    hts_log_warning("Switching to non-ref mode");
1879
1880
1
                    pthread_mutex_lock(&fd->ref_lock);
1881
1
                    c->embed_ref = fd->embed_ref = 0;
1882
1
                    c->no_ref = fd->no_ref = 1;
1883
1
                    pthread_mutex_unlock(&fd->ref_lock);
1884
1
                    goto restart;
1885
1
                }
1886
1887
22
                if (c->multi_seq || embed_ref == 0) {
1888
0
                    hts_log_error("Failed to load reference #%d", bam_ref(b));
1889
0
                    return -1;
1890
0
                }
1891
22
                hts_log_warning("Failed to load reference #%d", bam_ref(b));
1892
22
                hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
1893
22
                                " reference");
1894
22
                if (embed_ref <= 0)
1895
22
                    hts_log_warning("NOTE: the CRAM file will be bigger than"
1896
22
                                    " using an external reference");
1897
22
                pthread_mutex_lock(&fd->ref_lock);
1898
22
                embed_ref = c->embed_ref = fd->embed_ref = 2;
1899
22
                pthread_mutex_unlock(&fd->ref_lock);
1900
22
                goto auto_ref;
1901
4.06k
            } else if (ref) {
1902
0
                if (validate_md5(fd, c->ref_seq_id) < 0)
1903
0
                    goto_err;
1904
0
            }
1905
4.06k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1906
0
                c->ref_seq_id = c->ref_id;
1907
0
                c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1908
0
                c->ref_start = 1;
1909
0
                c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1910
0
            }
1911
58.6k
        } else {
1912
58.7k
        auto_ref:
1913
            // Auto-embed ref.
1914
            // This starts as 'N' and is amended on-the-fly as we go
1915
            // based on MD:Z tags.
1916
58.7k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1917
30.1k
                c->ref = NULL;
1918
                // c->ref_free is boolean; whether to free c->ref.  In this
1919
                // case c->ref will be our auto-embedded sequence instead of
1920
                // a "global" portion of reference from fd->refs.
1921
                // Do not confuse with fd->ref_free which is a pointer to a
1922
                // reference string to free.
1923
30.1k
                c->ref_free = 1;
1924
30.1k
            } else {
1925
                // Double check for broken input.  We shouldn't have
1926
                // embedded references enabled for unmapped data, but our
1927
                // data could be broken.
1928
28.5k
                embed_ref = 0;
1929
28.5k
                no_ref = c->no_ref = 1;
1930
28.5k
            }
1931
58.7k
        }
1932
62.7k
        c->ref_seq_id = c->ref_id;
1933
62.7k
    } else {
1934
1.98k
        c->ref_id = bam_ref(c->bams[0]);
1935
1.98k
        cram_ref_incr(fd->refs, c->ref_id);
1936
1.98k
        c->ref_seq_id = c->ref_id;
1937
1.98k
    }
1938
1939
64.7k
    if (!no_ref && c->refs_used) {
1940
1.53k
        for (i = 0; i < nref; i++) {
1941
1.06k
            if (c->refs_used[i]) {
1942
444
                if (cram_get_ref(fd, i, 1, 0)) {
1943
0
                    if (validate_md5(fd, i) < 0)
1944
0
                        goto_err;
1945
444
                } else {
1946
444
                    hts_log_warning("Failed to find reference, "
1947
444
                                    "switching to non-ref mode");
1948
444
                    no_ref = c->no_ref = 1;
1949
444
                }
1950
444
            }
1951
1.06k
        }
1952
468
    }
1953
1954
    /* Turn bams into cram_records and gather basic stats */
1955
127k
    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1956
64.7k
        cram_slice *s = c->slices[sn];
1957
64.7k
        int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1958
1959
64.7k
        int r1_start = r1;
1960
1961
64.7k
        assert(sn < c->curr_slice);
1962
1963
        // Discover which read names *may* be safely removed.
1964
        // Ie which ones have all their records in this slice.
1965
64.7k
        if (lossy_read_names(fd, c, s, r1_start) != 0)
1966
0
            return -1;
1967
1968
        // Tracking of MD tags so we can spot when the auto-generated values
1969
        // will differ from the current stored ones.  The kstring here is
1970
        // simply to avoid excessive malloc and free calls.  All initialisation
1971
        // is done within process_one_read().
1972
64.7k
        kstring_t MD = {0};
1973
1974
        // Embed consensus / MD-generated ref
1975
64.7k
        if (embed_ref == 2) {
1976
30.1k
            if (c->ref_id < 0 || cram_generate_reference(c, s, r1) < 0) {
1977
                // Should this be a permanent thing via fd->no_ref?
1978
                // Doing so means we cannot easily switch back again should
1979
                // things fix themselves later on.  This is likely not a
1980
                // concern though as failure to generate a reference implies
1981
                // unsorted data which is rarely recovered from.
1982
1983
                // Only if sn == 0.  We're hosed if we're on the 2nd slice and
1984
                // the first worked, as no-ref is a container global param.
1985
1.30k
                if (sn > 0) {
1986
0
                    hts_log_error("Failed to build reference, "
1987
0
                                  "switching to non-ref mode");
1988
0
                    return -1;
1989
1.30k
                } else {
1990
1.30k
                    hts_log_warning("Failed to build reference, "
1991
1.30k
                                    "switching to non-ref mode");
1992
1.30k
                }
1993
1.30k
                pthread_mutex_lock(&fd->ref_lock);
1994
1.30k
                c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
1995
1.30k
                c->no_ref = fd->no_ref = 1;
1996
1.30k
                fd->no_ref_counter++; // more likely to keep permanent action
1997
1.30k
                pthread_mutex_unlock(&fd->ref_lock);
1998
1.30k
                failed_embed = 1;
1999
1.30k
                goto restart;
2000
28.8k
            } else {
2001
28.8k
                pthread_mutex_lock(&fd->ref_lock);
2002
28.8k
                fd->no_ref_counter -= (fd->no_ref_counter > 0);
2003
28.8k
                pthread_mutex_unlock(&fd->ref_lock);
2004
28.8k
            }
2005
2006
28.8k
            if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
2007
22.0k
                c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
2008
28.8k
        }
2009
2010
        // Iterate through records creating the cram blocks for some
2011
        // fields and just gathering stats for others.
2012
7.00M
        for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
2013
6.94M
            cram_record *cr = &s->crecs[r2];
2014
6.94M
            bam_seq_t *b = c->bams[r1];
2015
2016
            /* If multi-ref we need to cope with changing reference per seq */
2017
6.94M
            if (c->multi_seq && !no_ref) {
2018
2.22k
                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
2019
0
                    if (c->ref_seq_id >= 0)
2020
0
                        cram_ref_decr(fd->refs, c->ref_seq_id);
2021
2022
0
                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
2023
0
                        hts_log_error("Failed to load reference #%d", bam_ref(b));
2024
0
                        free(MD.s);
2025
0
                        return -1;
2026
0
                    }
2027
0
                    if (validate_md5(fd, bam_ref(b)) < 0)
2028
0
                        return -1;
2029
2030
0
                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
2031
0
                    if (!fd->refs->ref_id[c->ref_seq_id]->seq)
2032
0
                        return -1;
2033
0
                    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
2034
0
                    c->ref_start = 1;
2035
0
                    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
2036
0
                }
2037
2.22k
            }
2038
2039
6.94M
            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
2040
6.94M
                                 no_ref) != 0) {
2041
368
                free(MD.s);
2042
368
                return -1;
2043
368
            }
2044
2045
6.94M
            if (first_base > cr->apos)
2046
63.7k
                first_base = cr->apos;
2047
2048
6.94M
            if (last_base < cr->aend)
2049
65.4k
                last_base = cr->aend;
2050
6.94M
        }
2051
2052
63.0k
        free(MD.s);
2053
2054
        // Process_one_read doesn't add read names as it can change
2055
        // its mind during the loop on the CRAM_FLAG_DETACHED setting
2056
        // of earlier records (if it detects the auto-generation of
2057
        // TLEN is incorrect).  This affects which read-names can be
2058
        // lossily compressed, so we do these in another pass.
2059
63.0k
        if (add_read_names(fd, c, s, r1_start) < 0)
2060
0
            return -1;
2061
2062
63.0k
        if (c->multi_seq) {
2063
972
            s->hdr->ref_seq_id    = -2;
2064
972
            s->hdr->ref_seq_start = 0;
2065
972
            s->hdr->ref_seq_span  = 0;
2066
62.1k
        } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
2067
            // Spec states span=0, but it broke our range queries.
2068
            // See commit message for this and prior.
2069
32.4k
            s->hdr->ref_seq_id    = -1;
2070
32.4k
            s->hdr->ref_seq_start = 0;
2071
32.4k
            s->hdr->ref_seq_span  = 0;
2072
32.4k
        } else {
2073
29.6k
            s->hdr->ref_seq_id    = c->ref_id;
2074
29.6k
            s->hdr->ref_seq_start = first_base;
2075
29.6k
            s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
2076
29.6k
        }
2077
63.0k
        s->hdr->num_records = r2;
2078
2079
        // Processed a slice, now stash the aux blocks so the next
2080
        // slice can start aggregating them from the start again.
2081
63.0k
        if (c->tags_used->n_occupied) {
2082
53.6k
            int ntags = c->tags_used->n_occupied;
2083
53.6k
            s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
2084
53.6k
            if (!s->aux_block)
2085
0
                return -1;
2086
2087
53.6k
            khint_t k;
2088
2089
53.6k
            s->naux_block = 0;
2090
466k
            for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2091
412k
                if (!kh_exist(c->tags_used, k))
2092
223k
                    continue;
2093
2094
189k
                cram_tag_map *tm = kh_val(c->tags_used, k);
2095
189k
                if (!tm) goto_err;
2096
189k
                if (!tm->blk) continue;
2097
189k
                s->aux_block[s->naux_block++] = tm->blk;
2098
189k
                tm->blk = NULL;
2099
189k
                if (!tm->blk2) continue;
2100
0
                s->aux_block[s->naux_block++] = tm->blk2;
2101
0
                tm->blk2 = NULL;
2102
0
            }
2103
53.6k
            assert(s->naux_block <= 2*c->tags_used->n_occupied);
2104
53.6k
        }
2105
63.0k
    }
2106
2107
63.0k
    if (c->multi_seq && !no_ref) {
2108
24
        if (c->ref_seq_id >= 0)
2109
0
            cram_ref_decr(fd->refs, c->ref_seq_id);
2110
24
    }
2111
2112
    /* Link our bams[] array onto the spare bam list for reuse */
2113
63.0k
    spares = malloc(sizeof(*spares));
2114
63.0k
    if (!spares) goto_err;
2115
63.0k
    pthread_mutex_lock(&fd->bam_list_lock);
2116
63.0k
    spares->bams = c->bams;
2117
63.0k
    spares->next = fd->bl;
2118
63.0k
    fd->bl = spares;
2119
63.0k
    pthread_mutex_unlock(&fd->bam_list_lock);
2120
63.0k
    c->bams = NULL;
2121
2122
    /* Detect if a multi-seq container */
2123
63.0k
    cram_stats_encoding(fd, c->stats[DS_RI]);
2124
63.0k
    multi_ref = c->stats[DS_RI]->nvals > 1;
2125
63.0k
    pthread_mutex_lock(&fd->metrics_lock);
2126
63.0k
    fd->last_RI_count = c->stats[DS_RI]->nvals;
2127
63.0k
    pthread_mutex_unlock(&fd->metrics_lock);
2128
2129
2130
63.0k
    if (multi_ref) {
2131
418
        hts_log_info("Multi-ref container");
2132
418
        c->ref_seq_id = -2;
2133
418
        c->ref_seq_start = 0;
2134
418
        c->ref_seq_span = 0;
2135
418
    }
2136
2137
2138
    /* Compute MD5s */
2139
63.0k
    no_ref = c->no_ref;
2140
63.0k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
2141
2142
126k
    for (i = 0; i < c->curr_slice; i++) {
2143
63.0k
        cram_slice *s = c->slices[i];
2144
2145
63.0k
        if (CRAM_MAJOR_VERS(fd->version) != 1) {
2146
63.0k
            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
2147
28.7k
                hts_md5_context *md5 = hts_md5_init();
2148
28.7k
                if (!md5)
2149
0
                    return -1;
2150
28.7k
                hts_md5_update(md5,
2151
28.7k
                               c->ref + s->hdr->ref_seq_start - c->ref_start,
2152
28.7k
                               s->hdr->ref_seq_span);
2153
28.7k
                hts_md5_final(s->hdr->md5, md5);
2154
28.7k
                hts_md5_destroy(md5);
2155
34.3k
            } else {
2156
34.3k
                memset(s->hdr->md5, 0, 16);
2157
34.3k
            }
2158
63.0k
        }
2159
63.0k
    }
2160
2161
63.0k
    c->num_records = 0;
2162
63.0k
    c->num_blocks = 1; // cram_block_compression_hdr
2163
63.0k
    c->length = 0;
2164
2165
    //fprintf(stderr, "=== BF ===\n");
2166
63.0k
    h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
2167
63.0k
                                         c->stats[DS_BF], E_INT, NULL,
2168
63.0k
                                         fd->version, &fd->vv);
2169
63.0k
    if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
2170
2171
    //fprintf(stderr, "=== CF ===\n");
2172
63.0k
    h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
2173
63.0k
                                         c->stats[DS_CF], E_INT, NULL,
2174
63.0k
                                         fd->version, &fd->vv);
2175
63.0k
    if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
2176
2177
    //fprintf(stderr, "=== RN ===\n");
2178
    //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
2179
    //                                     c->stats[DS_RN], E_BYTE_ARRAY, NULL,
2180
    //                                     fd->version);
2181
2182
    //fprintf(stderr, "=== AP ===\n");
2183
63.0k
    if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
2184
61.7k
        if (c->pos_sorted)
2185
61.7k
            h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
2186
61.7k
                                                 c->stats[DS_AP],
2187
61.7k
                                                 is_v4 ? E_LONG : E_INT,
2188
61.7k
                                                 NULL, fd->version, &fd->vv);
2189
0
        else
2190
            // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
2191
0
            h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
2192
0
                                                       : E_EXTERNAL,
2193
0
                                                 NULL,
2194
0
                                                 is_v4 ? E_LONG : E_INT,
2195
0
                                                 NULL, fd->version, &fd->vv);
2196
61.7k
    } else {
2197
        // Removed BETA in v4.0.
2198
        // Should we consider dropping use of it for 3.0 too?
2199
1.35k
        hts_pos_t p[2] = {0, c->max_apos};
2200
1.35k
        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
2201
1.35k
                                             is_v4 ? E_LONG : E_INT,
2202
1.35k
                                             p, fd->version, &fd->vv);
2203
//      cram_xdelta_encoder e;
2204
//      e.word_size = is_v4 ? 8 : 4;
2205
//      e.sub_encoding = E_EXTERNAL;
2206
//      e.sub_codec_dat = (void *)DS_AP;
2207
//
2208
//      h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
2209
//                                           is_v4 ? E_LONG : E_INT,
2210
//                                           &e, fd->version, &fd->vv);
2211
1.35k
    }
2212
63.0k
    if (!h->codecs[DS_AP]) goto_err;
2213
2214
    //fprintf(stderr, "=== RG ===\n");
2215
62.9k
    h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
2216
62.9k
                                         c->stats[DS_RG],
2217
62.9k
                                         E_INT,
2218
62.9k
                                         NULL,
2219
62.9k
                                         fd->version, &fd->vv);
2220
62.9k
    if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
2221
2222
    //fprintf(stderr, "=== MQ ===\n");
2223
62.9k
    h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
2224
62.9k
                                         c->stats[DS_MQ], E_INT, NULL,
2225
62.9k
                                         fd->version, &fd->vv);
2226
62.9k
    if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
2227
2228
    //fprintf(stderr, "=== NS ===\n");
2229
62.9k
    h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
2230
62.9k
                                         c->stats[DS_NS], E_INT, NULL,
2231
62.9k
                                         fd->version, &fd->vv);
2232
62.9k
    if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
2233
2234
    //fprintf(stderr, "=== MF ===\n");
2235
62.9k
    h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
2236
62.9k
                                         c->stats[DS_MF], E_INT, NULL,
2237
62.9k
                                         fd->version, &fd->vv);
2238
62.9k
    if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
2239
2240
    //fprintf(stderr, "=== TS ===\n");
2241
62.9k
    h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
2242
62.9k
                                         c->stats[DS_TS],
2243
62.9k
                                         is_v4 ? E_LONG : E_INT,
2244
62.9k
                                         NULL, fd->version, &fd->vv);
2245
62.9k
    if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
2246
2247
    //fprintf(stderr, "=== NP ===\n");
2248
62.9k
    h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
2249
62.9k
                                         c->stats[DS_NP],
2250
62.9k
                                         is_v4 ? E_LONG : E_INT,
2251
62.9k
                                         NULL, fd->version, &fd->vv);
2252
62.9k
    if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
2253
2254
    //fprintf(stderr, "=== NF ===\n");
2255
62.9k
    h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
2256
62.9k
                                         c->stats[DS_NF], E_INT, NULL,
2257
62.9k
                                         fd->version, &fd->vv);
2258
62.9k
    if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
2259
2260
    //fprintf(stderr, "=== RL ===\n");
2261
62.9k
    h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
2262
62.9k
                                         c->stats[DS_RL], E_INT, NULL,
2263
62.9k
                                         fd->version, &fd->vv);
2264
62.9k
    if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
2265
2266
    //fprintf(stderr, "=== FN ===\n");
2267
62.9k
    h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
2268
62.9k
                                         c->stats[DS_FN], E_INT, NULL,
2269
62.9k
                                         fd->version, &fd->vv);
2270
62.9k
    if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
2271
2272
    //fprintf(stderr, "=== FC ===\n");
2273
62.9k
    h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
2274
62.9k
                                         c->stats[DS_FC], E_BYTE, NULL,
2275
62.9k
                                         fd->version, &fd->vv);
2276
62.9k
    if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
2277
2278
    //fprintf(stderr, "=== FP ===\n");
2279
62.9k
    h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
2280
62.9k
                                         c->stats[DS_FP], E_INT, NULL,
2281
62.9k
                                         fd->version, &fd->vv);
2282
62.9k
    if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
2283
2284
    //fprintf(stderr, "=== DL ===\n");
2285
62.9k
    h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
2286
62.9k
                                         c->stats[DS_DL], E_INT, NULL,
2287
62.9k
                                         fd->version, &fd->vv);
2288
62.9k
    if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
2289
2290
    //fprintf(stderr, "=== BA ===\n");
2291
62.9k
    h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
2292
62.9k
                                         c->stats[DS_BA], E_BYTE, NULL,
2293
62.9k
                                         fd->version, &fd->vv);
2294
62.9k
    if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
2295
2296
62.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2297
62.9k
        cram_byte_array_len_encoder e;
2298
2299
62.9k
        e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2300
62.9k
            ? E_VARINT_UNSIGNED
2301
62.9k
            : E_EXTERNAL;
2302
62.9k
        e.len_dat = (void *)DS_BB_len;
2303
        //e.len_dat = (void *)DS_BB;
2304
2305
62.9k
        e.val_encoding = E_EXTERNAL;
2306
62.9k
        e.val_dat = (void *)DS_BB;
2307
2308
62.9k
        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2309
62.9k
                                             E_BYTE_ARRAY, (void *)&e,
2310
62.9k
                                             fd->version, &fd->vv);
2311
62.9k
        if (!h->codecs[DS_BB]) goto_err;
2312
62.9k
    } else {
2313
0
        h->codecs[DS_BB] = NULL;
2314
0
    }
2315
2316
    //fprintf(stderr, "=== BS ===\n");
2317
62.9k
    h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
2318
62.9k
                                         c->stats[DS_BS], E_BYTE, NULL,
2319
62.9k
                                         fd->version, &fd->vv);
2320
62.9k
    if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
2321
2322
62.9k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
2323
0
        h->codecs[DS_TL] = NULL;
2324
0
        h->codecs[DS_RI] = NULL;
2325
0
        h->codecs[DS_RS] = NULL;
2326
0
        h->codecs[DS_PD] = NULL;
2327
0
        h->codecs[DS_HC] = NULL;
2328
0
        h->codecs[DS_SC] = NULL;
2329
2330
        //fprintf(stderr, "=== TC ===\n");
2331
0
        h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
2332
0
                                             c->stats[DS_TC], E_BYTE, NULL,
2333
0
                                             fd->version, &fd->vv);
2334
0
        if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
2335
2336
        //fprintf(stderr, "=== TN ===\n");
2337
0
        h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
2338
0
                                             c->stats[DS_TN], E_INT, NULL,
2339
0
                                             fd->version, &fd->vv);
2340
0
        if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
2341
62.9k
    } else {
2342
62.9k
        h->codecs[DS_TC] = NULL;
2343
62.9k
        h->codecs[DS_TN] = NULL;
2344
2345
        //fprintf(stderr, "=== TL ===\n");
2346
62.9k
        h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
2347
62.9k
                                             c->stats[DS_TL], E_INT, NULL,
2348
62.9k
                                             fd->version, &fd->vv);
2349
62.9k
        if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
2350
2351
2352
        //fprintf(stderr, "=== RI ===\n");
2353
62.9k
        h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
2354
62.9k
                                             c->stats[DS_RI], E_INT, NULL,
2355
62.9k
                                             fd->version, &fd->vv);
2356
62.9k
        if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
2357
2358
        //fprintf(stderr, "=== RS ===\n");
2359
62.9k
        h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
2360
62.9k
                                             c->stats[DS_RS], E_INT, NULL,
2361
62.9k
                                             fd->version, &fd->vv);
2362
62.9k
        if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
2363
2364
        //fprintf(stderr, "=== PD ===\n");
2365
62.9k
        h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
2366
62.9k
                                             c->stats[DS_PD], E_INT, NULL,
2367
62.9k
                                             fd->version, &fd->vv);
2368
62.9k
        if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
2369
2370
        //fprintf(stderr, "=== HC ===\n");
2371
62.9k
        h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
2372
62.9k
                                             c->stats[DS_HC], E_INT, NULL,
2373
62.9k
                                             fd->version, &fd->vv);
2374
62.9k
        if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
2375
2376
        //fprintf(stderr, "=== SC ===\n");
2377
62.9k
        if (1) {
2378
62.9k
            int i2[2] = {0, DS_SC};
2379
2380
62.9k
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2381
62.9k
                                                 E_BYTE_ARRAY, (void *)i2,
2382
62.9k
                                                 fd->version, &fd->vv);
2383
62.9k
        } else {
2384
            // Appears to be no practical benefit to using this method,
2385
            // but it may work better if we start mixing SC, IN and BB
2386
            // elements into the same external block.
2387
0
            cram_byte_array_len_encoder e;
2388
2389
0
            e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2390
0
                ? E_VARINT_UNSIGNED
2391
0
                : E_EXTERNAL;
2392
0
            e.len_dat = (void *)DS_SC_len;
2393
2394
0
            e.val_encoding = E_EXTERNAL;
2395
0
            e.val_dat = (void *)DS_SC;
2396
2397
0
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2398
0
                                                 E_BYTE_ARRAY, (void *)&e,
2399
0
                                                 fd->version, &fd->vv);
2400
0
        }
2401
62.9k
        if (!h->codecs[DS_SC]) goto_err;
2402
62.9k
    }
2403
2404
    //fprintf(stderr, "=== IN ===\n");
2405
62.9k
    {
2406
62.9k
        int i2[2] = {0, DS_IN};
2407
62.9k
        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2408
62.9k
                                             E_BYTE_ARRAY, (void *)i2,
2409
62.9k
                                             fd->version, &fd->vv);
2410
62.9k
        if (!h->codecs[DS_IN]) goto_err;
2411
62.9k
    }
2412
2413
62.9k
    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
2414
62.9k
                                         (void *)DS_QS,
2415
62.9k
                                         fd->version, &fd->vv);
2416
62.9k
    if (!h->codecs[DS_QS]) goto_err;
2417
62.9k
    {
2418
62.9k
        int i2[2] = {0, DS_RN};
2419
62.9k
        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2420
62.9k
                                             E_BYTE_ARRAY, (void *)i2,
2421
62.9k
                                             fd->version, &fd->vv);
2422
62.9k
        if (!h->codecs[DS_RN]) goto_err;
2423
62.9k
    }
2424
2425
2426
    /* Encode slices */
2427
125k
    for (i = 0; i < c->curr_slice; i++) {
2428
62.9k
        hts_log_info("Encode slice %d", i);
2429
2430
62.9k
        int local_embed_ref =
2431
62.9k
            embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
2432
62.9k
        if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
2433
121
            return -1;
2434
62.9k
    }
2435
2436
    /* Create compression header */
2437
62.8k
    {
2438
62.8k
        h->ref_seq_id    = c->ref_seq_id;
2439
62.8k
        h->ref_seq_start = c->ref_seq_start;
2440
62.8k
        h->ref_seq_span  = c->ref_seq_span;
2441
62.8k
        h->num_records   = c->num_records;
2442
62.8k
        h->qs_seq_orient = c->qs_seq_orient;
2443
        // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
2444
62.8k
        h->AP_delta      = c->pos_sorted;
2445
62.8k
        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
2446
2447
62.8k
        if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
2448
0
            return -1;
2449
62.8k
    }
2450
2451
    /* Compute landmarks */
2452
    /* Fill out slice landmarks */
2453
62.8k
    c->num_landmarks = c->curr_slice;
2454
62.8k
    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
2455
62.8k
    if (!c->landmark)
2456
0
        return -1;
2457
2458
    /*
2459
     * Slice offset starts after the first block, so we need to simulate
2460
     * writing it to work out the correct offset
2461
     */
2462
62.8k
    {
2463
62.8k
        slice_offset = c_hdr->method == RAW
2464
62.8k
            ? c_hdr->uncomp_size
2465
62.8k
            : c_hdr->comp_size;
2466
62.8k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2467
62.8k
            fd->vv.varint_size(c_hdr->content_id) +
2468
62.8k
            fd->vv.varint_size(c_hdr->comp_size) +
2469
62.8k
            fd->vv.varint_size(c_hdr->uncomp_size);
2470
62.8k
    }
2471
2472
62.8k
    c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
2473
62.8k
    if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
2474
        // Spec states span=0, but it broke our range queries.
2475
        // See commit message for this and prior.
2476
32.3k
        c->ref_seq_start = 0;
2477
32.3k
        c->ref_seq_span  = 0;
2478
32.3k
    } else {
2479
30.4k
        c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
2480
30.4k
        c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
2481
30.4k
    }
2482
125k
    for (i = 0; i < c->curr_slice; i++) {
2483
62.8k
        cram_slice *s = c->slices[i];
2484
2485
62.8k
        c->num_blocks += s->hdr->num_blocks + 1; // slice header
2486
62.8k
        c->landmark[i] = slice_offset;
2487
2488
62.8k
        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
2489
62.8k
            c->ref_seq_start + c->ref_seq_span) {
2490
0
            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
2491
0
                - c->ref_seq_start;
2492
0
        }
2493
2494
62.8k
        slice_offset += s->hdr_block->method == RAW
2495
62.8k
            ? s->hdr_block->uncomp_size
2496
62.8k
            : s->hdr_block->comp_size;
2497
2498
62.8k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2499
62.8k
            fd->vv.varint_size(s->hdr_block->content_id) +
2500
62.8k
            fd->vv.varint_size(s->hdr_block->comp_size) +
2501
62.8k
            fd->vv.varint_size(s->hdr_block->uncomp_size);
2502
2503
459k
        for (j = 0; j < s->hdr->num_blocks; j++) {
2504
396k
            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2505
396k
                fd->vv.varint_size(s->block[j]->content_id) +
2506
396k
                fd->vv.varint_size(s->block[j]->comp_size) +
2507
396k
                fd->vv.varint_size(s->block[j]->uncomp_size);
2508
2509
396k
            slice_offset += s->block[j]->method == RAW
2510
396k
                ? s->block[j]->uncomp_size
2511
396k
                : s->block[j]->comp_size;
2512
396k
        }
2513
62.8k
    }
2514
62.8k
    c->length += slice_offset; // just past the final slice
2515
2516
62.8k
    c->comp_hdr_block = c_hdr;
2517
2518
62.8k
    if (c->ref_seq_id >= 0) {
2519
29.5k
        if (c->ref_free) {
2520
29.3k
            free(c->ref);
2521
29.3k
            c->ref = NULL;
2522
29.3k
        } else {
2523
144
            cram_ref_decr(fd->refs, c->ref_seq_id);
2524
144
        }
2525
29.5k
    }
2526
2527
    /* Cache references up-front if we have unsorted access patterns */
2528
62.8k
    if (!no_ref && c->refs_used) {
2529
48
        for (i = 0; i < fd->refs->nref; i++) {
2530
24
            if (c->refs_used[i])
2531
0
                cram_ref_decr(fd->refs, i);
2532
24
        }
2533
24
    }
2534
2535
62.8k
    return 0;
2536
2537
147
 err:
2538
147
    return -1;
2539
62.8k
}
2540
2541
2542
/*
2543
 * Adds a feature code to a read within a slice. For purposes of minimising
2544
 * memory allocations and fragmentation we have one array of features for all
2545
 * reads within the slice. We return the index into this array for this new
2546
 * feature.
2547
 *
2548
 * Returns feature index on success
2549
 *         -1 on failure.
2550
 */
2551
static int cram_add_feature(cram_container *c, cram_slice *s,
2552
184k
                            cram_record *r, cram_feature *f) {
2553
184k
    if (s->nfeatures >= s->afeatures) {
2554
28.3k
        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
2555
28.3k
        s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
2556
28.3k
        if (!s->features)
2557
0
            return -1;
2558
28.3k
    }
2559
2560
184k
    if (!r->nfeature++) {
2561
82.6k
        r->feature = s->nfeatures;
2562
82.6k
        if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2563
0
            return -1;
2564
102k
    } else {
2565
102k
        if (cram_stats_add(c->stats[DS_FP],
2566
102k
                           f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2567
0
            return -1;
2568
2569
102k
    }
2570
184k
    if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2571
0
        return -1;
2572
2573
184k
    s->features[s->nfeatures++] = *f;
2574
2575
184k
    return 0;
2576
184k
}
2577
2578
static int cram_add_substitution(cram_fd *fd, cram_container *c,
2579
                                 cram_slice *s, cram_record *r,
2580
2.80k
                                 int pos, char base, char qual, char ref) {
2581
2.80k
    cram_feature f;
2582
2583
    // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2584
2.80k
    if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2585
2.24k
        f.X.pos = pos+1;
2586
2.24k
        f.X.code = 'X';
2587
2.24k
        f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2588
2.24k
        if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2589
0
            return -1;
2590
2.24k
    } else {
2591
560
        f.B.pos = pos+1;
2592
560
        f.B.code = 'B';
2593
560
        f.B.base = base;
2594
560
        f.B.qual = qual;
2595
560
        if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2596
560
        if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2597
560
        BLOCK_APPEND_CHAR(s->qual_blk, qual);
2598
560
    }
2599
2.80k
    return cram_add_feature(c, s, r, &f);
2600
2601
0
 block_err:
2602
0
    return -1;
2603
2.80k
}
2604
2605
static int cram_add_bases(cram_fd *fd, cram_container *c,
2606
                          cram_slice *s, cram_record *r,
2607
642
                          int pos, int len, char *base) {
2608
642
    cram_feature f;
2609
2610
642
    f.b.pos = pos+1;
2611
642
    f.b.code = 'b';
2612
642
    f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2613
642
    f.b.len = len;
2614
2615
642
    return cram_add_feature(c, s, r, &f);
2616
642
}
2617
2618
static int cram_add_base(cram_fd *fd, cram_container *c,
2619
                         cram_slice *s, cram_record *r,
2620
1.97k
                         int pos, char base, char qual) {
2621
1.97k
    cram_feature f;
2622
1.97k
    f.B.pos = pos+1;
2623
1.97k
    f.B.code = 'B';
2624
1.97k
    f.B.base = base;
2625
1.97k
    f.B.qual = qual;
2626
1.97k
    if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2627
1.97k
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2628
1.97k
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2629
1.97k
    return cram_add_feature(c, s, r, &f);
2630
2631
0
 block_err:
2632
0
    return -1;
2633
1.97k
}
2634
2635
static int cram_add_quality(cram_fd *fd, cram_container *c,
2636
                            cram_slice *s, cram_record *r,
2637
97
                            int pos, char qual) {
2638
97
    cram_feature f;
2639
97
    f.Q.pos = pos+1;
2640
97
    f.Q.code = 'Q';
2641
97
    f.Q.qual = qual;
2642
97
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2643
97
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2644
97
    return cram_add_feature(c, s, r, &f);
2645
2646
0
 block_err:
2647
0
    return -1;
2648
97
}
2649
2650
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2651
59.3k
                             int pos, int len, char *base) {
2652
59.3k
    cram_feature f;
2653
59.3k
    f.D.pos = pos+1;
2654
59.3k
    f.D.code = 'D';
2655
59.3k
    f.D.len = len;
2656
59.3k
    if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2657
59.3k
    return cram_add_feature(c, s, r, &f);
2658
59.3k
}
2659
2660
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2661
77.0k
                             int pos, int len, char *base, int version) {
2662
77.0k
    cram_feature f;
2663
77.0k
    f.S.pos = pos+1;
2664
77.0k
    f.S.code = 'S';
2665
77.0k
    f.S.len = len;
2666
77.0k
    switch (CRAM_MAJOR_VERS(version)) {
2667
0
    case 1:
2668
0
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2669
0
        BLOCK_APPEND(s->base_blk, base, len);
2670
0
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2671
0
        break;
2672
2673
0
    case 2:
2674
77.0k
    default:
2675
77.0k
        f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2676
77.0k
        if (base) {
2677
10.7k
            BLOCK_APPEND(s->soft_blk, base, len);
2678
66.2k
        } else {
2679
66.2k
            int i;
2680
347k
            for (i = 0; i < len; i++)
2681
281k
                BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2682
66.2k
        }
2683
77.0k
        BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2684
77.0k
        break;
2685
2686
        //default:
2687
        //    // v3.0 onwards uses BB data-series
2688
        //    f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2689
77.0k
    }
2690
77.0k
    return cram_add_feature(c, s, r, &f);
2691
2692
0
 block_err:
2693
0
    return -1;
2694
77.0k
}
2695
2696
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2697
28.7k
                             int pos, int len, char *base) {
2698
28.7k
    cram_feature f;
2699
28.7k
    f.S.pos = pos+1;
2700
28.7k
    f.S.code = 'H';
2701
28.7k
    f.S.len = len;
2702
28.7k
    if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2703
28.7k
    return cram_add_feature(c, s, r, &f);
2704
28.7k
}
2705
2706
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2707
713
                         int pos, int len, char *base) {
2708
713
    cram_feature f;
2709
713
    f.S.pos = pos+1;
2710
713
    f.S.code = 'N';
2711
713
    f.S.len = len;
2712
713
    if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2713
713
    return cram_add_feature(c, s, r, &f);
2714
713
}
2715
2716
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2717
585
                        int pos, int len, char *base) {
2718
585
    cram_feature f;
2719
585
    f.S.pos = pos+1;
2720
585
    f.S.code = 'P';
2721
585
    f.S.len = len;
2722
585
    if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2723
585
    return cram_add_feature(c, s, r, &f);
2724
585
}
2725
2726
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2727
12.7k
                              int pos, int len, char *base) {
2728
12.7k
    cram_feature f;
2729
12.7k
    f.I.pos = pos+1;
2730
12.7k
    if (len == 1) {
2731
12.0k
        char b = base ? *base : 'N';
2732
12.0k
        f.i.code = 'i';
2733
12.0k
        f.i.base = b;
2734
12.0k
        if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2735
12.0k
    } else {
2736
695
        f.I.code = 'I';
2737
695
        f.I.len = len;
2738
695
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2739
695
        if (base) {
2740
143
            BLOCK_APPEND(s->base_blk, base, len);
2741
552
        } else {
2742
552
            int i;
2743
14.3M
            for (i = 0; i < len; i++)
2744
14.3M
                BLOCK_APPEND_CHAR(s->base_blk, 'N');
2745
552
        }
2746
695
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2747
695
    }
2748
12.7k
    return cram_add_feature(c, s, r, &f);
2749
2750
0
 block_err:
2751
0
    return -1;
2752
12.7k
}
2753
2754
/*
2755
 * Encodes auxiliary data. Largely duplicated from above, but done so to
2756
 * keep it simple and avoid a myriad of version ifs.
2757
 *
2758
 * Returns the RG header line pointed to by the BAM aux fields on success,
2759
 *         NULL on failure or no rg present, also sets "*err" to non-zero
2760
 */
2761
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
2762
                                      cram_container *c,
2763
                                      cram_slice *s, cram_record *cr,
2764
                                      int verbatim_NM, int verbatim_MD,
2765
                                      int NM, kstring_t *MD, int cf_tag,
2766
6.94M
                                      int no_ref, int *err) {
2767
6.94M
    char *aux, *orig;
2768
6.94M
    sam_hrec_rg_t *brg = NULL;
2769
6.94M
    int aux_size = bam_get_l_aux(b);
2770
6.94M
    const char *aux_end = bam_data_end(b);
2771
6.94M
    cram_block *td_b = c->comp_hdr->TD_blk;
2772
6.94M
    int TD_blk_size = BLOCK_SIZE(td_b), new;
2773
6.94M
    char *key;
2774
6.94M
    khint_t k;
2775
2776
6.94M
    if (err) *err = 1;
2777
2778
6.94M
    orig = aux = (char *)bam_aux(b);
2779
2780
2781
    // cF:i  => Extra CRAM bit flags.
2782
    // 1:  Don't auto-decode MD (may be invalid)
2783
    // 2:  Don't auto-decode NM (may be invalid)
2784
6.94M
    if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
2785
        // Temporary copy of aux so we can ammend it.
2786
78.9k
        aux = malloc(aux_size+4);
2787
78.9k
        if (!aux)
2788
0
            return NULL;
2789
2790
78.9k
        memcpy(aux, orig, aux_size);
2791
78.9k
        aux[aux_size++] = 'c';
2792
78.9k
        aux[aux_size++] = 'F';
2793
78.9k
        aux[aux_size++] = 'C';
2794
78.9k
        aux[aux_size++] = cf_tag;
2795
78.9k
        orig = aux;
2796
78.9k
        aux_end = aux + aux_size;
2797
78.9k
    }
2798
2799
    // Copy aux keys to td_b and aux values to slice aux blocks
2800
7.63M
    while (aux_end - aux >= 1 && aux[0] != 0) {
2801
693k
        int r;
2802
2803
        // Room for code + type + at least 1 byte of data
2804
693k
        if (aux - orig >= aux_size - 3)
2805
6
            goto err;
2806
2807
        // RG:Z
2808
693k
        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2809
691
            char *rg = &aux[3];
2810
691
            aux = rg;
2811
63.4k
            while (aux < aux_end && *aux++);
2812
691
            if (aux == aux_end && aux[-1] != '\0') {
2813
2
                hts_log_error("Unterminated RG:Z tag for read \"%s\"",
2814
2
                              bam_get_qname(b));
2815
2
                goto err;
2816
2
            }
2817
689
            brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
2818
689
            if (brg) {
2819
190
                if (CRAM_MAJOR_VERS(fd->version) >= 4)
2820
0
                    BLOCK_APPEND(td_b, "RG*", 3);
2821
190
                continue;
2822
499
            } else {
2823
                // RG:Z tag will be stored verbatim
2824
499
                hts_log_warning("Missing @RG header for RG \"%s\"", rg);
2825
499
                aux = rg - 3;
2826
499
            }
2827
689
        }
2828
2829
        // MD:Z
2830
693k
        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2831
51.8k
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2832
13.4k
                if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2833
8.85k
                    while (aux < aux_end && *aux++);
2834
1.47k
                    if (aux == aux_end && aux[-1] != '\0') {
2835
0
                        hts_log_error("Unterminated MD:Z tag for read \"%s\"",
2836
0
                                      bam_get_qname(b));
2837
0
                        goto err;
2838
0
                    }
2839
1.47k
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2840
0
                        BLOCK_APPEND(td_b, "MD*", 3);
2841
1.47k
                    continue;
2842
1.47k
                }
2843
13.4k
            }
2844
51.8k
        }
2845
2846
        // NM:i
2847
691k
        if (aux[0] == 'N' && aux[1] == 'M') {
2848
201
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2849
4
                int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
2850
4
                if (NM_ == NM) {
2851
0
                    switch(aux[2]) {
2852
0
                    case 'A': case 'C': case 'c': aux+=4; break;
2853
0
                    case 'S': case 's':           aux+=5; break;
2854
0
                    case 'I': case 'i': case 'f': aux+=7; break;
2855
0
                    default:
2856
0
                        hts_log_error("Unhandled type code for NM tag");
2857
0
                        goto err;
2858
0
                    }
2859
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2860
0
                        BLOCK_APPEND(td_b, "NM*", 3);
2861
0
                    continue;
2862
0
                }
2863
4
            }
2864
201
        }
2865
2866
691k
        BLOCK_APPEND(td_b, aux, 3);
2867
2868
        // Container level tags_used, for TD series
2869
        // Maps integer key ('X0i') to cram_tag_map struct.
2870
691k
        int key = (((unsigned char *) aux)[0]<<16 |
2871
691k
                   ((unsigned char *) aux)[1]<<8  |
2872
691k
                   ((unsigned char *) aux)[2]);
2873
691k
        k = kh_put(m_tagmap, c->tags_used, key, &r);
2874
691k
        if (-1 == r)
2875
0
            goto err;
2876
691k
        else if (r != 0)
2877
189k
            kh_val(c->tags_used, k) = NULL;
2878
2879
691k
        if (r == 1) {
2880
189k
            khint_t k_global;
2881
2882
            // Global tags_used for cram_metrics support
2883
189k
            pthread_mutex_lock(&fd->metrics_lock);
2884
189k
            k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2885
189k
            if (-1 == r) {
2886
0
                pthread_mutex_unlock(&fd->metrics_lock);
2887
0
                goto err;
2888
0
            }
2889
189k
            if (r >= 1) {
2890
11.0k
                kh_val(fd->tags_used, k_global) = cram_new_metrics();
2891
11.0k
                if (!kh_val(fd->tags_used, k_global)) {
2892
0
                    kh_del(m_metrics, fd->tags_used, k_global);
2893
0
                    pthread_mutex_unlock(&fd->metrics_lock);
2894
0
                    goto err;
2895
0
                }
2896
11.0k
            }
2897
2898
189k
            pthread_mutex_unlock(&fd->metrics_lock);
2899
2900
189k
            int i2[2] = {'\t',key};
2901
189k
            size_t sk = key;
2902
189k
            cram_tag_map *m = calloc(1, sizeof(*m));
2903
189k
            if (!m)
2904
0
                goto_err;
2905
189k
            kh_val(c->tags_used, k) = m;
2906
2907
189k
            cram_codec *c;
2908
2909
            // Use a block content id based on the tag id.
2910
            // Codec type depends on tag data type.
2911
189k
            switch(aux[2]) {
2912
74.9k
            case 'Z': case 'H':
2913
                // string as byte_array_stop
2914
74.9k
                c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2915
74.9k
                                      E_BYTE_ARRAY, (void *)i2,
2916
74.9k
                                      fd->version, &fd->vv);
2917
74.9k
                break;
2918
2919
96.8k
            case 'A': case 'c': case 'C': {
2920
                // byte array len, 1 byte
2921
96.8k
                cram_byte_array_len_encoder e;
2922
96.8k
                cram_stats st;
2923
2924
96.8k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2925
96.8k
                    e.len_encoding = E_HUFFMAN;
2926
96.8k
                    e.len_dat = NULL; // will get codes from st
2927
96.8k
                } else {
2928
0
                    e.len_encoding = E_CONST_INT;
2929
0
                    e.len_dat = NULL; // will get codes from st
2930
0
                }
2931
96.8k
                memset(&st, 0, sizeof(st));
2932
96.8k
                if (cram_stats_add(&st, 1) < 0) goto block_err;
2933
96.8k
                cram_stats_encoding(fd, &st);
2934
2935
96.8k
                e.val_encoding = E_EXTERNAL;
2936
96.8k
                e.val_dat = (void *)sk;
2937
2938
96.8k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2939
96.8k
                                      E_BYTE_ARRAY, (void *)&e,
2940
96.8k
                                      fd->version, &fd->vv);
2941
96.8k
                break;
2942
96.8k
            }
2943
2944
4.72k
            case 's': case 'S': {
2945
                // byte array len, 2 byte
2946
4.72k
                cram_byte_array_len_encoder e;
2947
4.72k
                cram_stats st;
2948
2949
4.72k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2950
4.72k
                    e.len_encoding = E_HUFFMAN;
2951
4.72k
                    e.len_dat = NULL; // will get codes from st
2952
4.72k
                } else {
2953
0
                    e.len_encoding = E_CONST_INT;
2954
0
                    e.len_dat = NULL; // will get codes from st
2955
0
                }
2956
4.72k
                memset(&st, 0, sizeof(st));
2957
4.72k
                if (cram_stats_add(&st, 2) < 0) goto block_err;
2958
4.72k
                cram_stats_encoding(fd, &st);
2959
2960
4.72k
                e.val_encoding = E_EXTERNAL;
2961
4.72k
                e.val_dat = (void *)sk;
2962
2963
4.72k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2964
4.72k
                                      E_BYTE_ARRAY, (void *)&e,
2965
4.72k
                                      fd->version, &fd->vv);
2966
4.72k
                break;
2967
4.72k
            }
2968
11.6k
            case 'i': case 'I': case 'f': {
2969
                // byte array len, 4 byte
2970
11.6k
                cram_byte_array_len_encoder e;
2971
11.6k
                cram_stats st;
2972
2973
11.6k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2974
11.6k
                    e.len_encoding = E_HUFFMAN;
2975
11.6k
                    e.len_dat = NULL; // will get codes from st
2976
11.6k
                } else {
2977
0
                    e.len_encoding = E_CONST_INT;
2978
0
                    e.len_dat = NULL; // will get codes from st
2979
0
                }
2980
11.6k
                memset(&st, 0, sizeof(st));
2981
11.6k
                if (cram_stats_add(&st, 4) < 0) goto block_err;
2982
11.6k
                cram_stats_encoding(fd, &st);
2983
2984
11.6k
                e.val_encoding = E_EXTERNAL;
2985
11.6k
                e.val_dat = (void *)sk;
2986
2987
11.6k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2988
11.6k
                                      E_BYTE_ARRAY, (void *)&e,
2989
11.6k
                                      fd->version, &fd->vv);
2990
11.6k
                break;
2991
11.6k
            }
2992
2993
1.73k
            case 'B': {
2994
                // Byte array of variable size, but we generate our tag
2995
                // byte stream at the wrong stage (during reading and not
2996
                // after slice header construction). So we use
2997
                // BYTE_ARRAY_LEN with the length codec being external
2998
                // too.
2999
1.73k
                cram_byte_array_len_encoder e;
3000
3001
1.73k
                e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
3002
1.73k
                    ? E_VARINT_UNSIGNED
3003
1.73k
                    : E_EXTERNAL;
3004
1.73k
                e.len_dat = (void *)sk; // or key+128 for len?
3005
3006
1.73k
                e.val_encoding = E_EXTERNAL;
3007
1.73k
                e.val_dat = (void *)sk;
3008
3009
1.73k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
3010
1.73k
                                      E_BYTE_ARRAY, (void *)&e,
3011
1.73k
                                      fd->version, &fd->vv);
3012
1.73k
                break;
3013
11.6k
            }
3014
3015
105
            default:
3016
105
                hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
3017
105
                c = NULL;
3018
189k
            }
3019
3020
189k
            if (!c)
3021
105
                goto_err;
3022
3023
189k
            m->codec = c;
3024
3025
            // Link to fd-global tag metrics
3026
189k
            pthread_mutex_lock(&fd->metrics_lock);
3027
189k
            m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
3028
189k
            pthread_mutex_unlock(&fd->metrics_lock);
3029
189k
        }
3030
3031
691k
        cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3032
691k
        if (!tm) goto_err;
3033
691k
        cram_codec *codec = tm->codec;
3034
691k
        if (!tm->codec) goto_err;
3035
3036
691k
        switch(aux[2]) {
3037
336k
        case 'A': case 'C': case 'c':
3038
336k
            if (aux_end - aux < 3+1)
3039
0
                goto err;
3040
3041
336k
            if (!tm->blk) {
3042
96.8k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3043
0
                    goto err;
3044
96.8k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3045
96.8k
            }
3046
3047
336k
            aux+=3;
3048
            //codec->encode(s, codec, aux, 1);
3049
            // Functionally equivalent, but less code.
3050
336k
            BLOCK_APPEND_CHAR(tm->blk, *aux);
3051
336k
            aux++;
3052
336k
            break;
3053
3054
25.8k
        case 'S': case 's':
3055
25.8k
            if (aux_end - aux < 3+2)
3056
3
                goto err;
3057
3058
25.8k
            if (!tm->blk) {
3059
4.72k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3060
0
                    goto err;
3061
4.72k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3062
4.72k
            }
3063
3064
25.8k
            aux+=3;
3065
            //codec->encode(s, codec, aux, 2);
3066
25.8k
            BLOCK_APPEND(tm->blk, aux, 2);
3067
25.8k
            aux+=2;
3068
25.8k
            break;
3069
3070
73.3k
        case 'I': case 'i': case 'f':
3071
73.3k
            if (aux_end - aux < 3+4)
3072
5
                goto err;
3073
3074
73.3k
            if (!tm->blk) {
3075
11.6k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3076
0
                    goto err;
3077
11.6k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3078
11.6k
            }
3079
3080
73.3k
            aux+=3;
3081
            //codec->encode(s, codec, aux, 4);
3082
73.3k
            BLOCK_APPEND(tm->blk, aux, 4);
3083
73.3k
            aux+=4;
3084
73.3k
            break;
3085
3086
0
        case 'd':
3087
0
            if (aux_end - aux < 3+8)
3088
0
                goto err;
3089
3090
0
            if (!tm->blk) {
3091
0
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3092
0
                    goto err;
3093
0
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3094
0
            }
3095
3096
0
            aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
3097
            //codec->encode(s, codec, aux, 8);
3098
0
            BLOCK_APPEND(tm->blk, aux, 8);
3099
0
            aux+=8;
3100
0
            break;
3101
3102
216k
        case 'Z': case 'H': {
3103
216k
            if (aux_end - aux < 3)
3104
0
                goto err;
3105
3106
216k
            if (!tm->blk) {
3107
74.9k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3108
0
                    goto err;
3109
74.9k
                codec->out = tm->blk;
3110
74.9k
            }
3111
3112
216k
            char *aux_s;
3113
216k
            aux += 3;
3114
216k
            aux_s = aux;
3115
13.0M
            while (aux < aux_end && *aux++);
3116
216k
            if (aux == aux_end && aux[-1] != '\0') {
3117
1
                hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
3118
1
                              aux_s[-3], aux_s[-2], aux_s[-1],
3119
1
                              bam_get_qname(b));
3120
1
                goto err;
3121
1
            }
3122
216k
            if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
3123
0
                goto err;
3124
216k
            break;
3125
216k
        }
3126
3127
216k
        case 'B': {
3128
38.8k
            if (aux_end - aux < 4+4)
3129
4
                goto err;
3130
3131
38.8k
            int type = aux[3];
3132
38.8k
            uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
3133
38.8k
                              ((uint64_t)((unsigned char *)aux)[5]) << 8 |
3134
38.8k
                              ((uint64_t)((unsigned char *)aux)[6]) <<16 |
3135
38.8k
                              ((uint64_t)((unsigned char *)aux)[7]) <<24);
3136
38.8k
            uint64_t blen;
3137
38.8k
            if (!tm->blk) {
3138
1.73k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3139
0
                    goto err;
3140
1.73k
                if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
3141
0
                    if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
3142
0
                        goto err;
3143
0
                    codec->u.e_byte_array_len.len_codec->out = tm->blk2;
3144
0
                    codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
3145
1.73k
                } else {
3146
1.73k
                    codec->u.e_byte_array_len.len_codec->out = tm->blk;
3147
1.73k
                    codec->u.e_byte_array_len.val_codec->out = tm->blk;
3148
1.73k
                }
3149
1.73k
            }
3150
3151
            // skip TN field
3152
38.8k
            aux+=3;
3153
3154
            // We use BYTE_ARRAY_LEN with external length, so store that first
3155
38.8k
            switch (type) {
3156
11.3k
            case 'c': case 'C':
3157
11.3k
                blen = count;
3158
11.3k
                break;
3159
4.65k
            case 's': case 'S':
3160
4.65k
                blen = 2*count;
3161
4.65k
                break;
3162
22.8k
            case 'i': case 'I': case 'f':
3163
22.8k
                blen = 4*count;
3164
22.8k
                break;
3165
19
            default:
3166
19
                hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
3167
19
                goto err;
3168
38.8k
            }
3169
3170
38.8k
            blen += 5; // sub-type & length
3171
38.8k
            if (aux_end - aux < blen || blen > INT_MAX)
3172
8
                goto err;
3173
3174
38.8k
            if (codec->encode(s, codec, aux, (int) blen) < 0)
3175
0
                goto err;
3176
38.8k
            aux += blen;
3177
38.8k
            break;
3178
38.8k
        }
3179
0
        default:
3180
0
            hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
3181
0
            goto err;
3182
691k
        }
3183
691k
        tm->blk->m = tm->m;
3184
691k
    }
3185
3186
    // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
3187
3188
    // And and increment TD hash entry
3189
6.94M
    BLOCK_APPEND_CHAR(td_b, 0);
3190
3191
    // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
3192
6.94M
    key = string_ndup(c->comp_hdr->TD_keys,
3193
6.94M
                      (char *)BLOCK_DATA(td_b) + TD_blk_size,
3194
6.94M
                      BLOCK_SIZE(td_b) - TD_blk_size);
3195
6.94M
    if (!key)
3196
0
        goto block_err;
3197
6.94M
    k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
3198
6.94M
    if (new < 0) {
3199
0
        goto err;
3200
6.94M
    } else if (new == 0) {
3201
6.86M
        BLOCK_SIZE(td_b) = TD_blk_size;
3202
6.86M
    } else {
3203
81.7k
        kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
3204
81.7k
        c->comp_hdr->nTL++;
3205
81.7k
    }
3206
3207
6.94M
    cr->TL = kh_val(c->comp_hdr->TD_hash, k);
3208
6.94M
    if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
3209
0
        goto block_err;
3210
3211
6.94M
    if (orig != (char *)bam_aux(b))
3212
78.9k
        free(orig);
3213
3214
6.94M
    if (err) *err = 0;
3215
3216
6.94M
    return brg;
3217
3218
153
 err:
3219
153
 block_err:
3220
153
    if (orig != (char *)bam_aux(b))
3221
28
        free(orig);
3222
153
    return NULL;
3223
153
}
3224
3225
/*
3226
 * During cram_next_container or before the final flush at end of
3227
 * file, we update the current slice headers and increment the slice
3228
 * number to the next slice.
3229
 *
3230
 * See cram_next_container() and cram_close().
3231
 */
3232
63.4k
void cram_update_curr_slice(cram_container *c, int version) {
3233
63.4k
    cram_slice *s = c->slice;
3234
63.4k
    if (c->multi_seq) {
3235
975
        s->hdr->ref_seq_id    = -2;
3236
975
        s->hdr->ref_seq_start = 0;
3237
975
        s->hdr->ref_seq_span  = 0;
3238
62.5k
    } else if (c->curr_ref == -1 && CRAM_ge31(version)) {
3239
        // Spec states span=0, but it broke our range queries.
3240
        // See commit message for this and prior.
3241
32.6k
        s->hdr->ref_seq_id    = -1;
3242
32.6k
        s->hdr->ref_seq_start = 0;
3243
32.6k
        s->hdr->ref_seq_span  = 0;
3244
32.6k
    } else {
3245
29.9k
        s->hdr->ref_seq_id    = c->curr_ref;
3246
29.9k
        s->hdr->ref_seq_start = c->first_base;
3247
29.9k
        s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
3248
29.9k
    }
3249
63.4k
    s->hdr->num_records   = c->curr_rec;
3250
3251
63.4k
    if (c->curr_slice == 0) {
3252
63.4k
        if (c->ref_seq_id != s->hdr->ref_seq_id)
3253
36.0k
            c->ref_seq_id  = s->hdr->ref_seq_id;
3254
63.4k
        c->ref_seq_start = c->first_base;
3255
63.4k
    }
3256
3257
63.4k
    c->curr_slice++;
3258
63.4k
}
3259
3260
/*
3261
 * Handles creation of a new container or new slice, flushing any
3262
 * existing containers when appropriate.
3263
 *
3264
 * Really this is next slice, which may or may not lead to a new container.
3265
 *
3266
 * Returns cram_container pointer on success
3267
 *         NULL on failure.
3268
 */
3269
63.5k
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
3270
63.5k
    cram_container *c = fd->ctr;
3271
63.5k
    int i;
3272
3273
    /* First occurrence */
3274
63.5k
    if (c->curr_ref == -2)
3275
5.74k
        c->curr_ref = bam_ref(b);
3276
3277
63.5k
    if (c->slice)
3278
57.7k
        cram_update_curr_slice(c, fd->version);
3279
3280
    /* Flush container */
3281
63.5k
    if (c->curr_slice == c->max_slice ||
3282
63.5k
        (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
3283
57.7k
        c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
3284
57.7k
        hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
3285
57.7k
                     c->ref_seq_id, c->ref_seq_start,
3286
57.7k
                     c->ref_seq_start + c->ref_seq_span -1);
3287
3288
        /* Encode slices */
3289
57.7k
        if (-1 == cram_flush_container_mt(fd, c))
3290
29
            return NULL;
3291
57.7k
        if (!fd->pool) {
3292
            // Move to sep func, as we need cram_flush_container for
3293
            // the closing phase to flush the partial container.
3294
115k
            for (i = 0; i < c->max_slice; i++) {
3295
57.7k
                cram_free_slice(c->slices[i]);
3296
57.7k
                c->slices[i] = NULL;
3297
57.7k
            }
3298
3299
57.7k
            c->slice = NULL;
3300
57.7k
            c->curr_slice = 0;
3301
3302
            /* Easy approach for purposes of freeing stats */
3303
57.7k
            cram_free_container(c);
3304
57.7k
        }
3305
3306
57.7k
        c = fd->ctr = cram_new_container(fd->seqs_per_slice,
3307
57.7k
                                         fd->slices_per_container);
3308
57.7k
        if (!c)
3309
0
            return NULL;
3310
3311
57.7k
        pthread_mutex_lock(&fd->ref_lock);
3312
57.7k
        c->no_ref = fd->no_ref;
3313
57.7k
        c->embed_ref = fd->embed_ref;
3314
57.7k
        c->record_counter = fd->record_counter;
3315
57.7k
        pthread_mutex_unlock(&fd->ref_lock);
3316
57.7k
        c->curr_ref = bam_ref(b);
3317
57.7k
    }
3318
3319
63.4k
    c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
3320
3321
    /* New slice */
3322
63.4k
    c->slice = c->slices[c->curr_slice] =
3323
63.4k
        cram_new_slice(MAPPED_SLICE, c->max_rec);
3324
63.4k
    if (!c->slice)
3325
0
        return NULL;
3326
3327
63.4k
    if (c->multi_seq) {
3328
0
        c->slice->hdr->ref_seq_id = -2;
3329
0
        c->slice->hdr->ref_seq_start = 0;
3330
0
        c->slice->last_apos = 1;
3331
63.4k
    } else {
3332
63.4k
        c->slice->hdr->ref_seq_id = bam_ref(b);
3333
        // wrong for unsorted data, will fix during encoding.
3334
63.4k
        c->slice->hdr->ref_seq_start = bam_pos(b)+1;
3335
63.4k
        c->slice->last_apos = bam_pos(b)+1;
3336
63.4k
    }
3337
3338
63.4k
    c->curr_rec = 0;
3339
63.4k
    c->s_num_bases = 0;
3340
63.4k
    c->n_mapped = 0;
3341
3342
    // QO field: 0 implies original orientation, 1 implies sequence orientation
3343
    // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
3344
    // Conversely other data sets it's often better than 1% saving for 0.
3345
    // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
3346
63.4k
    c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
3347
3348
63.4k
    return c;
3349
63.4k
}
3350
3351
3352
/*
3353
 * Converts a single bam record into a cram record.
3354
 * Possibly used within a thread.
3355
 *
3356
 * Returns 0 on success;
3357
 *        -1 on failure
3358
 */
3359
static int process_one_read(cram_fd *fd, cram_container *c,
3360
                            cram_slice *s, cram_record *cr,
3361
                            bam_seq_t *b, int rnum, kstring_t *MD,
3362
6.94M
                            int embed_ref, int no_ref) {
3363
6.94M
    int i, fake_qual = -1, NM = 0;
3364
6.94M
    char *cp;
3365
6.94M
    char *ref, *seq, *qual;
3366
3367
    // Any places with N in seq and/or reference can lead to ambiguous
3368
    // interpretation of the SAM NM:i tag.  So we store these verbatim
3369
    // to ensure valid data round-trips the same regardless of who
3370
    // defines it as valid.
3371
    // Similarly when alignments go beyond end of the reference.
3372
6.94M
    int verbatim_NM = fd->store_nm;
3373
6.94M
    int verbatim_MD = fd->store_md;
3374
3375
    // FIXME: multi-ref containers
3376
3377
6.94M
    cr->flags       = bam_flag(b);
3378
6.94M
    cr->len         = bam_seq_len(b);
3379
6.94M
    uint8_t *md;
3380
6.94M
    if (!(md = bam_aux_get(b, "MD")))
3381
6.89M
        MD = NULL;
3382
49.4k
    else
3383
49.4k
        MD->l = 0;
3384
3385
6.94M
    int cf_tag = 0;
3386
3387
6.94M
    if (embed_ref == 2) {
3388
79.1k
        cf_tag  = MD ? 0 : 1;                   // No MD
3389
79.1k
        cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
3390
79.1k
    }
3391
3392
    //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
3393
3394
6.94M
    ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
3395
6.94M
    cr->ref_id      = bam_ref(b);
3396
6.94M
    if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
3397
0
        goto block_err;
3398
6.94M
    if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
3399
0
        goto block_err;
3400
3401
    // Non reference based encoding means storing the bases verbatim as features, which in
3402
    // turn means every base also has a quality already stored.
3403
6.94M
    if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
3404
6.94M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3405
3406
6.94M
    if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
3407
6.62M
        cr->cram_flags |= CRAM_FLAG_NO_SEQ;
3408
    //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3409
3410
6.94M
    c->num_bases   += cr->len;
3411
6.94M
    cr->apos        = bam_pos(b)+1;
3412
6.94M
    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
3413
33
        goto err;
3414
6.94M
    if (c->pos_sorted) {
3415
6.92M
        if (cr->apos < s->last_apos && !fd->ap_delta) {
3416
384
            c->pos_sorted = 0;
3417
6.92M
        } else {
3418
6.92M
            if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
3419
0
                goto block_err;
3420
6.92M
            s->last_apos = cr->apos;
3421
6.92M
        }
3422
6.92M
    } else {
3423
        //cram_stats_add(c->stats[DS_AP], cr->apos);
3424
17.4k
    }
3425
6.94M
    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
3426
3427
    /*
3428
     * This seqs_ds is largely pointless and it could reuse the same memory
3429
     * over and over.
3430
     * s->base_blk is what we need for encoding.
3431
     */
3432
6.94M
    cr->seq         = BLOCK_SIZE(s->seqs_blk);
3433
6.94M
    cr->qual        = BLOCK_SIZE(s->qual_blk);
3434
6.94M
    BLOCK_GROW(s->seqs_blk, cr->len+1);
3435
6.94M
    BLOCK_GROW(s->qual_blk, cr->len);
3436
3437
    // Convert BAM nibble encoded sequence to string of base pairs
3438
6.94M
    seq = cp = (char *)BLOCK_END(s->seqs_blk);
3439
6.94M
    *seq = 0;
3440
6.94M
    nibble2base(bam_seq(b), cp, cr->len);
3441
6.94M
    BLOCK_SIZE(s->seqs_blk) += cr->len;
3442
3443
6.94M
    qual = cp = (char *)bam_qual(b);
3444
3445
3446
    /* Copy and parse */
3447
6.94M
    if (!(cr->flags & BAM_FUNMAP)) {
3448
83.3k
        uint32_t *cig_to, *cig_from;
3449
83.3k
        int64_t apos = cr->apos-1, spos = 0;
3450
83.3k
        int64_t MD_last = apos; // last position of edit in MD tag
3451
3452
83.3k
        if (apos < 0) {
3453
4
            hts_log_error("Mapped read with position <= 0 is disallowed");
3454
4
            return -1;
3455
4
        }
3456
3457
83.3k
        cr->cigar       = s->ncigar;
3458
83.3k
        cr->ncigar      = bam_cigar_len(b);
3459
83.6k
        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
3460
256
            s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
3461
256
            s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
3462
256
            if (!s->cigar)
3463
0
                return -1;
3464
256
        }
3465
3466
83.3k
        cig_to = (uint32_t *)s->cigar;
3467
83.3k
        cig_from = (uint32_t *)bam_cigar(b);
3468
3469
83.3k
        cr->feature = 0;
3470
83.3k
        cr->nfeature = 0;
3471
592k
        for (i = 0; i < cr->ncigar; i++) {
3472
509k
            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
3473
509k
            uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
3474
509k
            cig_to[i] = cig_from[i];
3475
3476
            /* Can also generate events from here for CRAM diffs */
3477
3478
509k
            switch (cig_op) {
3479
0
                int l;
3480
3481
                // Don't trust = and X ops to be correct.
3482
310k
            case BAM_CMATCH:
3483
328k
            case BAM_CBASE_MATCH:
3484
330k
            case BAM_CBASE_MISMATCH:
3485
                //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
3486
                //      cig_len, &ref[apos], cig_len, &seq[spos]);
3487
330k
                l = 0;
3488
330k
                if (!no_ref && cr->len) {
3489
13.9k
                    int end = cig_len+apos < c->ref_end
3490
13.9k
                        ? cig_len : c->ref_end - apos;
3491
13.9k
                    char *sp = &seq[spos];
3492
13.9k
                    char *rp = &ref[apos];
3493
13.9k
                    char *qp = &qual[spos];
3494
13.9k
                    if (end > cr->len) {
3495
1
                        hts_log_error("CIGAR and query sequence are of different length");
3496
1
                        return -1;
3497
1
                    }
3498
21.2k
                    for (l = 0; l < end; l++) {
3499
                        // This case is just too disputed and different tools
3500
                        // interpret these in different ways.  We give up and
3501
                        // store verbatim.
3502
7.23k
                        if (rp[l] == 'N' && sp[l] == 'N')
3503
209
                            verbatim_NM = verbatim_MD = 1;
3504
7.23k
                        if (rp[l] != sp[l]) {
3505
                            // Build our own MD tag if one is on the sequence, so
3506
                            // we can ensure it matches and thus can be discarded.
3507
2.80k
                            if (MD && ref) {
3508
1.32k
                                if (kputuw(apos+l - MD_last, MD) < 0) goto err;
3509
1.32k
                                if (kputc(rp[l], MD) < 0) goto err;
3510
1.32k
                                MD_last = apos+l+1;
3511
1.32k
                            }
3512
2.80k
                            NM++;
3513
2.80k
                            if (!sp[l])
3514
0
                                break;
3515
2.80k
                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
3516
#if 0
3517
                                // Disabled for the time being as it doesn't
3518
                                // seem to gain us much.
3519
                                int ol=l;
3520
                                while (l<end && rp[l] != sp[l])
3521
                                    l++;
3522
                                if (l-ol > 1) {
3523
                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
3524
                                                       l-ol, &seq[spos+ol]))
3525
                                        return -1;
3526
                                    l--;
3527
                                } else {
3528
                                    l = ol;
3529
                                    if (cram_add_substitution(fd, c, s, cr,
3530
                                                              spos+l, sp[l],
3531
                                                              qp[l], rp[l]))
3532
                                        return -1;
3533
                                }
3534
#else
3535
                                // With urmap pushed to the limit and lots
3536
                                // of unaligned data (should be soft-clipped)
3537
                                // this saves ~2-7%. Worth it?
3538
0
                                int nl = l;
3539
0
                                int max_end = nl, max_score = 0, score = 0;
3540
0
                                while (nl < end) {
3541
0
                                    if (rp[nl] != sp[nl]) {
3542
0
                                        score += 3;
3543
0
                                        if (max_score < score) {
3544
0
                                            max_score = score;
3545
0
                                            max_end = nl;
3546
0
                                        }
3547
0
                                    } else {
3548
0
                                        score--;
3549
0
                                        if (score < -2 ||
3550
0
                                            max_score - score > 7)
3551
0
                                            break;
3552
0
                                    }
3553
0
                                    nl++;
3554
0
                                }
3555
0
                                if (max_score > 20) {
3556
0
                                    cram_add_bases(fd, c, s, cr, spos+l,
3557
0
                                                   max_end-l, &seq[spos+l]);
3558
0
                                    l = max_end-1;
3559
0
                                } else {
3560
0
                                    while (l < nl) {
3561
0
                                        if (rp[l] != sp[l])
3562
0
                                            cram_add_substitution(fd, c, s,
3563
0
                                                                  cr, spos+l,
3564
0
                                                                  sp[l], qp[l],
3565
0
                                                                  rp[l]);
3566
0
                                        l++;
3567
0
                                    }
3568
0
                                    l--;
3569
0
                                }
3570
0
#endif
3571
2.80k
                            } else {
3572
2.80k
                                if (cram_add_substitution(fd, c, s, cr, spos+l,
3573
2.80k
                                                          sp[l], qp[l], rp[l]))
3574
0
                                    return -1;
3575
2.80k
                            }
3576
2.80k
                        }
3577
7.23k
                    }
3578
13.9k
                    spos += l;
3579
13.9k
                    apos += l;
3580
13.9k
                }
3581
3582
330k
                if (l < cig_len && cr->len) {
3583
2.58k
                    if (no_ref) {
3584
642
                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
3585
642
                            if (cram_add_bases(fd, c, s, cr, spos,
3586
642
                                               cig_len-l, &seq[spos]))
3587
0
                                return -1;
3588
642
                            spos += cig_len-l;
3589
642
                        } else {
3590
0
                            for (; l < cig_len && seq[spos]; l++, spos++) {
3591
0
                                if (cram_add_base(fd, c, s, cr, spos,
3592
0
                                                  seq[spos], qual[spos]))
3593
0
                                    return -1;
3594
0
                            }
3595
0
                        }
3596
1.94k
                    } else {
3597
                        /* off end of sequence or non-ref based output */
3598
1.94k
                        verbatim_NM = verbatim_MD = 1;
3599
3.91k
                        for (; l < cig_len && seq[spos]; l++, spos++) {
3600
1.97k
                            if (cram_add_base(fd, c, s, cr, spos,
3601
1.97k
                                              seq[spos], qual[spos]))
3602
0
                                return -1;
3603
1.97k
                        }
3604
1.94k
                    }
3605
2.58k
                    apos += cig_len;
3606
327k
                } else if (!cr->len) {
3607
                    /* Seq "*" */
3608
315k
                    verbatim_NM = verbatim_MD = 1;
3609
315k
                    apos += cig_len;
3610
315k
                    spos += cig_len;
3611
315k
                }
3612
330k
                break;
3613
3614
330k
            case BAM_CDEL:
3615
59.3k
                if (MD && ref) {
3616
23.2k
                    if (kputuw(apos - MD_last, MD) < 0) goto err;
3617
23.2k
                    if (apos < c->ref_end) {
3618
16.9k
                        if (kputc_('^', MD) < 0) goto err;
3619
16.9k
                        if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
3620
0
                            goto err;
3621
16.9k
                    }
3622
23.2k
                }
3623
59.3k
                NM += cig_len;
3624
3625
59.3k
                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
3626
0
                    return -1;
3627
59.3k
                apos += cig_len;
3628
59.3k
                MD_last = apos;
3629
59.3k
                break;
3630
3631
713
            case BAM_CREF_SKIP:
3632
713
                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
3633
0
                    return -1;
3634
713
                apos += cig_len;
3635
713
                MD_last += cig_len;
3636
713
                break;
3637
3638
12.7k
            case BAM_CINS:
3639
12.7k
                if (cram_add_insertion(c, s, cr, spos, cig_len,
3640
12.7k
                                       cr->len ? &seq[spos] : NULL))
3641
0
                    return -1;
3642
12.7k
                if (no_ref && cr->len) {
3643
230
                    for (l = 0; l < cig_len; l++, spos++) {
3644
97
                        cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3645
97
                    }
3646
12.6k
                } else {
3647
12.6k
                    spos += cig_len;
3648
12.6k
                }
3649
12.7k
                NM += cig_len;
3650
12.7k
                break;
3651
3652
77.0k
            case BAM_CSOFT_CLIP:
3653
77.0k
                if (cram_add_softclip(c, s, cr, spos, cig_len,
3654
77.0k
                                      cr->len ? &seq[spos] : NULL,
3655
77.0k
                                      fd->version))
3656
0
                    return -1;
3657
3658
77.0k
                if (no_ref &&
3659
77.0k
                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
3660
0
                    if (cr->len) {
3661
0
                        for (l = 0; l < cig_len; l++, spos++) {
3662
0
                            cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3663
0
                        }
3664
0
                    } else {
3665
0
                        for (l = 0; l < cig_len; l++, spos++) {
3666
0
                            cram_add_quality(fd, c, s, cr, spos, -1);
3667
0
                        }
3668
0
                    }
3669
77.0k
                } else {
3670
77.0k
                    spos += cig_len;
3671
77.0k
                }
3672
77.0k
                break;
3673
3674
28.7k
            case BAM_CHARD_CLIP:
3675
28.7k
                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3676
0
                    return -1;
3677
28.7k
                break;
3678
3679
28.7k
            case BAM_CPAD:
3680
585
                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3681
0
                    return -1;
3682
585
                break;
3683
3684
585
            default:
3685
174
                hts_log_error("Unknown CIGAR op code %d", cig_op);
3686
174
                return -1;
3687
509k
            }
3688
509k
        }
3689
83.2k
        if (cr->len && spos != cr->len) {
3690
3
            hts_log_error("CIGAR and query sequence are of different length");
3691
3
            return -1;
3692
3
        }
3693
83.2k
        fake_qual = spos;
3694
        // Protect against negative length refs (fuzz 382922241)
3695
83.2k
        cr->aend = no_ref ? apos : MIN(apos, MAX(0, c->ref_end));
3696
83.2k
        if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3697
0
            goto block_err;
3698
3699
83.2k
        if (MD && ref)
3700
29.6k
            if (kputuw(apos - MD_last, MD) < 0) goto err;
3701
6.85M
    } else {
3702
        // Unmapped
3703
6.85M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3704
6.85M
        cr->cigar  = 0;
3705
6.85M
        cr->ncigar = 0;
3706
6.85M
        cr->nfeature = 0;
3707
6.85M
        cr->aend = MIN(cr->apos, c->ref_end);
3708
473M
        for (i = 0; i < cr->len; i++)
3709
466M
            if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3710
0
                goto block_err;
3711
6.85M
        fake_qual = 0;
3712
6.85M
    }
3713
3714
6.94M
    cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3715
6.94M
    int err = 0;
3716
6.94M
    sam_hrec_rg_t *brg =
3717
6.94M
        cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
3718
6.94M
                        cf_tag, no_ref, &err);
3719
6.94M
    if (err)
3720
153
        goto block_err;
3721
3722
    /* Read group, identified earlier */
3723
6.94M
    if (brg) {
3724
190
        cr->rg = brg->id;
3725
6.94M
    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3726
0
        sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3727
0
        if (!brg) goto block_err;
3728
0
        cr->rg = brg->id;
3729
6.94M
    } else {
3730
6.94M
        cr->rg = -1;
3731
6.94M
    }
3732
6.94M
    if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3733
0
        goto block_err;
3734
3735
    /*
3736
     * Append to the qual block now. We do this here as
3737
     * cram_add_substitution() can generate BA/QS events which need to
3738
     * be in the qual block before we append the rest of the data.
3739
     */
3740
6.94M
    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3741
        /* Special case of seq "*" */
3742
6.94M
        if (cr->len == 0) {
3743
6.62M
            cr->len = fake_qual;
3744
6.62M
            BLOCK_GROW(s->qual_blk, cr->len);
3745
6.62M
            cp = (char *)BLOCK_END(s->qual_blk);
3746
6.62M
            memset(cp, 255, cr->len);
3747
6.62M
        } else {
3748
317k
            BLOCK_GROW(s->qual_blk, cr->len);
3749
317k
            cp = (char *)BLOCK_END(s->qual_blk);
3750
317k
            char *from = (char *)&bam_qual(b)[0];
3751
317k
            char *to = &cp[0];
3752
317k
            memcpy(to, from, cr->len);
3753
3754
            // Store quality in original orientation for better compression.
3755
317k
            if (!c->qs_seq_orient) {
3756
0
                if (cr->flags & BAM_FREVERSE) {
3757
0
                    int i, j;
3758
0
                    for (i = 0, j = cr->len-1; i < j; i++, j--) {
3759
0
                        unsigned char c;
3760
0
                        c = to[i];
3761
0
                        to[i] = to[j];
3762
0
                        to[j] = c;
3763
0
                    }
3764
0
                }
3765
0
            }
3766
317k
        }
3767
6.94M
        BLOCK_SIZE(s->qual_blk) += cr->len;
3768
6.94M
    } else {
3769
0
        if (cr->len == 0)
3770
0
            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3771
0
    }
3772
3773
6.94M
    if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3774
0
        goto block_err;
3775
3776
    /* Now we know apos and aend both, update mate-pair information */
3777
6.94M
    {
3778
6.94M
        int new;
3779
6.94M
        khint_t k;
3780
6.94M
        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3781
3782
        //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3783
        //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3784
6.94M
        if (cr->flags & BAM_FPAIRED) {
3785
28.3k
            char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3786
28.3k
            if (!key)
3787
0
                return -1;
3788
3789
28.3k
            k = kh_put(m_s2i, s->pair[sec], key, &new);
3790
28.3k
            if (-1 == new)
3791
0
                return -1;
3792
28.3k
            else if (new > 0)
3793
8.05k
                kh_val(s->pair[sec], k) = rnum;
3794
6.91M
        } else {
3795
6.91M
            new = 1;
3796
6.91M
            k = 0; // Prevents false-positive warning from gcc -Og
3797
6.91M
        }
3798
3799
6.94M
        if (new == 0) {
3800
20.3k
            cram_record *p = &s->crecs[kh_val(s->pair[sec], k)];
3801
20.3k
            int64_t aleft, aright;
3802
20.3k
            int sign;
3803
3804
20.3k
            aleft = MIN(cr->apos, p->apos);
3805
20.3k
            aright = MAX(cr->aend, p->aend);
3806
20.3k
            if (cr->apos < p->apos) {
3807
1.11k
                sign = 1;
3808
19.2k
            } else if (cr->apos > p->apos) {
3809
1.02k
                sign = -1;
3810
18.1k
            } else if (cr->flags & BAM_FREAD1) {
3811
6.23k
                sign = 1;
3812
11.9k
            } else {
3813
11.9k
                sign = -1;
3814
11.9k
            }
3815
3816
            // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3817
            // a small amount, if appropriate options set.
3818
20.3k
            if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3819
20.3k
                !(fd->tlen_zero && bam_mate_pos(b) == 0))
3820
12.9k
                goto detached;
3821
3822
7.37k
            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3823
7.37k
                ((p->flags & BAM_FUNMAP) != 0))
3824
287
                goto detached;
3825
3826
7.08k
            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3827
7.08k
                ((p->flags & BAM_FREVERSE) != 0))
3828
51
                goto detached;
3829
3830
3831
            // p vs this: tlen, matepos, flags
3832
7.03k
            if (p->ref_id != cr->ref_id &&
3833
7.03k
                !(fd->tlen_zero && p->ref_id == -1))
3834
3
                goto detached;
3835
3836
7.03k
            if (p->mate_pos != cr->apos &&
3837
7.03k
                !(fd->tlen_zero && p->mate_pos == 0))
3838
22
                goto detached;
3839
3840
7.00k
            if (((p->flags & BAM_FMUNMAP) != 0) !=
3841
7.00k
                ((p->mate_flags & CRAM_M_UNMAP) != 0))
3842
6
                goto detached;
3843
3844
7.00k
            if (((p->flags & BAM_FMREVERSE) != 0) !=
3845
7.00k
                ((p->mate_flags & CRAM_M_REVERSE) != 0))
3846
4
                goto detached;
3847
3848
            // Supplementary reads are just too ill defined
3849
6.99k
            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3850
6.99k
                (p->flags & BAM_FSUPPLEMENTARY))
3851
15
                goto detached;
3852
3853
            // When in lossy name mode, if a read isn't detached we
3854
            // cannot store the name.  The corollary is that when we
3855
            // must store the name, it must be detached (inefficient).
3856
6.98k
            if (fd->lossy_read_names &&
3857
6.98k
                (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3858
0
                 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3859
0
                goto detached;
3860
3861
            // Now check TLEN.  We do this last as sometimes it's the
3862
            // only thing that differs.  In CRAM4 we have a better way
3863
            // of handling this that doesn't break detached status
3864
6.98k
            int explicit_tlen = 0;
3865
6.98k
            int tflag1 = ((bam_ins_size(b) &&
3866
6.98k
                           llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3867
60
                           > fd->tlen_approx)
3868
6.98k
                          || (!bam_ins_size(b) && !fd->tlen_zero));
3869
3870
6.98k
            int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3871
60
                           > fd->tlen_approx)
3872
6.98k
                          || (!p->tlen && !fd->tlen_zero));
3873
3874
6.98k
            if (tflag1 || tflag2) {
3875
6.97k
                if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3876
0
                    explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3877
6.97k
                } else {
3878
                    // Stil do detached for unmapped data in CRAM4 as this
3879
                    // also impacts RNEXT calculation.
3880
6.97k
                    goto detached;
3881
6.97k
                }
3882
6.97k
            }
3883
3884
            /*
3885
             * The fields below are unused when encoding this read as it is
3886
             * no longer detached.  In theory they may get referred to when
3887
             * processing a 3rd or 4th read in this template?, so we set them
3888
             * here just to be sure.
3889
             *
3890
             * They do not need cram_stats_add() calls those as they are
3891
             * not emitted.
3892
             */
3893
11
            cr->mate_pos = p->apos;
3894
11
            cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3895
11
            cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3896
11
            cram_stats_add(c->stats[DS_TS], cr->tlen);
3897
11
            cr->mate_flags =
3898
11
                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
3899
11
                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3900
3901
            // Decrement statistics aggregated earlier
3902
11
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3903
11
                cram_stats_del(c->stats[DS_NP], p->mate_pos);
3904
11
                cram_stats_del(c->stats[DS_MF], p->mate_flags);
3905
11
                if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3906
11
                    cram_stats_del(c->stats[DS_TS], p->tlen);
3907
11
                cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3908
11
            }
3909
3910
            /* Similarly we could correct the p-> values too, but these will no
3911
             * longer have any code that refers back to them as the new 'p'
3912
             * for this template is our current 'cr'.
3913
             */
3914
            //p->mate_pos = cr->apos;
3915
            //p->mate_flags =
3916
            //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
3917
            //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3918
            //p->tlen = p->apos - cr->aend;
3919
3920
            // Clear detached from cr flags
3921
11
            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3922
11
            cr->cram_flags |= explicit_tlen;
3923
11
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3924
0
                goto block_err;
3925
3926
            // Clear detached from p flags and set downstream
3927
11
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3928
11
                cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3929
11
                p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3930
11
            }
3931
3932
11
            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3933
11
            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3934
11
            if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3935
0
                goto block_err;
3936
3937
11
            p->mate_line = rnum - (kh_val(s->pair[sec], k) + 1);
3938
11
            if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3939
0
                goto block_err;
3940
3941
11
            kh_val(s->pair[sec], k) = rnum;
3942
6.92M
        } else {
3943
6.94M
        detached:
3944
            //fprintf(stderr, "unpaired\n");
3945
3946
            /* Derive mate flags from this flag */
3947
6.94M
            cr->mate_flags = 0;
3948
6.94M
            if (bam_flag(b) & BAM_FMUNMAP)
3949
13.7k
                cr->mate_flags |= CRAM_M_UNMAP;
3950
6.94M
            if (bam_flag(b) & BAM_FMREVERSE)
3951
235
                cr->mate_flags |= CRAM_M_REVERSE;
3952
3953
6.94M
            if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3954
0
                goto block_err;
3955
3956
6.94M
            cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3957
6.94M
            if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3958
0
                goto block_err;
3959
3960
6.94M
            cr->tlen        = bam_ins_size(b);
3961
6.94M
            if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3962
0
                goto block_err;
3963
3964
6.94M
            cr->cram_flags |= CRAM_FLAG_DETACHED;
3965
6.94M
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3966
0
                goto block_err;
3967
6.94M
            if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3968
0
                goto block_err;
3969
3970
6.94M
            cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3971
6.94M
        }
3972
6.94M
    }
3973
3974
6.94M
    cr->mqual       = bam_map_qual(b);
3975
6.94M
    if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3976
0
        goto block_err;
3977
3978
6.94M
    cr->mate_ref_id = bam_mate_ref(b);
3979
3980
6.94M
    if (!(bam_flag(b) & BAM_FUNMAP)) {
3981
83.1k
        if (c->first_base > cr->apos)
3982
178
            c->first_base = cr->apos;
3983
3984
83.1k
        if (c->last_base < cr->aend)
3985
3.48k
            c->last_base = cr->aend;
3986
83.1k
    }
3987
3988
6.94M
    return 0;
3989
3990
153
 block_err:
3991
186
 err:
3992
186
    return -1;
3993
153
}
3994
3995
/*
3996
 * Write iterator: put BAM format sequences into a CRAM file.
3997
 * We buffer up a containers worth of data at a time.
3998
 *
3999
 * Returns 0 on success
4000
 *        -1 on failure
4001
 */
4002
6.94M
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
4003
6.94M
    cram_container *c;
4004
4005
6.94M
    if (!fd->ctr) {
4006
5.74k
        fd->ctr = cram_new_container(fd->seqs_per_slice,
4007
5.74k
                                     fd->slices_per_container);
4008
5.74k
        if (!fd->ctr)
4009
0
            return -1;
4010
5.74k
        fd->ctr->record_counter = fd->record_counter;
4011
4012
5.74k
        pthread_mutex_lock(&fd->ref_lock);
4013
5.74k
        fd->ctr->no_ref = fd->no_ref;
4014
5.74k
        fd->ctr->embed_ref = fd->embed_ref;
4015
5.74k
        pthread_mutex_unlock(&fd->ref_lock);
4016
5.74k
    }
4017
6.94M
    c = fd->ctr;
4018
4019
6.94M
    int embed_ref = c->embed_ref;
4020
4021
6.94M
    if (!c->slice || c->curr_rec == c->max_rec ||
4022
6.94M
        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
4023
6.94M
        (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
4024
67.4k
        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
4025
67.4k
        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
4026
4027
        /*
4028
         * Start packing slices when we routinely have under 1/4tr full.
4029
         *
4030
         * This option isn't available if we choose to embed references
4031
         * since we can only have one per slice.
4032
         *
4033
         * The multi_seq var here refers to our intention for the next slice.
4034
         * This slice has already been encoded so we output as-is.
4035
         */
4036
67.4k
        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
4037
67.4k
            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
4038
67.4k
            embed_ref<=0) {
4039
822
            if (!c->multi_seq)
4040
577
                hts_log_info("Multi-ref enabled for next container");
4041
822
            multi_seq = 1;
4042
66.6k
        } else if (fd->multi_seq == 1) {
4043
4.34k
            pthread_mutex_lock(&fd->metrics_lock);
4044
4.34k
            if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
4045
555
                multi_seq = 0;
4046
555
                hts_log_info("Multi-ref disabled for next container");
4047
555
            }
4048
4.34k
            pthread_mutex_unlock(&fd->metrics_lock);
4049
4.34k
        }
4050
4051
67.4k
        slice_rec = c->slice_rec;
4052
67.4k
        curr_rec  = c->curr_rec;
4053
4054
67.4k
        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
4055
67.4k
            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
4056
67.4k
            c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
4057
63.5k
            if (NULL == (c = cram_next_container(fd, b))) {
4058
29
                if (fd->ctr) {
4059
                    // prevent cram_close attempting to flush
4060
29
                    fd->ctr_mt = fd->ctr; // delay free when threading
4061
29
                    fd->ctr = NULL;
4062
29
                }
4063
29
                return -1;
4064
29
            }
4065
63.5k
        }
4066
4067
        /*
4068
         * Due to our processing order, some things we've already done we
4069
         * cannot easily undo. So when we first notice we should be packing
4070
         * multiple sequences per container we emit the small partial
4071
         * container as-is and then start a fresh one in a different mode.
4072
         */
4073
67.4k
        if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
4074
            // User selected auto-mode, we're currently using multi-seq, but
4075
            // have detected we don't need to.  Switch back to auto.
4076
554
            fd->multi_seq = -1;
4077
66.8k
        } else if (multi_seq) {
4078
            // We detected we need multi-seq
4079
4.61k
            fd->multi_seq = 1;
4080
4.61k
            c->multi_seq = 1;
4081
4.61k
            c->pos_sorted = 0;
4082
4083
            // Cram_next_container may end up flushing an existing one and
4084
            // triggering fd->embed_ref=2 if no reference is found.
4085
            // Embedded refs are incompatible with multi-seq, so we bail
4086
            // out and switch to no_ref in this scenario.  We do this
4087
            // within the container only, as multi_seq may be temporary
4088
            // and we switch back away from it again.
4089
4.61k
            pthread_mutex_lock(&fd->ref_lock);
4090
4.61k
            if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
4091
110
                hts_log_warning("Changing from embed_ref to no_ref mode");
4092
                // Should we update fd->embed_ref and no_ref here too?
4093
                // Doing so means if we go into multi-seq and back out
4094
                // again, eg due a cluster of tiny refs in the middle of
4095
                // much larger ones, then we bake in no-ref mode.
4096
                //
4097
                // However for unsorted data we're realistically not
4098
                // going to switch back.
4099
110
                c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
4100
110
                c->no_ref = fd->no_ref = 1;
4101
110
            }
4102
4.61k
            pthread_mutex_unlock(&fd->ref_lock);
4103
4104
4.61k
            if (!c->refs_used) {
4105
975
                pthread_mutex_lock(&fd->ref_lock);
4106
975
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4107
975
                pthread_mutex_unlock(&fd->ref_lock);
4108
975
                if (!c->refs_used)
4109
0
                    return -1;
4110
975
            }
4111
4.61k
        }
4112
4113
67.4k
        fd->last_slice = curr_rec - slice_rec;
4114
67.4k
        c->slice_rec = c->curr_rec;
4115
4116
        // Have we seen this reference before?
4117
67.4k
        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
4118
67.4k
            embed_ref<=0 && !fd->unsorted && multi_seq) {
4119
4120
20
            if (!c->refs_used) {
4121
0
                pthread_mutex_lock(&fd->ref_lock);
4122
0
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4123
0
                pthread_mutex_unlock(&fd->ref_lock);
4124
0
                if (!c->refs_used)
4125
0
                    return -1;
4126
20
            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
4127
6
                pthread_mutex_lock(&fd->ref_lock);
4128
6
                fd->unsorted = 1;
4129
6
                fd->multi_seq = 1;
4130
6
                pthread_mutex_unlock(&fd->ref_lock);
4131
6
            }
4132
20
        }
4133
4134
67.4k
        c->curr_ref = bam_ref(b);
4135
67.4k
        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
4136
67.4k
    }
4137
4138
6.94M
    if (!c->bams) {
4139
        /* First time through, allocate a set of bam pointers */
4140
63.4k
        pthread_mutex_lock(&fd->bam_list_lock);
4141
63.4k
        if (fd->bl) {
4142
57.7k
            spare_bams *spare = fd->bl;
4143
57.7k
            c->bams = spare->bams;
4144
57.7k
            fd->bl = spare->next;
4145
57.7k
            free(spare);
4146
57.7k
        } else {
4147
5.74k
            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
4148
5.74k
            if (!c->bams) {
4149
0
                pthread_mutex_unlock(&fd->bam_list_lock);
4150
0
                return -1;
4151
0
            }
4152
5.74k
        }
4153
63.4k
        pthread_mutex_unlock(&fd->bam_list_lock);
4154
63.4k
    }
4155
4156
    /* Copy or alloc+copy the bam record, for later encoding */
4157
6.94M
    if (c->bams[c->curr_c_rec]) {
4158
5.61M
        if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
4159
0
            return -1;
4160
5.61M
    } else {
4161
1.32M
        c->bams[c->curr_c_rec] = bam_dup1(b);
4162
1.32M
        if (c->bams[c->curr_c_rec] == NULL)
4163
0
            return -1;
4164
1.32M
    }
4165
6.94M
    if (bam_seq_len(b)) {
4166
317k
        c->s_num_bases += bam_seq_len(b);
4167
6.62M
    } else {
4168
        // No sequence in BAM record.  CRAM doesn't directly support this
4169
        // case, it ends up being stored as a string of N's for each query
4170
        // consuming CIGAR operation.  As this can become very inefficient
4171
        // in time and memory, data where the query length is excessively
4172
        // long are rejected.
4173
6.62M
        hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
4174
6.62M
        if (qlen > 100000000) {
4175
74
            hts_log_error("CIGAR query length %"PRIhts_pos
4176
74
                          " for read \"%s\" is too long",
4177
74
                          qlen, bam_get_qname(b));
4178
74
            return -1;
4179
74
        }
4180
6.62M
        c->s_num_bases += qlen;
4181
6.62M
    }
4182
6.94M
    c->curr_rec++;
4183
6.94M
    c->curr_c_rec++;
4184
6.94M
    c->s_aux_bytes += bam_get_l_aux(b);
4185
6.94M
    c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
4186
6.94M
    fd->record_counter++;
4187
4188
6.94M
    return 0;
4189
6.94M
}