Coverage Report

Created: 2025-11-09 07:19

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_encode.c
Line
Count
Source
1
/*
2
Copyright (c) 2012-2020, 2022-2024 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
32
#include <config.h>
33
34
#include <stdio.h>
35
#include <errno.h>
36
#include <assert.h>
37
#include <stdlib.h>
38
#include <string.h>
39
#include <strings.h>
40
#include <zlib.h>
41
#include <sys/types.h>
42
#include <sys/stat.h>
43
#include <math.h>
44
#include <inttypes.h>
45
#include <ctype.h>
46
47
#include "cram.h"
48
#include "os.h"
49
#include "../sam_internal.h" // for nibble2base
50
#include "../htslib/hts.h"
51
#include "../htslib/hts_endian.h"
52
#include "../textutils_internal.h"
53
54
KHASH_MAP_INIT_STR(m_s2u64, uint64_t)
55
56
#define Z_CRAM_STRAT Z_FILTERED
57
//#define Z_CRAM_STRAT Z_RLE
58
//#define Z_CRAM_STRAT Z_HUFFMAN_ONLY
59
//#define Z_CRAM_STRAT Z_DEFAULT_STRATEGY
60
61
static int process_one_read(cram_fd *fd, cram_container *c,
62
                            cram_slice *s, cram_record *cr,
63
                            bam_seq_t *b, int rnum, kstring_t *MD,
64
                            int embed_ref, int no_ref);
65
66
/*
67
 * Returns index of val into key.
68
 * Basically strchr(key, val)-key;
69
 */
70
1.05M
static int sub_idx(char *key, char val) {
71
1.05M
    int i;
72
73
2.63M
    for (i = 0; i < 4 && *key++ != val; i++);
74
1.05M
    return i;
75
1.05M
}
76
77
/*
78
 * Encodes a compression header block into a generic cram_block structure.
79
 *
80
 * Returns cram_block ptr on success
81
 *         NULL on failure
82
 */
83
cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c,
84
                                           cram_block_compression_hdr *h,
85
59.0k
                                           int embed_ref) {
86
59.0k
    cram_block *cb  = cram_new_block(COMPRESSION_HEADER, 0);
87
59.0k
    cram_block *map = cram_new_block(COMPRESSION_HEADER, 0);
88
59.0k
    int i, mc, r = 0;
89
90
59.0k
    int no_ref = c->no_ref;
91
92
59.0k
    if (!cb || !map)
93
0
        return NULL;
94
95
    /*
96
     * This is a concatenation of several blocks of data:
97
     * header + landmarks, preservation map, read encoding map, and the tag
98
     * encoding map.
99
     * All 4 are variable sized and we need to know how large these are
100
     * before creating the compression header itself as this starts with
101
     * the total size (stored as a variable length string).
102
     */
103
104
    // Duplicated from container itself, and removed in 1.1
105
59.0k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
106
0
        r |= itf8_put_blk(cb, h->ref_seq_id);
107
0
        r |= itf8_put_blk(cb, h->ref_seq_start);
108
0
        r |= itf8_put_blk(cb, h->ref_seq_span);
109
0
        r |= itf8_put_blk(cb, h->num_records);
110
0
        r |= itf8_put_blk(cb, h->num_landmarks);
111
0
        for (i = 0; i < h->num_landmarks; i++) {
112
0
            r |= itf8_put_blk(cb, h->landmark[i]);
113
0
        }
114
0
    }
115
116
59.0k
    if (h->preservation_map) {
117
0
        kh_destroy(map, h->preservation_map);
118
0
        h->preservation_map = NULL;
119
0
    }
120
121
    /* Create in-memory preservation map */
122
    /* FIXME: should create this when we create the container */
123
59.0k
    if (c->num_records > 0) {
124
52.7k
        khint_t k;
125
52.7k
        int r;
126
127
52.7k
        if (!(h->preservation_map = kh_init(map)))
128
0
            return NULL;
129
130
52.7k
        k = kh_put(map, h->preservation_map, "RN", &r);
131
52.7k
        if (-1 == r) return NULL;
132
52.7k
        kh_val(h->preservation_map, k).i = !fd->lossy_read_names;
133
134
52.7k
        if (CRAM_MAJOR_VERS(fd->version) == 1) {
135
0
            k = kh_put(map, h->preservation_map, "PI", &r);
136
0
            if (-1 == r) return NULL;
137
0
            kh_val(h->preservation_map, k).i = 0;
138
139
0
            k = kh_put(map, h->preservation_map, "UI", &r);
140
0
            if (-1 == r) return NULL;
141
0
            kh_val(h->preservation_map, k).i = 1;
142
143
0
            k = kh_put(map, h->preservation_map, "MI", &r);
144
0
            if (-1 == r) return NULL;
145
0
            kh_val(h->preservation_map, k).i = 1;
146
147
52.7k
        } else {
148
            // Technically SM was in 1.0, but wasn't in Java impl.
149
52.7k
            k = kh_put(map, h->preservation_map, "SM", &r);
150
52.7k
            if (-1 == r) return NULL;
151
52.7k
            kh_val(h->preservation_map, k).i = 0;
152
153
52.7k
            k = kh_put(map, h->preservation_map, "TD", &r);
154
52.7k
            if (-1 == r) return NULL;
155
52.7k
            kh_val(h->preservation_map, k).i = 0;
156
157
52.7k
            k = kh_put(map, h->preservation_map, "AP", &r);
158
52.7k
            if (-1 == r) return NULL;
159
52.7k
            kh_val(h->preservation_map, k).i = h->AP_delta;
160
161
52.7k
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
162
0
                k = kh_put(map, h->preservation_map, "QO", &r);
163
0
                if (-1 == r) return NULL;
164
0
                kh_val(h->preservation_map, k).i = h->qs_seq_orient;
165
0
            }
166
167
52.7k
            if (no_ref || embed_ref>0) {
168
                // Reference Required == No
169
50.7k
                k = kh_put(map, h->preservation_map, "RR", &r);
170
50.7k
                if (-1 == r) return NULL;
171
50.7k
                kh_val(h->preservation_map, k).i = 0;
172
50.7k
            }
173
52.7k
        }
174
52.7k
    }
175
176
    /* Encode preservation map; could collapse this and above into one */
177
59.0k
    mc = 0;
178
59.0k
    BLOCK_SIZE(map) = 0;
179
59.0k
    if (h->preservation_map) {
180
52.7k
        khint_t k;
181
182
52.7k
        for (k = kh_begin(h->preservation_map);
183
475k
             k != kh_end(h->preservation_map);
184
422k
             k++) {
185
422k
            const char *key;
186
422k
            khash_t(map) *pmap = h->preservation_map;
187
188
189
422k
            if (!kh_exist(pmap, k))
190
160k
                continue;
191
192
261k
            key = kh_key(pmap, k);
193
261k
            BLOCK_APPEND(map, key, 2);
194
195
261k
            switch(CRAM_KEY(key[0], key[1])) {
196
0
            case CRAM_KEY('M','I'):
197
0
            case CRAM_KEY('U','I'):
198
0
            case CRAM_KEY('P','I'):
199
52.7k
            case CRAM_KEY('A','P'):
200
105k
            case CRAM_KEY('R','N'):
201
156k
            case CRAM_KEY('R','R'):
202
156k
            case CRAM_KEY('Q','O'):
203
156k
                BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i);
204
156k
                break;
205
206
156k
            case CRAM_KEY('S','M'): {
207
52.7k
                char smat[5], *mp = smat;
208
                // Output format is for order ACGTN (minus ref base)
209
                // to store the code value 0-3 for each symbol.
210
                //
211
                // Note this is different to storing the symbols in order
212
                // that the codes occur from 0-3, which is what we used to
213
                // do.  (It didn't matter as we always had a fixed table in
214
                // the order.)
215
52.7k
                *mp++ =
216
52.7k
                    (sub_idx(h->substitution_matrix[0], 'C') << 6) |
217
52.7k
                    (sub_idx(h->substitution_matrix[0], 'G') << 4) |
218
52.7k
                    (sub_idx(h->substitution_matrix[0], 'T') << 2) |
219
52.7k
                    (sub_idx(h->substitution_matrix[0], 'N') << 0);
220
52.7k
                *mp++ =
221
52.7k
                    (sub_idx(h->substitution_matrix[1], 'A') << 6) |
222
52.7k
                    (sub_idx(h->substitution_matrix[1], 'G') << 4) |
223
52.7k
                    (sub_idx(h->substitution_matrix[1], 'T') << 2) |
224
52.7k
                    (sub_idx(h->substitution_matrix[1], 'N') << 0);
225
52.7k
                *mp++ =
226
52.7k
                    (sub_idx(h->substitution_matrix[2], 'A') << 6) |
227
52.7k
                    (sub_idx(h->substitution_matrix[2], 'C') << 4) |
228
52.7k
                    (sub_idx(h->substitution_matrix[2], 'T') << 2) |
229
52.7k
                    (sub_idx(h->substitution_matrix[2], 'N') << 0);
230
52.7k
                *mp++ =
231
52.7k
                    (sub_idx(h->substitution_matrix[3], 'A') << 6) |
232
52.7k
                    (sub_idx(h->substitution_matrix[3], 'C') << 4) |
233
52.7k
                    (sub_idx(h->substitution_matrix[3], 'G') << 2) |
234
52.7k
                    (sub_idx(h->substitution_matrix[3], 'N') << 0);
235
52.7k
                *mp++ =
236
52.7k
                    (sub_idx(h->substitution_matrix[4], 'A') << 6) |
237
52.7k
                    (sub_idx(h->substitution_matrix[4], 'C') << 4) |
238
52.7k
                    (sub_idx(h->substitution_matrix[4], 'G') << 2) |
239
52.7k
                    (sub_idx(h->substitution_matrix[4], 'T') << 0);
240
52.7k
                BLOCK_APPEND(map, smat, 5);
241
52.7k
                break;
242
52.7k
            }
243
244
52.7k
            case CRAM_KEY('T','D'): {
245
52.7k
                r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0);
246
52.7k
                BLOCK_APPEND(map,
247
52.7k
                             BLOCK_DATA(h->TD_blk),
248
52.7k
                             BLOCK_SIZE(h->TD_blk));
249
52.7k
                break;
250
52.7k
            }
251
252
52.7k
            default:
253
0
                hts_log_warning("Unknown preservation key '%.2s'", key);
254
0
                break;
255
261k
            }
256
257
261k
            mc++;
258
261k
        }
259
52.7k
    }
260
59.0k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
261
59.0k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
262
59.0k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
263
264
    /* rec encoding map */
265
59.0k
    mc = 0;
266
59.0k
    BLOCK_SIZE(map) = 0;
267
59.0k
    if (h->codecs[DS_BF]) {
268
52.7k
        if (-1 == h->codecs[DS_BF]->store(h->codecs[DS_BF], map, "BF",
269
52.7k
                                          fd->version))
270
0
            return NULL;
271
52.7k
        mc++;
272
52.7k
    }
273
59.0k
    if (h->codecs[DS_CF]) {
274
52.7k
        if (-1 == h->codecs[DS_CF]->store(h->codecs[DS_CF], map, "CF",
275
52.7k
                                          fd->version))
276
0
            return NULL;
277
52.7k
        mc++;
278
52.7k
    }
279
59.0k
    if (h->codecs[DS_RL]) {
280
52.7k
        if (-1 == h->codecs[DS_RL]->store(h->codecs[DS_RL], map, "RL",
281
52.7k
                                          fd->version))
282
0
            return NULL;
283
52.7k
        mc++;
284
52.7k
    }
285
59.0k
    if (h->codecs[DS_AP]) {
286
52.7k
        if (-1 == h->codecs[DS_AP]->store(h->codecs[DS_AP], map, "AP",
287
52.7k
                                          fd->version))
288
0
            return NULL;
289
52.7k
        mc++;
290
52.7k
    }
291
59.0k
    if (h->codecs[DS_RG]) {
292
52.7k
        if (-1 == h->codecs[DS_RG]->store(h->codecs[DS_RG], map, "RG",
293
52.7k
                                          fd->version))
294
0
            return NULL;
295
52.7k
        mc++;
296
52.7k
    }
297
59.0k
    if (h->codecs[DS_MF]) {
298
52.7k
        if (-1 == h->codecs[DS_MF]->store(h->codecs[DS_MF], map, "MF",
299
52.7k
                                          fd->version))
300
0
            return NULL;
301
52.7k
        mc++;
302
52.7k
    }
303
59.0k
    if (h->codecs[DS_NS]) {
304
52.7k
        if (-1 == h->codecs[DS_NS]->store(h->codecs[DS_NS], map, "NS",
305
52.7k
                                          fd->version))
306
0
            return NULL;
307
52.7k
        mc++;
308
52.7k
    }
309
59.0k
    if (h->codecs[DS_NP]) {
310
52.7k
        if (-1 == h->codecs[DS_NP]->store(h->codecs[DS_NP], map, "NP",
311
52.7k
                                          fd->version))
312
0
            return NULL;
313
52.7k
        mc++;
314
52.7k
    }
315
59.0k
    if (h->codecs[DS_TS]) {
316
52.7k
        if (-1 == h->codecs[DS_TS]->store(h->codecs[DS_TS], map, "TS",
317
52.7k
                                          fd->version))
318
0
            return NULL;
319
52.7k
        mc++;
320
52.7k
    }
321
59.0k
    if (h->codecs[DS_NF]) {
322
2
        if (-1 == h->codecs[DS_NF]->store(h->codecs[DS_NF], map, "NF",
323
2
                                          fd->version))
324
0
            return NULL;
325
2
        mc++;
326
2
    }
327
59.0k
    if (h->codecs[DS_TC]) {
328
0
        if (-1 == h->codecs[DS_TC]->store(h->codecs[DS_TC], map, "TC",
329
0
                                          fd->version))
330
0
            return NULL;
331
0
        mc++;
332
0
    }
333
59.0k
    if (h->codecs[DS_TN]) {
334
0
        if (-1 == h->codecs[DS_TN]->store(h->codecs[DS_TN], map, "TN",
335
0
                                          fd->version))
336
0
            return NULL;
337
0
        mc++;
338
0
    }
339
59.0k
    if (h->codecs[DS_TL]) {
340
52.7k
        if (-1 == h->codecs[DS_TL]->store(h->codecs[DS_TL], map, "TL",
341
52.7k
                                          fd->version))
342
0
            return NULL;
343
52.7k
        mc++;
344
52.7k
    }
345
59.0k
    if (h->codecs[DS_FN]) {
346
24.9k
        if (-1 == h->codecs[DS_FN]->store(h->codecs[DS_FN], map, "FN",
347
24.9k
                                          fd->version))
348
0
            return NULL;
349
24.9k
        mc++;
350
24.9k
    }
351
59.0k
    if (h->codecs[DS_FC]) {
352
24.9k
        if (-1 == h->codecs[DS_FC]->store(h->codecs[DS_FC], map, "FC",
353
24.9k
                                          fd->version))
354
0
            return NULL;
355
24.9k
        mc++;
356
24.9k
    }
357
59.0k
    if (h->codecs[DS_FP]) {
358
24.9k
        if (-1 == h->codecs[DS_FP]->store(h->codecs[DS_FP], map, "FP",
359
24.9k
                                          fd->version))
360
0
            return NULL;
361
24.9k
        mc++;
362
24.9k
    }
363
59.0k
    if (h->codecs[DS_BS]) {
364
2.17k
        if (-1 == h->codecs[DS_BS]->store(h->codecs[DS_BS], map, "BS",
365
2.17k
                                          fd->version))
366
0
            return NULL;
367
2.17k
        mc++;
368
2.17k
    }
369
59.0k
    if (h->codecs[DS_IN]) {
370
52.7k
        if (-1 == h->codecs[DS_IN]->store(h->codecs[DS_IN], map, "IN",
371
52.7k
                                          fd->version))
372
0
            return NULL;
373
52.7k
        mc++;
374
52.7k
    }
375
59.0k
    if (h->codecs[DS_DL]) {
376
7.95k
        if (-1 == h->codecs[DS_DL]->store(h->codecs[DS_DL], map, "DL",
377
7.95k
                                          fd->version))
378
0
            return NULL;
379
7.95k
        mc++;
380
7.95k
    }
381
59.0k
    if (h->codecs[DS_BA]) {
382
5.45k
        if (-1 == h->codecs[DS_BA]->store(h->codecs[DS_BA], map, "BA",
383
5.45k
                                          fd->version))
384
0
            return NULL;
385
5.45k
        mc++;
386
5.45k
    }
387
59.0k
    if (h->codecs[DS_BB]) {
388
52.7k
        if (-1 == h->codecs[DS_BB]->store(h->codecs[DS_BB], map, "BB",
389
52.7k
                                          fd->version))
390
0
            return NULL;
391
52.7k
        mc++;
392
52.7k
    }
393
59.0k
    if (h->codecs[DS_MQ]) {
394
52.7k
        if (-1 == h->codecs[DS_MQ]->store(h->codecs[DS_MQ], map, "MQ",
395
52.7k
                                          fd->version))
396
0
            return NULL;
397
52.7k
        mc++;
398
52.7k
    }
399
59.0k
    if (h->codecs[DS_RN]) {
400
52.7k
        if (-1 == h->codecs[DS_RN]->store(h->codecs[DS_RN], map, "RN",
401
52.7k
                                          fd->version))
402
0
            return NULL;
403
52.7k
        mc++;
404
52.7k
    }
405
59.0k
    if (h->codecs[DS_QS]) {
406
52.7k
        if (-1 == h->codecs[DS_QS]->store(h->codecs[DS_QS], map, "QS",
407
52.7k
                                          fd->version))
408
0
            return NULL;
409
52.7k
        mc++;
410
52.7k
    }
411
59.0k
    if (h->codecs[DS_QQ]) {
412
0
        if (-1 == h->codecs[DS_QQ]->store(h->codecs[DS_QQ], map, "QQ",
413
0
                                          fd->version))
414
0
            return NULL;
415
0
        mc++;
416
0
    }
417
59.0k
    if (h->codecs[DS_RI]) {
418
52.7k
        if (-1 == h->codecs[DS_RI]->store(h->codecs[DS_RI], map, "RI",
419
52.7k
                                          fd->version))
420
0
            return NULL;
421
52.7k
        mc++;
422
52.7k
    }
423
59.0k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
424
59.0k
        if (h->codecs[DS_SC]) {
425
52.7k
            if (-1 == h->codecs[DS_SC]->store(h->codecs[DS_SC], map, "SC",
426
52.7k
                                              fd->version))
427
0
                return NULL;
428
52.7k
            mc++;
429
52.7k
        }
430
59.0k
        if (h->codecs[DS_RS]) {
431
127
            if (-1 == h->codecs[DS_RS]->store(h->codecs[DS_RS], map, "RS",
432
127
                                              fd->version))
433
0
                return NULL;
434
127
            mc++;
435
127
        }
436
59.0k
        if (h->codecs[DS_PD]) {
437
74
            if (-1 == h->codecs[DS_PD]->store(h->codecs[DS_PD], map, "PD",
438
74
                                              fd->version))
439
0
                return NULL;
440
74
            mc++;
441
74
        }
442
59.0k
        if (h->codecs[DS_HC]) {
443
5.21k
            if (-1 == h->codecs[DS_HC]->store(h->codecs[DS_HC], map, "HC",
444
5.21k
                                              fd->version))
445
0
                return NULL;
446
5.21k
            mc++;
447
5.21k
        }
448
59.0k
    }
449
59.0k
    if (h->codecs[DS_TM]) {
450
0
        if (-1 == h->codecs[DS_TM]->store(h->codecs[DS_TM], map, "TM",
451
0
                                          fd->version))
452
0
            return NULL;
453
0
        mc++;
454
0
    }
455
59.0k
    if (h->codecs[DS_TV]) {
456
0
        if (-1 == h->codecs[DS_TV]->store(h->codecs[DS_TV], map, "TV",
457
0
                                          fd->version))
458
0
            return NULL;
459
0
        mc++;
460
0
    }
461
59.0k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
462
59.0k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
463
59.0k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
464
465
    /* tag encoding map */
466
59.0k
    mc = 0;
467
59.0k
    BLOCK_SIZE(map) = 0;
468
59.0k
    if (c->tags_used) {
469
52.7k
        khint_t k;
470
471
542k
        for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
472
489k
            int key;
473
489k
            if (!kh_exist(c->tags_used, k))
474
251k
                continue;
475
476
238k
            key = kh_key(c->tags_used, k);
477
238k
            cram_codec *cd = kh_val(c->tags_used, k)->codec;
478
479
238k
            r |= (fd->vv.varint_put32_blk(map, key) <= 0);
480
238k
            if (-1 == cd->store(cd, map, NULL, fd->version))
481
0
                return NULL;
482
483
238k
            mc++;
484
238k
        }
485
52.7k
    }
486
487
59.0k
    r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0);
488
59.0k
    r |= (fd->vv.varint_put32_blk(cb, mc) <= 0);
489
59.0k
    BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map));
490
491
59.0k
    hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb));
492
493
59.0k
    BLOCK_UPLEN(cb);
494
495
59.0k
    cram_free_block(map);
496
497
59.0k
    if (r >= 0)
498
59.0k
        return cb;
499
500
0
 block_err:
501
0
    return NULL;
502
59.0k
}
503
504
505
/*
506
 * Encodes a slice compression header.
507
 *
508
 * Returns cram_block on success
509
 *         NULL on failure
510
 */
511
52.9k
cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) {
512
52.9k
    char *buf;
513
52.9k
    char *cp;
514
52.9k
    cram_block *b = cram_new_block(MAPPED_SLICE, 0);
515
52.9k
    int j;
516
517
52.9k
    if (!b)
518
0
        return NULL;
519
520
52.9k
    cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks));
521
52.9k
    if (NULL == buf) {
522
0
        cram_free_block(b);
523
0
        return NULL;
524
0
    }
525
526
52.9k
    cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id);
527
52.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 4) {
528
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start);
529
0
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span);
530
52.9k
    } else {
531
52.9k
        if (s->hdr->ref_seq_start < 0 || s->hdr->ref_seq_start > INT_MAX) {
532
194
            hts_log_error("Reference position too large for CRAM 3");
533
194
            cram_free_block(b);
534
194
            free(buf);
535
194
            return NULL;
536
194
        }
537
52.7k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start);
538
52.7k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span);
539
52.7k
    }
540
52.7k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records);
541
52.7k
    if (CRAM_MAJOR_VERS(fd->version) == 2)
542
0
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter);
543
52.7k
    else if (CRAM_MAJOR_VERS(fd->version) >= 3)
544
52.7k
        cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter);
545
52.7k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks);
546
52.7k
    cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids);
547
425k
    for (j = 0; j < s->hdr->num_content_ids; j++) {
548
372k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]);
549
372k
    }
550
52.7k
    if (s->hdr->content_type == MAPPED_SLICE)
551
52.7k
        cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id);
552
553
52.7k
    if (CRAM_MAJOR_VERS(fd->version) != 1) {
554
52.7k
        memcpy(cp, s->hdr->md5, 16); cp += 16;
555
52.7k
    }
556
557
52.7k
    assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks));
558
559
52.7k
    b->data = (unsigned char *)buf;
560
52.7k
    b->comp_size = b->uncomp_size = cp-buf;
561
562
52.7k
    return b;
563
52.7k
}
564
565
566
/*
567
 * Encodes a single read.
568
 *
569
 * Returns 0 on success
570
 *        -1 on failure
571
 */
572
static int cram_encode_slice_read(cram_fd *fd,
573
                                  cram_container *c,
574
                                  cram_block_compression_hdr *h,
575
                                  cram_slice *s,
576
                                  cram_record *cr,
577
6.47M
                                  int64_t *last_pos) {
578
6.47M
    int r = 0;
579
6.47M
    int32_t i32;
580
6.47M
    int64_t i64;
581
6.47M
    unsigned char uc;
582
583
    //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name);
584
585
    //printf("BF=0x%x\n", cr->flags);
586
    //      bf = cram_flag_swap[cr->flags];
587
6.47M
    i32 = fd->cram_flag_swap[cr->flags & 0xfff];
588
6.47M
    r |= h->codecs[DS_BF]->encode(s, h->codecs[DS_BF], (char *)&i32, 1);
589
590
6.47M
    i32 = cr->cram_flags & CRAM_FLAG_MASK;
591
6.47M
    r |= h->codecs[DS_CF]->encode(s, h->codecs[DS_CF], (char *)&i32, 1);
592
593
6.47M
    if (CRAM_MAJOR_VERS(fd->version) != 1 && s->hdr->ref_seq_id == -2)
594
15.8k
        r |= h->codecs[DS_RI]->encode(s, h->codecs[DS_RI], (char *)&cr->ref_id, 1);
595
596
6.47M
    r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1);
597
598
6.47M
    if (c->pos_sorted) {
599
6.45M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
600
0
            i64 = cr->apos - *last_pos;
601
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
602
6.45M
        } else {
603
6.45M
            i32 = cr->apos - *last_pos;
604
6.45M
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
605
6.45M
        }
606
6.45M
        *last_pos = cr->apos;
607
6.45M
    } else {
608
18.2k
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
609
0
            i64 = cr->apos;
610
0
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1);
611
18.2k
        } else {
612
18.2k
            i32 = cr->apos;
613
18.2k
            r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1);
614
18.2k
        }
615
18.2k
    }
616
617
6.47M
    r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1);
618
619
6.47M
    if (cr->cram_flags & CRAM_FLAG_DETACHED) {
620
6.47M
        i32 = cr->mate_flags;
621
6.47M
        r |= h->codecs[DS_MF]->encode(s, h->codecs[DS_MF], (char *)&i32, 1);
622
623
6.47M
        r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS],
624
6.47M
                                      (char *)&cr->mate_ref_id, 1);
625
626
6.47M
        if (CRAM_MAJOR_VERS(fd->version) >= 4) {
627
0
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
628
0
                                          (char *)&cr->mate_pos, 1);
629
0
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
630
0
                                          (char *)&cr->tlen, 1);
631
6.47M
        } else {
632
6.47M
            i32 = cr->mate_pos;
633
6.47M
            r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP],
634
6.47M
                                          (char *)&i32, 1);
635
6.47M
            i32 = cr->tlen;
636
6.47M
            r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
637
6.47M
                                          (char *)&i32, 1);
638
6.47M
        }
639
6.47M
    } else {
640
4
        if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) {
641
2
            r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF],
642
2
                                          (char *)&cr->mate_line, 1);
643
2
        }
644
4
        if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) {
645
0
            if (CRAM_MAJOR_VERS(fd->version) >= 4) {
646
0
                r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS],
647
0
                                              (char *)&cr->tlen, 1);
648
0
            }
649
0
        }
650
4
    }
651
652
    /* Aux tags */
653
6.47M
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
654
0
        int j;
655
0
        uc = cr->ntags;
656
0
        r |= h->codecs[DS_TC]->encode(s, h->codecs[DS_TC], (char *)&uc, 1);
657
658
0
        for (j = 0; j < cr->ntags; j++) {
659
0
            uint32_t i32 = s->TN[cr->TN_idx + j]; // id
660
0
            r |= h->codecs[DS_TN]->encode(s, h->codecs[DS_TN], (char *)&i32, 1);
661
0
        }
662
6.47M
    } else {
663
6.47M
        r |= h->codecs[DS_TL]->encode(s, h->codecs[DS_TL], (char *)&cr->TL, 1);
664
6.47M
    }
665
666
    // qual
667
    // QS codec : Already stored in block[2].
668
669
    // features (diffs)
670
6.47M
    if (!(cr->flags & BAM_FUNMAP)) {
671
101k
        int prev_pos = 0, j;
672
673
101k
        r |= h->codecs[DS_FN]->encode(s, h->codecs[DS_FN],
674
101k
                                      (char *)&cr->nfeature, 1);
675
306k
        for (j = 0; j < cr->nfeature; j++) {
676
204k
            cram_feature *f = &s->features[cr->feature + j];
677
678
204k
            uc = f->X.code;
679
204k
            r |= h->codecs[DS_FC]->encode(s, h->codecs[DS_FC], (char *)&uc, 1);
680
204k
            i32 = f->X.pos - prev_pos;
681
204k
            r |= h->codecs[DS_FP]->encode(s, h->codecs[DS_FP], (char *)&i32, 1);
682
204k
            prev_pos = f->X.pos;
683
684
204k
            switch(f->X.code) {
685
                //char *seq;
686
687
6.24k
            case 'X':
688
                //fprintf(stderr, "    FC=%c FP=%d base=%d\n", f->X.code, i32, f->X.base);
689
690
6.24k
                uc = f->X.base;
691
6.24k
                r |= h->codecs[DS_BS]->encode(s, h->codecs[DS_BS],
692
6.24k
                                              (char *)&uc, 1);
693
6.24k
                break;
694
67.4k
            case 'S':
695
                // Already done
696
                //r |= h->codecs[DS_SC]->encode(s, h->codecs[DS_SC],
697
                //                              BLOCK_DATA(s->soft_blk) + f->S.seq_idx,
698
                //                              f->S.len);
699
700
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
701
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
702
                //                                  BLOCK_DATA(s->seqs_blk) + f->S.seq_idx,
703
                //                                  f->S.len);
704
                //}
705
67.4k
                break;
706
418
            case 'I':
707
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
708
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
709
                //                           seq, f->S.len);
710
                //if (CRAM_MAJOR_VERS(fd->version) >= 3) {
711
                //    r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
712
                //                                  BLOCK_DATA(s->seqs_blk) + f->I.seq_idx,
713
                //                                  f->I.len);
714
                //}
715
418
                break;
716
133
            case 'i':
717
133
                uc = f->i.base;
718
133
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
719
133
                                              (char *)&uc, 1);
720
                //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx;
721
                //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN],
722
                //                           seq, 1);
723
133
                break;
724
84.0k
            case 'D':
725
84.0k
                i32 = f->D.len;
726
84.0k
                r |= h->codecs[DS_DL]->encode(s, h->codecs[DS_DL],
727
84.0k
                                              (char *)&i32, 1);
728
84.0k
                break;
729
730
2.15k
            case 'B':
731
                //                  // Used when we try to store a non ACGTN base or an N
732
                //                  // that aligns against a non ACGTN reference
733
734
2.15k
                uc  = f->B.base;
735
2.15k
                r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA],
736
2.15k
                                              (char *)&uc, 1);
737
738
                //                  Already added
739
                //                  uc  = f->B.qual;
740
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
741
                //                                           (char *)&uc, 1);
742
2.15k
                break;
743
744
1.10k
            case 'b':
745
                // string of bases
746
1.10k
                r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB],
747
1.10k
                                              (char *)BLOCK_DATA(s->seqs_blk)
748
1.10k
                                                      + f->b.seq_idx,
749
1.10k
                                              f->b.len);
750
1.10k
                break;
751
752
63
            case 'Q':
753
                //                  Already added
754
                //                  uc  = f->B.qual;
755
                //                  r |= h->codecs[DS_QS]->encode(s, h->codecs[DS_QS],
756
                //                                           (char *)&uc, 1);
757
63
                break;
758
759
258
            case 'N':
760
258
                i32 = f->N.len;
761
258
                r |= h->codecs[DS_RS]->encode(s, h->codecs[DS_RS],
762
258
                                              (char *)&i32, 1);
763
258
                break;
764
765
225
            case 'P':
766
225
                i32 = f->P.len;
767
225
                r |= h->codecs[DS_PD]->encode(s, h->codecs[DS_PD],
768
225
                                              (char *)&i32, 1);
769
225
                break;
770
771
42.8k
            case 'H':
772
42.8k
                i32 = f->H.len;
773
42.8k
                r |= h->codecs[DS_HC]->encode(s, h->codecs[DS_HC],
774
42.8k
                                              (char *)&i32, 1);
775
42.8k
                break;
776
777
778
0
            default:
779
0
                hts_log_error("Unhandled feature code %c", f->X.code);
780
0
                return -1;
781
204k
            }
782
204k
        }
783
784
101k
        r |= h->codecs[DS_MQ]->encode(s, h->codecs[DS_MQ],
785
101k
                                      (char *)&cr->mqual, 1);
786
6.37M
    } else {
787
6.37M
        char *seq = (char *)BLOCK_DATA(s->seqs_blk) + cr->seq;
788
6.37M
        if (cr->len)
789
208k
            r |= h->codecs[DS_BA]->encode(s, h->codecs[DS_BA], seq, cr->len);
790
6.37M
    }
791
792
6.47M
    return r ? -1 : 0;
793
6.47M
}
794
795
796
/*
797
 * Applies various compression methods to specific blocks, depending on
798
 * known observations of how data series compress.
799
 *
800
 * Returns 0 on success
801
 *        -1 on failure
802
 */
803
52.9k
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) {
804
52.9k
    int level = fd->level, i;
805
52.9k
    int method = 1<<GZIP | 1<<GZIP_RLE, methodF = method;
806
52.9k
    int v31_or_above = (fd->version >= (3<<8)+1);
807
808
    /* Compress the CORE Block too, with minimal zlib level */
809
52.9k
    if (level > 5 && s->block[0]->uncomp_size > 500)
810
0
        cram_compress_block2(fd, s, s->block[0], NULL, 1<<GZIP, 1);
811
812
52.9k
    if (fd->use_bz2)
813
0
        method |= 1<<BZIP2;
814
815
52.9k
    int method_rans   = (1<<RANS0) | (1<<RANS1);
816
52.9k
    int method_ranspr = method_rans;
817
818
52.9k
    if (fd->use_rans) {
819
52.9k
        method_ranspr = (1<<RANS_PR0)   | (1<<RANS_PR1);
820
52.9k
        if (level > 1)
821
52.9k
            method_ranspr |=
822
52.9k
                  (1<<RANS_PR64)  | (1<<RANS_PR9)
823
52.9k
                | (1<<RANS_PR128) | (1<<RANS_PR193);
824
52.9k
        if (level > 5)
825
0
            method_ranspr |= (1<<RANS_PR129) | (1<<RANS_PR192);
826
52.9k
    }
827
828
52.9k
    if (fd->use_rans) {
829
52.9k
        methodF |= v31_or_above ? method_ranspr : method_rans;
830
52.9k
        method  |= v31_or_above ? method_ranspr : method_rans;
831
52.9k
    }
832
833
52.9k
    int method_arith   = 0;
834
52.9k
    if (fd->use_arith) {
835
0
        method_arith = (1<<ARITH_PR0)   | (1<<ARITH_PR1);
836
0
        if (level > 1)
837
0
            method_arith |=
838
0
                  (1<<ARITH_PR64)  | (1<<ARITH_PR9)
839
0
                | (1<<ARITH_PR128) | (1<<ARITH_PR129)
840
0
                | (1<<ARITH_PR192) | (1u<<ARITH_PR193);
841
0
    }
842
52.9k
    if (fd->use_arith && v31_or_above) {
843
0
        methodF |= method_arith;
844
0
        method  |= method_arith;
845
0
    }
846
847
52.9k
    if (fd->use_lzma)
848
0
        method |= (1<<LZMA);
849
850
    /* Faster method for data series we only need entropy encoding on */
851
52.9k
    methodF = method & ~(1<<GZIP | 1<<BZIP2 | 1<<LZMA);
852
52.9k
    if (level >= 5) {
853
52.9k
        method |= 1<<GZIP_1;
854
52.9k
        methodF = method;
855
52.9k
    }
856
52.9k
    if (level == 1) {
857
0
        method &= ~(1<<GZIP);
858
0
        method |=   1<<GZIP_1;
859
0
        methodF = method;
860
0
    }
861
862
52.9k
    int qmethod  = method;
863
52.9k
    int qmethodF = method;
864
52.9k
    if (v31_or_above && fd->use_fqz) {
865
0
        qmethod  |= 1<<FQZ;
866
0
        qmethodF |= 1<<FQZ;
867
0
        if (fd->level > 4) {
868
0
            qmethod  |= 1<<FQZ_b;
869
0
            qmethodF |= 1<<FQZ_b;
870
0
        }
871
0
        if (fd->level > 6) {
872
0
            qmethod  |= (1<<FQZ_c) | (1<<FQZ_d);
873
0
            qmethodF |= (1<<FQZ_c) | (1<<FQZ_d);
874
0
        }
875
0
    }
876
877
52.9k
    pthread_mutex_lock(&fd->metrics_lock);
878
2.54M
    for (i = 0; i < DS_END; i++)
879
2.49M
        if (c->stats[i] && c->stats[i]->nvals > 16)
880
237
            fd->m[i]->unpackable = 1;
881
52.9k
    pthread_mutex_unlock(&fd->metrics_lock);
882
883
    /* Specific compression methods for certain block types */
884
52.9k
    if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq)
885
52.9k
                             method, level))
886
0
        return -1;
887
888
52.9k
    if (fd->level == 0) {
889
        /* Do nothing */
890
52.9k
    } else if (fd->level == 1) {
891
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
892
0
                                 qmethodF, 1))
893
0
            return -1;
894
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
895
0
            if (s->block[i])
896
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
897
0
                                         method, 1))
898
0
                    return -1;
899
0
        }
900
52.9k
    } else if (fd->level < 3) {
901
0
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
902
0
                                 qmethod, 1))
903
0
            return -1;
904
0
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
905
0
                                 method, 1))
906
0
            return -1;
907
0
        if (s->block[DS_BB])
908
0
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
909
0
                                     method, 1))
910
0
                return -1;
911
0
        for (i = DS_aux; i <= DS_aux_oz; i++) {
912
0
            if (s->block[i])
913
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
914
0
                                         method, level))
915
0
                    return -1;
916
0
        }
917
52.9k
    } else {
918
52.9k
        if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS],
919
52.9k
                                 qmethod, level))
920
0
            return -1;
921
52.9k
        if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA],
922
52.9k
                                 method, level))
923
0
            return -1;
924
52.9k
        if (s->block[DS_BB])
925
52.9k
            if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB],
926
52.9k
                                     method, level))
927
0
                return -1;
928
529k
        for (i = DS_aux; i <= DS_aux_oz; i++) {
929
476k
            if (s->block[i])
930
0
                if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
931
0
                                         method, level))
932
0
                    return -1;
933
476k
        }
934
52.9k
    }
935
936
    // NAME: best is generally xz, bzip2, zlib then rans1
937
52.9k
    int method_rn = method & ~(method_rans | method_ranspr | 1<<GZIP_RLE);
938
52.9k
    if (fd->version >= (3<<8)+1 && fd->use_tok)
939
52.9k
        method_rn |= fd->use_arith ? (1<<TOKA) : (1<<TOK3);
940
52.9k
    if (cram_compress_block2(fd, s, s->block[DS_RN], fd->m[DS_RN],
941
52.9k
                             method_rn, level))
942
0
        return -1;
943
944
    // NS shows strong local correlation as rearrangements are localised
945
52.9k
    if (s->block[DS_NS] && s->block[DS_NS] != s->block[0])
946
275
        if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS],
947
275
                                 method, level))
948
0
            return -1;
949
950
951
    /*
952
     * Compress any auxiliary tags with their own per-tag metrics
953
     */
954
52.9k
    {
955
52.9k
        int i;
956
291k
        for (i = DS_END /*num_blk - naux_blk*/; i < s->hdr->num_blocks; i++) {
957
238k
            if (!s->block[i] || s->block[i] == s->block[0])
958
0
                continue;
959
960
238k
            if (s->block[i]->method != RAW)
961
0
                continue;
962
963
238k
            if (cram_compress_block2(fd, s, s->block[i], s->block[i]->m,
964
238k
                                     method, level))
965
0
                return -1;
966
238k
        }
967
52.9k
    }
968
969
    /*
970
     * Minimal compression of any block still uncompressed, bar CORE
971
     */
972
52.9k
    {
973
52.9k
        int i;
974
2.49M
        for (i = 1; i < s->hdr->num_blocks && i < DS_END; i++) {
975
2.43M
            if (!s->block[i] || s->block[i] == s->block[0])
976
2.05M
                continue;
977
978
383k
            if (s->block[i]->method != RAW)
979
13.5k
                continue;
980
981
369k
            if (cram_compress_block2(fd, s, s->block[i], fd->m[i],
982
369k
                                    methodF, level))
983
0
                return -1;
984
369k
        }
985
52.9k
    }
986
987
52.9k
    return 0;
988
52.9k
}
989
990
/*
991
 * Allocates a block associated with the cram codec associated with
992
 * data series ds_id or the internal codec_id (depending on codec
993
 * type).
994
 *
995
 * The ds_ids are what end up written to disk as an external block.
996
 * The c_ids are internal and used when daisy-chaining transforms
997
 * such as MAP and RLE.  These blocks are also allocated, but
998
 * are ephemeral in nature.  (The codecs themselves cannot allocate
999
 * these as the same codec pointer may be operating on multiple slices
1000
 * if we're using a multi-slice container.)
1001
 *
1002
 * Returns 0 on success
1003
 *        -1 on failure
1004
 */
1005
1.53M
static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) {
1006
1.53M
    if (!codec)
1007
486k
        return 0;
1008
1009
1.05M
    switch(codec->codec) {
1010
    // Codecs which are hard-coded to use the CORE block
1011
0
    case E_GOLOMB:
1012
690k
    case E_HUFFMAN:
1013
691k
    case E_BETA:
1014
691k
    case E_SUBEXP:
1015
691k
    case E_GOLOMB_RICE:
1016
691k
    case E_GAMMA:
1017
691k
        codec->out = s->block[0];
1018
691k
        break;
1019
1020
    // Codecs which don't use external blocks
1021
0
    case E_CONST_BYTE:
1022
0
    case E_CONST_INT:
1023
0
       codec->out = NULL;
1024
0
       break;
1025
1026
    // Codecs that emit directly to external blocks
1027
199k
    case E_EXTERNAL:
1028
199k
    case E_VARINT_UNSIGNED:
1029
199k
    case E_VARINT_SIGNED:
1030
199k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1031
0
            return -1;
1032
199k
        codec->u.external.content_id = ds_id;
1033
199k
        codec->out = s->block[ds_id];
1034
199k
        break;
1035
1036
105k
    case E_BYTE_ARRAY_STOP: // Why no sub-codec?
1037
105k
        if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id)))
1038
0
            return -1;
1039
105k
        codec->u.byte_array_stop.content_id = ds_id;
1040
105k
        codec->out = s->block[ds_id];
1041
105k
        break;
1042
1043
1044
    // Codecs that contain sub-codecs which may in turn emit to external blocks
1045
52.9k
    case E_BYTE_ARRAY_LEN: {
1046
52.9k
        cram_codec *bal = codec->u.e_byte_array_len.len_codec;
1047
52.9k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1048
0
            return -1;
1049
52.9k
        bal = codec->u.e_byte_array_len.val_codec;
1050
52.9k
        if (cram_allocate_block(bal, s, bal->u.external.content_id))
1051
0
            return -1;
1052
1053
52.9k
        break;
1054
52.9k
    }
1055
1056
52.9k
    case E_XRLE:
1057
0
        if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id))
1058
                                //ds_id == DS_QS ? DS_QS_len : ds_id))
1059
0
            return -1;
1060
0
        if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id))
1061
0
            return -1;
1062
1063
0
        break;
1064
1065
0
    case E_XPACK:
1066
0
        if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id))
1067
0
            return -1;
1068
0
        codec->out = cram_new_block(0, 0); // ephemeral
1069
0
        if (!codec->out)
1070
0
            return -1;
1071
1072
0
        break;
1073
1074
0
    case E_XDELTA:
1075
0
        if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id))
1076
0
            return -1;
1077
0
        codec->out = cram_new_block(0, 0); // ephemeral
1078
0
        if (!codec->out)
1079
0
            return -1;
1080
1081
0
        break;
1082
1083
0
    default:
1084
0
        break;
1085
1.05M
    }
1086
1087
1.05M
    return 0;
1088
1.05M
}
1089
1090
/*
1091
 * Encodes a single slice from a container
1092
 *
1093
 * Returns 0 on success
1094
 *        -1 on failure
1095
 */
1096
static int cram_encode_slice(cram_fd *fd, cram_container *c,
1097
                             cram_block_compression_hdr *h, cram_slice *s,
1098
52.9k
                             int embed_ref) {
1099
52.9k
    int rec, r = 0;
1100
52.9k
    int64_t last_pos;
1101
52.9k
    enum cram_DS_ID id;
1102
1103
    /*
1104
     * Slice external blocks:
1105
     * ID 0 => base calls (insertions, soft-clip)
1106
     * ID 1 => qualities
1107
     * ID 2 => names
1108
     * ID 3 => TS (insert size), NP (next frag)
1109
     * ID 4 => tag values
1110
     * ID 6 => tag IDs (TN), if CRAM_V1.0
1111
     * ID 7 => TD tag dictionary, if !CRAM_V1.0
1112
     */
1113
1114
    /* Create cram slice header */
1115
52.9k
    s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0
1116
52.9k
        ? DS_ref
1117
52.9k
        : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1);
1118
52.9k
    s->hdr->record_counter = c->num_records + c->record_counter;
1119
52.9k
    c->num_records += s->hdr->num_records;
1120
1121
52.9k
    int ntags = c->tags_used ? c->tags_used->n_occupied : 0;
1122
52.9k
    s->block = calloc(DS_END + ntags*2, sizeof(s->block[0]));
1123
52.9k
    s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t));
1124
52.9k
    if (!s->block || !s->hdr->block_content_ids)
1125
0
        return -1;
1126
1127
    // Create first fixed blocks, always external.
1128
    // CORE
1129
52.9k
    if (!(s->block[0] = cram_new_block(CORE, 0)))
1130
0
        return -1;
1131
1132
    // TN block for CRAM v1
1133
52.9k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
1134
0
        if (h->codecs[DS_TN]->codec == E_EXTERNAL) {
1135
0
            if (!(s->block[DS_TN] = cram_new_block(EXTERNAL,DS_TN))) return -1;
1136
0
            h->codecs[DS_TN]->u.external.content_id = DS_TN;
1137
0
        } else {
1138
0
            s->block[DS_TN] = s->block[0];
1139
0
        }
1140
0
    }
1141
1142
    // Embedded reference
1143
52.9k
    if (embed_ref>0) {
1144
24.8k
        if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref)))
1145
0
            return -1;
1146
24.8k
        s->ref_id = DS_ref; // needed?
1147
24.8k
        BLOCK_APPEND(s->block[DS_ref],
1148
24.8k
                     c->ref + s->hdr->ref_seq_start - c->ref_start,
1149
24.8k
                     s->hdr->ref_seq_span);
1150
24.8k
    }
1151
1152
    /*
1153
     * All the data-series blocks if appropriate.
1154
     */
1155
1.48M
    for (id = DS_QS; id < DS_TN; id++) {
1156
1.43M
        if (cram_allocate_block(h->codecs[id], s, id) < 0)
1157
0
            return -1;
1158
1.43M
    }
1159
1160
    /*
1161
     * Add in the external tag blocks too.
1162
     */
1163
52.9k
    if (c->tags_used) {
1164
52.9k
        int n;
1165
52.9k
        s->hdr->num_blocks = DS_END;
1166
291k
        for (n = 0; n < s->naux_block; n++) {
1167
238k
            s->block[s->hdr->num_blocks++] = s->aux_block[n];
1168
238k
            s->aux_block[n] = NULL;
1169
238k
        }
1170
52.9k
    }
1171
1172
    /* Encode reads */
1173
52.9k
    last_pos = s->hdr->ref_seq_start;
1174
6.52M
    for (rec = 0; rec < s->hdr->num_records; rec++) {
1175
6.47M
        cram_record *cr = &s->crecs[rec];
1176
6.47M
        if (cram_encode_slice_read(fd, c, h, s, cr, &last_pos) == -1)
1177
0
            return -1;
1178
6.47M
    }
1179
1180
52.9k
    s->block[0]->uncomp_size = s->block[0]->byte + (s->block[0]->bit < 7);
1181
52.9k
    s->block[0]->comp_size = s->block[0]->uncomp_size;
1182
1183
    // Make sure the fixed blocks point to the correct sources
1184
52.9k
    if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]);
1185
52.9k
    s->block[DS_IN] = s->base_blk; s->base_blk = NULL;
1186
52.9k
    if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]);
1187
52.9k
    s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL;
1188
52.9k
    if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]);
1189
52.9k
    s->block[DS_RN] = s->name_blk; s->name_blk = NULL;
1190
52.9k
    if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]);
1191
52.9k
    s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL;
1192
1193
    // Finalise any data transforms.
1194
1.48M
    for (id = DS_QS; id < DS_TN; id++) {
1195
1.43M
       if (h->codecs[id] && h->codecs[id]->flush)
1196
0
           h->codecs[id]->flush(h->codecs[id]);
1197
1.43M
    }
1198
1199
    // Ensure block sizes are up to date.
1200
2.72M
    for (id = 1; id < s->hdr->num_blocks; id++) {
1201
2.67M
        if (!s->block[id] || s->block[id] == s->block[0])
1202
2.05M
            continue;
1203
1204
622k
        if (s->block[id]->uncomp_size == 0)
1205
622k
            BLOCK_UPLEN(s->block[id]);
1206
622k
    }
1207
1208
    // Compress it all
1209
52.9k
    if (cram_compress_slice(fd, c, s) == -1)
1210
0
        return -1;
1211
1212
    // Collapse empty blocks and create hdr_block
1213
52.9k
    {
1214
52.9k
        int i, j;
1215
1216
52.9k
        s->hdr->block_content_ids = realloc(s->hdr->block_content_ids,
1217
52.9k
                                            s->hdr->num_blocks * sizeof(int32_t));
1218
52.9k
        if (!s->hdr->block_content_ids)
1219
0
            return -1;
1220
1221
2.72M
        for (i = j = 1; i < s->hdr->num_blocks; i++) {
1222
2.67M
            if (!s->block[i] || s->block[i] == s->block[0])
1223
2.05M
                continue;
1224
622k
            if (s->block[i]->uncomp_size == 0) {
1225
248k
                cram_free_block(s->block[i]);
1226
248k
                s->block[i] = NULL;
1227
248k
                continue;
1228
248k
            }
1229
373k
            s->block[j] = s->block[i];
1230
373k
            s->hdr->block_content_ids[j-1] = s->block[i]->content_id;
1231
373k
            j++;
1232
373k
        }
1233
52.9k
        s->hdr->num_content_ids = j-1;
1234
52.9k
        s->hdr->num_blocks = j;
1235
1236
52.9k
        if (!(s->hdr_block = cram_encode_slice_header(fd, s)))
1237
194
            return -1;
1238
52.9k
    }
1239
1240
52.7k
    return r ? -1 : 0;
1241
1242
0
 block_err:
1243
0
    return -1;
1244
52.9k
}
1245
1246
6.47M
static inline const char *bam_data_end(bam1_t *b) {
1247
6.47M
    return (const char *)b->data + b->l_data;
1248
6.47M
}
1249
1250
/*
1251
 * A bounds checking version of bam_aux2i.
1252
 */
1253
1
static inline int bam_aux2i_end(const uint8_t *aux, const uint8_t *aux_end) {
1254
1
    int type = *aux++;
1255
1
    switch (type) {
1256
0
        case 'c':
1257
0
            if (aux_end - aux < 1) {
1258
0
                errno = EINVAL;
1259
0
                return 0;
1260
0
            }
1261
0
            return *(int8_t *)aux;
1262
0
        case 'C':
1263
0
            if (aux_end - aux < 1) {
1264
0
                errno = EINVAL;
1265
0
                return 0;
1266
0
            }
1267
0
            return *aux;
1268
0
        case 's':
1269
0
            if (aux_end - aux < 2) {
1270
0
                errno = EINVAL;
1271
0
                return 0;
1272
0
            }
1273
0
            return le_to_i16(aux);
1274
0
        case 'S':
1275
0
            if (aux_end - aux < 2) {
1276
0
                errno = EINVAL;
1277
0
                return 0;
1278
0
            }
1279
0
            return le_to_u16(aux);
1280
0
        case 'i':
1281
0
            if (aux_end - aux < 4) {
1282
0
                errno = EINVAL;
1283
0
                return 0;
1284
0
            }
1285
0
            return le_to_i32(aux);
1286
0
        case 'I':
1287
0
            if (aux_end - aux < 4) {
1288
0
                errno = EINVAL;
1289
0
                return 0;
1290
0
            }
1291
0
            return le_to_u32(aux);
1292
1
        default:
1293
1
            errno = EINVAL;
1294
1
    }
1295
1
    return 0;
1296
1
}
1297
1298
/*
1299
 * Returns the number of expected read names for this record.
1300
 */
1301
0
static int expected_template_count(bam_seq_t *b) {
1302
0
    int expected = bam_flag(b) & BAM_FPAIRED ? 2 : 1;
1303
1304
0
    uint8_t *TC = (uint8_t *)bam_aux_get(b, "TC");
1305
0
    if (TC) {
1306
0
        int n = bam_aux2i_end(TC, (uint8_t *)bam_data_end(b));
1307
0
        if (expected < n)
1308
0
            expected = n;
1309
0
    }
1310
1311
0
    if (!TC && bam_aux_get(b, "SA")) {
1312
        // We could count the semicolons, but we'd have to do this for
1313
        // read1, read2 and read(not-1-or-2) combining the results
1314
        // together.  This is a cheap and safe alternative for now.
1315
0
        expected = INT_MAX;
1316
0
    }
1317
1318
0
    return expected;
1319
0
}
1320
1321
/*
1322
 * Lossily reject read names.
1323
 *
1324
 * The rule here is that if all reads for this template reside in the
1325
 * same slice then we can lose the name.  Otherwise we keep them as we
1326
 * do not know when (or if) the other reads will turn up.
1327
 *
1328
 * Note there may be only 1 read (non-paired library) or more than 2
1329
 * reads (paired library with supplementary reads), or other weird
1330
 * setups.  We need to know how many are expected.  Ways to guess:
1331
 *
1332
 * - Flags (0x1 - has > 1 read)
1333
 * - TC aux field (not mandatory)
1334
 * - SA tags (count semicolons, NB per fragment so sum - hard)
1335
 * - RNEXT/PNEXT uniqueness count. (not implemented, tricky)
1336
 *
1337
 * Returns 0 on success
1338
 *        -1 on failure
1339
 */
1340
static int lossy_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1341
54.4k
                            int bam_start) {
1342
54.4k
    int r1, r2, ret = -1;
1343
1344
    // Initialise cram_flags
1345
6.53M
    for (r2 = 0; r2 < s->hdr->num_records; r2++)
1346
6.48M
        s->crecs[r2].cram_flags = 0;
1347
1348
54.4k
    if (!fd->lossy_read_names)
1349
54.4k
        return 0;
1350
1351
0
    khash_t(m_s2u64) *names = kh_init(m_s2u64);
1352
0
    if (!names)
1353
0
        goto fail;
1354
1355
    // 1: Iterate through names to count frequency
1356
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1357
        //cram_record *cr = &s->crecs[r2];
1358
0
        bam_seq_t *b = c->bams[r1];
1359
0
        khint_t k;
1360
0
        int n;
1361
0
        uint64_t e;
1362
0
        union {
1363
0
            uint64_t i64;
1364
0
            struct {
1365
0
                int32_t e,c; // expected & observed counts.
1366
0
            } counts;
1367
0
        } u;
1368
1369
0
        e = expected_template_count(b);
1370
0
        u.counts.e = e; u.counts.c = 1;
1371
1372
0
        k = kh_put(m_s2u64, names, bam_name(b), &n);
1373
0
        if (n == -1)
1374
0
            goto fail;
1375
1376
0
        if (n == 0) {
1377
            // not a new name
1378
0
            u.i64 = kh_val(names, k);
1379
0
            if (u.counts.e != e) {
1380
                // different expectation or already hit the max
1381
                //fprintf(stderr, "Err computing no. %s recs\n", bam_name(b));
1382
0
                kh_val(names, k) = 0;
1383
0
            } else {
1384
0
                u.counts.c++;
1385
0
                if (u.counts.e == u.counts.c) {
1386
                    // Reached expected count.
1387
0
                    kh_val(names, k) = -1;
1388
0
                } else {
1389
0
                    kh_val(names, k) = u.i64;
1390
0
                }
1391
0
            }
1392
0
        } else {
1393
            // new name
1394
0
            kh_val(names, k) = u.i64;
1395
0
        }
1396
0
    }
1397
1398
    // 2: Remove names if all present (hd.i == -1)
1399
0
    for (r1 = bam_start, r2 = 0; r2 < s->hdr->num_records; r1++, r2++) {
1400
0
        cram_record *cr = &s->crecs[r2];
1401
0
        bam_seq_t *b = c->bams[r1];
1402
0
        khint_t k;
1403
1404
0
        k = kh_get(m_s2u64, names, bam_name(b));
1405
1406
0
        if (k == kh_end(names))
1407
0
            goto fail;
1408
1409
0
        if (kh_val(names, k) == -1)
1410
0
            cr->cram_flags = CRAM_FLAG_DISCARD_NAME;
1411
0
    }
1412
1413
0
    ret = 0;
1414
0
 fail: // ret==-1
1415
1416
0
    if (names)
1417
0
        kh_destroy(m_s2u64, names);
1418
1419
0
    return ret;
1420
0
}
1421
1422
/*
1423
 * Adds the reading names.  We do this here as a separate pass rather
1424
 * than per record in the process_one_read calls as that function can
1425
 * go back and change the CRAM_FLAG_DETACHED status of a previously
1426
 * processed read if it subsequently determines the TLEN field is
1427
 * incorrect.  Given DETACHED reads always try to decode read names,
1428
 * we need to know their status before generating the read-name block.
1429
 *
1430
 * Output is an update s->name_blk, and cr->name / cr->name_len
1431
 * fields.
1432
 */
1433
static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s,
1434
53.0k
                          int bam_start) {
1435
53.0k
    int r1, r2;
1436
53.0k
    int keep_names = !fd->lossy_read_names;
1437
1438
53.0k
    for (r1 = bam_start, r2 = 0;
1439
6.53M
         r1 < c->curr_c_rec && r2 < s->hdr->num_records;
1440
6.47M
         r1++, r2++) {
1441
6.47M
        cram_record *cr = &s->crecs[r2];
1442
6.47M
        bam_seq_t *b = c->bams[r1];
1443
1444
6.47M
        cr->name        = BLOCK_SIZE(s->name_blk);
1445
6.47M
        if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) {
1446
6.47M
            if (CRAM_MAJOR_VERS(fd->version) >= 4
1447
0
                && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM)
1448
0
                && cr->mate_line) {
1449
                // Dedup read names in V4
1450
0
                BLOCK_APPEND(s->name_blk, "\0", 1);
1451
0
                cr->name_len    = 1;
1452
6.47M
            } else {
1453
6.47M
                BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b));
1454
6.47M
                cr->name_len    = bam_name_len(b);
1455
6.47M
            }
1456
6.47M
        } else {
1457
            // Can only discard duplicate names if not detached
1458
0
            cr->name_len = 0;
1459
0
        }
1460
1461
6.47M
        if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0)
1462
0
            goto block_err;
1463
6.47M
    }
1464
1465
53.0k
    return 0;
1466
1467
0
 block_err:
1468
0
    return -1;
1469
53.0k
}
1470
1471
// CRAM version >= 3.1
1472
80.1k
#define CRAM_ge31(v) ((v) >= 0x301)
1473
1474
// Returns the next cigar op code: one of the BAM_C* codes,
1475
// or -1 if no more are present.
1476
static inline
1477
int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos,
1478
85.1k
                  uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) {
1479
102k
    for(;;) {
1480
166k
        while (*cig_len == 0) {
1481
63.9k
            if (*cig_ind < ncigar) {
1482
63.7k
                *cig_op  = cigar[*cig_ind] & BAM_CIGAR_MASK;
1483
63.7k
                *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT;
1484
63.7k
                (*cig_ind)++;
1485
63.7k
            } else {
1486
149
                return -1;
1487
149
            }
1488
63.9k
        }
1489
1490
102k
        if (skip[*cig_op]) {
1491
17.7k
            *spos += (bam_cigar_type(*cig_op)&1) * *cig_len;
1492
17.7k
            *cig_len = 0;
1493
17.7k
            continue;
1494
17.7k
        }
1495
1496
84.9k
        (*cig_len)--;
1497
84.9k
        break;
1498
102k
    }
1499
1500
84.9k
    return *cig_op;
1501
85.1k
}
1502
1503
// Ensure ref and hist are large enough.
1504
static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos,
1505
1.73M
                             hts_pos_t ref_start, hts_pos_t *ref_end) {
1506
1.73M
    if (pos < ref_start)
1507
261
        return -1;
1508
1.73M
    if (pos < *ref_end)
1509
1.70M
        return 0;
1510
1511
    // realloc
1512
29.4k
    if (pos - ref_start > UINT_MAX)
1513
78
        return -2; // protect overflow in new_end calculation
1514
1515
29.3k
    hts_pos_t old_end = *ref_end ? *ref_end : ref_start;
1516
29.3k
    hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5;
1517
1518
    // Refuse to work on excessively large blocks.
1519
    // We'll just switch to referenceless encoding, which is probably better
1520
    // here as this must be very sparse data anyway.
1521
29.3k
    if (new_end - ref_start > UINT_MAX/sizeof(**hist)/2)
1522
781
        return -2;
1523
1524
28.6k
    char *tmp = realloc(*ref, new_end-ref_start+1);
1525
28.6k
    if (!tmp)
1526
0
        return -1;
1527
28.6k
    *ref = tmp;
1528
1529
28.6k
    uint32_t (*tmp5)[5] = realloc(**hist,
1530
28.6k
                                  (new_end - ref_start)*sizeof(**hist));
1531
28.6k
    if (!tmp5)
1532
0
        return -1;
1533
28.6k
    *hist = tmp5;
1534
28.6k
    *ref_end = new_end;
1535
1536
    // initialise
1537
28.6k
    old_end -= ref_start;
1538
28.6k
    new_end -= ref_start;
1539
28.6k
    memset(&(*ref)[old_end],  0,  new_end-old_end);
1540
28.6k
    memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist));
1541
1542
28.6k
    return 0;
1543
28.6k
}
1544
1545
// Walk through MD + seq to generate ref
1546
// Returns 1 on success, <0 on failure
1547
static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5],
1548
                              hts_pos_t ref_start, hts_pos_t *ref_end,
1549
51.0k
                              const uint8_t *MD) {
1550
51.0k
    uint8_t *seq = bam_get_seq(b);
1551
51.0k
    uint32_t *cigar = bam_get_cigar(b);
1552
51.0k
    uint32_t ncigar = b->core.n_cigar;
1553
51.0k
    uint32_t cig_op = 0, cig_len = 0, cig_ind = 0;
1554
1555
    // End position of the sequence on the reference.
1556
51.0k
    hts_pos_t rlen = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1557
51.0k
    hts_pos_t rseq_end = b->core.pos + (rlen ? rlen : b->core.l_qseq);
1558
1559
    // No sequence means extend based on CIGAR instead
1560
51.0k
    if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end,
1561
22.6k
                                      ref_start, ref_end) < 0)
1562
14
        return -1;
1563
1564
51.0k
    int iseq = 0, next_op;
1565
51.0k
    hts_pos_t iref = b->core.pos - ref_start;
1566
1567
    // Skip INS, REF_SKIP, *CLIP, PAD. and BACK.
1568
51.0k
    static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1};
1569
63.5k
    while (iseq < b->core.l_qseq && *MD) {
1570
32.4k
        if (isdigit(*MD)) {
1571
            // match
1572
22.4k
            int overflow = 0;
1573
22.4k
            int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow);
1574
22.4k
            if (overflow ||
1575
17.3k
                extend_ref(ref, hist, iref+ref_start + len,
1576
17.3k
                           ref_start, ref_end) < 0)
1577
5.28k
                return -1;
1578
17.2k
            while (iseq < b->core.l_qseq && len) {
1579
                // rewrite to have internal loops?
1580
13.8k
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1581
13.8k
                                             &iseq, &cig_ind, &cig_op,
1582
13.8k
                                             &cig_len)) < 0)
1583
13
                    return -1;
1584
1585
13.8k
                if (next_op != BAM_CMATCH &&
1586
13.8k
                    next_op != BAM_CEQUAL) {
1587
13.8k
                    hts_log_info("MD:Z and CIGAR are incompatible for "
1588
13.8k
                                 "record %s", bam_get_qname(b));
1589
13.8k
                    return -1;
1590
13.8k
                }
1591
1592
                // Short-cut loop over same cigar op for efficiency
1593
9
                cig_len++;
1594
9
                do {
1595
9
                    cig_len--;
1596
9
                    (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)];
1597
9
                    iseq++;
1598
9
                    len--;
1599
9
                } while (cig_len && iseq < b->core.l_qseq && len);
1600
9
            }
1601
3.36k
            if (len > 0)
1602
6
                return -1; // MD is longer than seq
1603
9.95k
        } else if (*MD == '^') {
1604
            // deletion
1605
3.69k
            MD++;
1606
65.0k
            while (isalpha(*MD)) {
1607
65.0k
                if (extend_ref(ref, hist, iref+ref_start, ref_start,
1608
65.0k
                               ref_end) < 0)
1609
0
                    return -1;
1610
65.0k
                if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1611
65.0k
                                             &iseq, &cig_ind, &cig_op,
1612
65.0k
                                             &cig_len)) < 0)
1613
97
                    return -1;
1614
1615
64.9k
                if (next_op != BAM_CDEL) {
1616
7
                    hts_log_info("MD:Z and CIGAR are incompatible");
1617
7
                    return -1;
1618
7
                }
1619
1620
64.9k
                (*ref)[iref++] = *MD++ & ~0x20;
1621
64.9k
            }
1622
6.26k
        } else {
1623
            // substitution
1624
6.26k
            if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0)
1625
0
                return -1;
1626
6.26k
            if ((next_op = next_cigar_op(cigar, ncigar, cig_skip,
1627
6.26k
                                         &iseq, &cig_ind, &cig_op,
1628
6.26k
                                         &cig_len)) < 0)
1629
39
                return -1;
1630
1631
6.22k
            if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) {
1632
629
                hts_log_info("MD:Z and CIGAR are incompatible");
1633
629
                return -1;
1634
629
            }
1635
1636
5.59k
            (*ref)[iref++] = *MD++ & ~0x20;
1637
5.59k
            iseq++;
1638
5.59k
        }
1639
32.4k
    }
1640
1641
31.1k
    return 1;
1642
51.0k
}
1643
1644
// Append a sequence to a ref/consensus structure.
1645
// We maintain both an absolute refefence (ACGTN where MD:Z is
1646
// present) and a 5-way frequency array for when no MD:Z is known.
1647
// We then subsequently convert the 5-way frequencies to a consensus
1648
// ref in a second pass.
1649
//
1650
// Returns >=0 on success,
1651
//         -1 on failure (eg inconsistent data)
1652
static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5],
1653
95.6k
                           hts_pos_t ref_start, hts_pos_t *ref_end) {
1654
95.6k
    const uint8_t *MD = bam_aux_get(b, "MD");
1655
95.6k
    int ret = 0;
1656
95.6k
    if (MD && *MD == 'Z') {
1657
        // We can use MD to directly compute the reference
1658
51.0k
        int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1);
1659
1660
51.0k
        if (ret > 0)
1661
31.1k
            return ret;
1662
51.0k
    }
1663
1664
    // Otherwise we just use SEQ+CIGAR and build a consensus which we later
1665
    // turn into a fake reference
1666
64.4k
    uint32_t *cigar = bam_get_cigar(b);
1667
64.4k
    uint32_t ncigar = b->core.n_cigar;
1668
64.4k
    uint32_t i, j;
1669
64.4k
    hts_pos_t iseq = 0, iref = b->core.pos - ref_start;
1670
64.4k
    uint8_t *seq = bam_get_seq(b);
1671
1.89M
    for (i = 0; i < ncigar; i++) {
1672
1.83M
        switch (bam_cigar_op(cigar[i])) {
1673
80.8k
        case BAM_CSOFT_CLIP:
1674
106k
        case BAM_CINS:
1675
106k
            iseq += bam_cigar_oplen(cigar[i]);
1676
106k
            break;
1677
1678
1.57M
        case BAM_CMATCH:
1679
1.60M
        case BAM_CEQUAL:
1680
1.60M
        case BAM_CDIFF: {
1681
1.60M
            int len = bam_cigar_oplen(cigar[i]);
1682
            // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4
1683
1.60M
            static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4};
1684
1685
1.60M
            if (extend_ref(ref, hist, iref+ref_start + len,
1686
1.60M
                           ref_start, ref_end) < 0)
1687
606
                return -1;
1688
1.60M
            if (iseq + len <= b->core.l_qseq) {
1689
                // Nullify failed MD:Z if appropriate
1690
20.4k
                if (ret < 0)
1691
0
                    memset(&(*ref)[iref], 0, len);
1692
1693
29.3k
                for (j = 0; j < len; j++, iref++, iseq++)
1694
8.89k
                    (*hist)[iref][L16[bam_seqi(seq, iseq)]]++;
1695
1.58M
            } else {
1696
                // Probably a 2ndary read with seq "*"
1697
1.58M
                iseq += len;
1698
1.58M
                iref += len;
1699
1.58M
            }
1700
1.60M
            break;
1701
1.60M
        }
1702
1703
46.1k
        case BAM_CDEL:
1704
48.2k
        case BAM_CREF_SKIP:
1705
48.2k
            iref += bam_cigar_oplen(cigar[i]);
1706
1.83M
        }
1707
1.83M
    }
1708
1709
63.8k
    return 1;
1710
64.4k
}
1711
1712
// Automatically generates the reference and stashed it in c->ref, also
1713
// setting c->ref_start and c->ref_end.
1714
//
1715
// If we have MD:Z tags then we use them to directly infer the reference,
1716
// along with SEQ + CIGAR.  Otherwise we use SEQ/CIGAR only to build up
1717
// a consensus and then assume the reference as the majority rule.
1718
//
1719
// In this latter scenario we need to be wary of auto-generating MD and NM
1720
// during decode, but that's handled elsewhere via an additional aux tag.
1721
//
1722
// Returns 0 on success,
1723
//        -1 on failure
1724
26.0k
static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) {
1725
    // TODO: if we can find an external reference then use it, even if the
1726
    // user told us to do embed_ref=2.
1727
26.0k
    char *ref = NULL;
1728
26.0k
    uint32_t (*hist)[5] = NULL;
1729
26.0k
    hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0;
1730
26.0k
    if (ref_start < 0)
1731
4
        return -1; // cannot build consensus from unmapped data
1732
1733
    // initial allocation
1734
26.0k
    if (extend_ref(&ref, &hist,
1735
26.0k
                   c->bams[r1 + s->hdr->num_records-1]->core.pos +
1736
26.0k
                   c->bams[r1 + s->hdr->num_records-1]->core.l_qseq,
1737
26.0k
                   ref_start, &ref_end) < 0)
1738
358
        return -1;
1739
1740
    // Add each bam file to the reference/consensus arrays
1741
25.7k
    int r2;
1742
25.7k
    hts_pos_t last_pos = -1;
1743
120k
    for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
1744
95.7k
        if (c->bams[r1]->core.pos < last_pos) {
1745
104
            hts_log_error("Cannot build reference with unsorted data");
1746
104
            goto err;
1747
104
        }
1748
95.6k
        last_pos = c->bams[r1]->core.pos;
1749
95.6k
        if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0)
1750
606
            goto err;
1751
95.6k
    }
1752
1753
    // Compute the consensus
1754
25.0k
    hts_pos_t i;
1755
2.40G
    for (i = 0; i < ref_end-ref_start; i++) {
1756
2.40G
        if (!ref[i]) {
1757
2.40G
            int max_v = 0, max_j = 4, j;
1758
12.0G
            for (j = 0; j < 4; j++)
1759
                // don't call N (j==4) unless no coverage
1760
9.61G
                if (max_v < hist[i][j])
1761
1.68k
                    max_v = hist[i][j], max_j = j;
1762
2.40G
            ref[i] = "ACGTN"[max_j];
1763
2.40G
        }
1764
2.40G
    }
1765
25.0k
    free(hist);
1766
1767
    // Put the reference in place so it appears to be an external
1768
    // ref file.
1769
25.0k
    c->ref       = ref;
1770
25.0k
    c->ref_start = ref_start+1;
1771
25.0k
    c->ref_end   = ref_end+1;
1772
25.0k
    c->ref_free  = 1;
1773
25.0k
    return 0;
1774
1775
710
 err:
1776
710
    free(ref);
1777
710
    free(hist);
1778
710
    return -1;
1779
25.7k
}
1780
1781
// Check if the SQ M5 tag matches the reference we've loaded.
1782
0
static int validate_md5(cram_fd *fd, int ref_id) {
1783
0
    if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref)
1784
0
        return 0;
1785
1786
    // Have we already checked this ref?
1787
0
    if (fd->refs->ref_id[ref_id]->validated_md5)
1788
0
        return 0;
1789
1790
    // Check if we have the MD5 known.
1791
    // We should, but maybe we're using embedded references?
1792
0
    sam_hrecs_t *hrecs = fd->header->hrecs;
1793
0
    sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN",
1794
0
                                                 hrecs->ref[ref_id].name);
1795
0
    if (!ty)
1796
0
        return 0;
1797
1798
0
    sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL);
1799
0
    if (!m5tag)
1800
0
        return 0;
1801
1802
    // It's known, so compute md5 on the loaded reference sequence.
1803
0
    char *ref = fd->refs->ref_id[ref_id]->seq;
1804
0
    int64_t len = fd->refs->ref_id[ref_id]->length;
1805
0
    hts_md5_context *md5;
1806
0
    char unsigned buf[16];
1807
0
    char buf2[33];
1808
1809
0
    if (!(md5 = hts_md5_init()))
1810
0
        return -1;
1811
0
    hts_md5_update(md5, ref, len);
1812
0
    hts_md5_final(buf, md5);
1813
0
    hts_md5_destroy(md5);
1814
0
    hts_md5_hex(buf2, buf);
1815
1816
    // Compare it to header @SQ M5 tag
1817
0
    if (strcmp(m5tag->str+3, buf2)) {
1818
0
        hts_log_error("SQ header M5 tag discrepancy for reference '%s'",
1819
0
                      hrecs->ref[ref_id].name);
1820
0
        hts_log_error("Please use the correct reference, or "
1821
0
                      "consider using embed_ref=2");
1822
0
        return -1;
1823
0
    }
1824
0
    fd->refs->ref_id[ref_id]->validated_md5 = 1;
1825
1826
0
    return 0;
1827
0
}
1828
1829
/*
1830
 * Encodes all slices in a container into blocks.
1831
 * Returns 0 on success
1832
 *        -1 on failure
1833
 */
1834
53.4k
int cram_encode_container(cram_fd *fd, cram_container *c) {
1835
53.4k
    int i, j, slice_offset;
1836
53.4k
    cram_block_compression_hdr *h = c->comp_hdr;
1837
53.4k
    cram_block *c_hdr;
1838
53.4k
    int multi_ref = 0;
1839
53.4k
    int r1, r2, sn, nref, embed_ref, no_ref;
1840
53.4k
    spare_bams *spares;
1841
1842
53.4k
    if (!c->bams)
1843
0
        goto err;
1844
1845
53.4k
    if (CRAM_MAJOR_VERS(fd->version) == 1)
1846
0
        goto err;
1847
1848
//#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;}
1849
53.4k
#define goto_err goto err
1850
1851
    // Don't try embed ref if we repeatedly fail
1852
53.4k
    pthread_mutex_lock(&fd->ref_lock);
1853
53.4k
    int failed_embed = (fd->no_ref_counter >= 5); // maximum 5 tries
1854
53.4k
    if (!failed_embed && c->embed_ref == -2 && c->ref_id >= 0) {
1855
705
        hts_log_warning("Retrying embed_ref=2 mode for #%d/5", fd->no_ref_counter);
1856
705
        fd->no_ref = c->no_ref = 0;
1857
705
        fd->embed_ref = c->embed_ref = 2;
1858
52.6k
    } else if (failed_embed && c->embed_ref == -2) {
1859
        // We've tried several times, so this time give up for good
1860
22
        hts_log_warning("Keeping non-ref mode from now on");
1861
22
        fd->embed_ref = c->embed_ref = 0;
1862
22
    }
1863
53.4k
    pthread_mutex_unlock(&fd->ref_lock);
1864
1865
54.4k
 restart:
1866
    /* Cache references up-front if we have unsorted access patterns */
1867
54.4k
    pthread_mutex_lock(&fd->ref_lock);
1868
54.4k
    nref = fd->refs->nref;
1869
54.4k
    pthread_mutex_unlock(&fd->ref_lock);
1870
54.4k
    embed_ref = c->embed_ref;
1871
54.4k
    no_ref = c->no_ref;
1872
1873
    /* To create M5 strings */
1874
    /* Fetch reference sequence */
1875
54.4k
    if (!no_ref) {
1876
52.9k
        if (!c->bams || !c->curr_c_rec || !c->bams[0])
1877
45
            goto_err;
1878
52.9k
        bam_seq_t *b = c->bams[0];
1879
1880
52.9k
        if (embed_ref <= 1) {
1881
2.17k
            char *ref = cram_get_ref(fd, bam_ref(b), 1, 0);
1882
2.17k
            if (!ref && bam_ref(b) >= 0) {
1883
16
                if (!c->pos_sorted) {
1884
                    // TODO: maybe also check fd->no_ref?
1885
0
                    hts_log_warning("Failed to load reference #%d",
1886
0
                                    bam_ref(b));
1887
0
                    hts_log_warning("Switching to non-ref mode");
1888
1889
0
                    pthread_mutex_lock(&fd->ref_lock);
1890
0
                    c->embed_ref = fd->embed_ref = 0;
1891
0
                    c->no_ref = fd->no_ref = 1;
1892
0
                    pthread_mutex_unlock(&fd->ref_lock);
1893
0
                    goto restart;
1894
0
                }
1895
1896
16
                if (c->multi_seq || embed_ref == 0) {
1897
0
                    hts_log_error("Failed to load reference #%d", bam_ref(b));
1898
0
                    return -1;
1899
0
                }
1900
16
                hts_log_warning("Failed to load reference #%d", bam_ref(b));
1901
16
                hts_log_warning("Enabling embed_ref=2 mode to auto-generate"
1902
16
                                " reference");
1903
16
                if (embed_ref <= 0)
1904
16
                    hts_log_warning("NOTE: the CRAM file will be bigger than"
1905
16
                                    " using an external reference");
1906
16
                pthread_mutex_lock(&fd->ref_lock);
1907
16
                embed_ref = c->embed_ref = fd->embed_ref = 2;
1908
16
                pthread_mutex_unlock(&fd->ref_lock);
1909
16
                goto auto_ref;
1910
2.16k
            } else if (ref) {
1911
0
                if (validate_md5(fd, c->ref_seq_id) < 0)
1912
0
                    goto_err;
1913
0
            }
1914
2.16k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1915
0
                c->ref_seq_id = c->ref_id;
1916
0
                c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
1917
0
                c->ref_start = 1;
1918
0
                c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
1919
0
            }
1920
50.7k
        } else {
1921
50.7k
        auto_ref:
1922
            // Auto-embed ref.
1923
            // This starts as 'N' and is amended on-the-fly as we go
1924
            // based on MD:Z tags.
1925
50.7k
            if ((c->ref_id = bam_ref(b)) >= 0) {
1926
26.0k
                c->ref = NULL;
1927
                // c->ref_free is boolean; whether to free c->ref.  In this
1928
                // case c->ref will be our auto-embedded sequence instead of
1929
                // a "global" portion of reference from fd->refs.
1930
                // Do not confuse with fd->ref_free which is a pointer to a
1931
                // reference string to free.
1932
26.0k
                c->ref_free = 1;
1933
26.0k
            } else {
1934
                // Double check for broken input.  We shouldn't have
1935
                // embedded references enabled for unmapped data, but our
1936
                // data could be broken.
1937
24.7k
                embed_ref = 0;
1938
24.7k
                no_ref = c->no_ref = 1;
1939
24.7k
            }
1940
50.7k
        }
1941
52.9k
        c->ref_seq_id = c->ref_id;
1942
52.9k
    } else {
1943
1.48k
        c->ref_id = bam_ref(c->bams[0]);
1944
1.48k
        cram_ref_incr(fd->refs, c->ref_id);
1945
1.48k
        c->ref_seq_id = c->ref_id;
1946
1.48k
    }
1947
1948
54.4k
    if (!no_ref && c->refs_used) {
1949
819
        for (i = 0; i < nref; i++) {
1950
439
            if (c->refs_used[i]) {
1951
326
                if (cram_get_ref(fd, i, 1, 0)) {
1952
0
                    if (validate_md5(fd, i) < 0)
1953
0
                        goto_err;
1954
326
                } else {
1955
326
                    hts_log_warning("Failed to find reference, "
1956
326
                                    "switching to non-ref mode");
1957
326
                    no_ref = c->no_ref = 1;
1958
326
                }
1959
326
            }
1960
439
        }
1961
380
    }
1962
1963
    /* Turn bams into cram_records and gather basic stats */
1964
107k
    for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) {
1965
54.4k
        cram_slice *s = c->slices[sn];
1966
54.4k
        int64_t first_base = INT64_MAX, last_base = INT64_MIN;
1967
1968
54.4k
        int r1_start = r1;
1969
1970
54.4k
        assert(sn < c->curr_slice);
1971
1972
        // Discover which read names *may* be safely removed.
1973
        // Ie which ones have all their records in this slice.
1974
54.4k
        if (lossy_read_names(fd, c, s, r1_start) != 0)
1975
0
            return -1;
1976
1977
        // Tracking of MD tags so we can spot when the auto-generated values
1978
        // will differ from the current stored ones.  The kstring here is
1979
        // simply to avoid excessive malloc and free calls.  All initialisation
1980
        // is done within process_one_read().
1981
54.4k
        kstring_t MD = {0};
1982
1983
        // Embed consensus / MD-generated ref
1984
54.4k
        if (embed_ref == 2) {
1985
26.0k
            if (c->ref_id < 0 || cram_generate_reference(c, s, r1) < 0) {
1986
                // Should this be a permanent thing via fd->no_ref?
1987
                // Doing so means we cannot easily switch back again should
1988
                // things fix themselves later on.  This is likely not a
1989
                // concern though as failure to generate a reference implies
1990
                // unsorted data which is rarely recovered from.
1991
1992
                // Only if sn == 0.  We're hosed if we're on the 2nd slice and
1993
                // the first worked, as no-ref is a container global param.
1994
1.07k
                if (sn > 0) {
1995
0
                    hts_log_error("Failed to build reference, "
1996
0
                                  "switching to non-ref mode");
1997
0
                    return -1;
1998
1.07k
                } else {
1999
1.07k
                    hts_log_warning("Failed to build reference, "
2000
1.07k
                                    "switching to non-ref mode");
2001
1.07k
                }
2002
1.07k
                pthread_mutex_lock(&fd->ref_lock);
2003
1.07k
                c->embed_ref = fd->embed_ref = -2; // was previously embed_ref
2004
1.07k
                c->no_ref = fd->no_ref = 1;
2005
1.07k
                fd->no_ref_counter++; // more likely to keep permanent action
2006
1.07k
                pthread_mutex_unlock(&fd->ref_lock);
2007
1.07k
                failed_embed = 1;
2008
1.07k
                goto restart;
2009
25.0k
            } else {
2010
25.0k
                pthread_mutex_lock(&fd->ref_lock);
2011
25.0k
                fd->no_ref_counter -= (fd->no_ref_counter > 0);
2012
25.0k
                pthread_mutex_unlock(&fd->ref_lock);
2013
25.0k
            }
2014
2015
25.0k
            if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length)
2016
21.0k
                c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length;
2017
25.0k
        }
2018
2019
        // Iterate through records creating the cram blocks for some
2020
        // fields and just gathering stats for others.
2021
6.53M
        for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) {
2022
6.47M
            cram_record *cr = &s->crecs[r2];
2023
6.47M
            bam_seq_t *b = c->bams[r1];
2024
2025
            /* If multi-ref we need to cope with changing reference per seq */
2026
6.47M
            if (c->multi_seq && !no_ref) {
2027
450
                if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) {
2028
0
                    if (c->ref_seq_id >= 0)
2029
0
                        cram_ref_decr(fd->refs, c->ref_seq_id);
2030
2031
0
                    if (!cram_get_ref(fd, bam_ref(b), 1, 0)) {
2032
0
                        hts_log_error("Failed to load reference #%d", bam_ref(b));
2033
0
                        free(MD.s);
2034
0
                        return -1;
2035
0
                    }
2036
0
                    if (validate_md5(fd, bam_ref(b)) < 0)
2037
0
                        return -1;
2038
2039
0
                    c->ref_seq_id = bam_ref(b); // overwritten later by -2
2040
0
                    if (!fd->refs->ref_id[c->ref_seq_id]->seq)
2041
0
                        return -1;
2042
0
                    c->ref       = fd->refs->ref_id[c->ref_seq_id]->seq;
2043
0
                    c->ref_start = 1;
2044
0
                    c->ref_end   = fd->refs->ref_id[c->ref_seq_id]->length;
2045
0
                }
2046
450
            }
2047
2048
6.47M
            if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref,
2049
6.47M
                                 no_ref) != 0) {
2050
288
                free(MD.s);
2051
288
                return -1;
2052
288
            }
2053
2054
6.47M
            if (first_base > cr->apos)
2055
53.6k
                first_base = cr->apos;
2056
2057
6.47M
            if (last_base < cr->aend)
2058
53.6k
                last_base = cr->aend;
2059
6.47M
        }
2060
2061
53.0k
        free(MD.s);
2062
2063
        // Process_one_read doesn't add read names as it can change
2064
        // its mind during the loop on the CRAM_FLAG_DETACHED setting
2065
        // of earlier records (if it detects the auto-generation of
2066
        // TLEN is incorrect).  This affects which read-names can be
2067
        // lossily compressed, so we do these in another pass.
2068
53.0k
        if (add_read_names(fd, c, s, r1_start) < 0)
2069
0
            return -1;
2070
2071
53.0k
        if (c->multi_seq) {
2072
721
            s->hdr->ref_seq_id    = -2;
2073
721
            s->hdr->ref_seq_start = 0;
2074
721
            s->hdr->ref_seq_span  = 0;
2075
52.3k
        } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) {
2076
            // Spec states span=0, but it broke our range queries.
2077
            // See commit message for this and prior.
2078
26.6k
            s->hdr->ref_seq_id    = -1;
2079
26.6k
            s->hdr->ref_seq_start = 0;
2080
26.6k
            s->hdr->ref_seq_span  = 0;
2081
26.6k
        } else {
2082
25.6k
            s->hdr->ref_seq_id    = c->ref_id;
2083
25.6k
            s->hdr->ref_seq_start = first_base;
2084
25.6k
            s->hdr->ref_seq_span  = MAX(0, last_base - first_base + 1);
2085
25.6k
        }
2086
53.0k
        s->hdr->num_records = r2;
2087
2088
        // Processed a slice, now stash the aux blocks so the next
2089
        // slice can start aggregating them from the start again.
2090
53.0k
        if (c->tags_used->n_occupied) {
2091
47.5k
            int ntags = c->tags_used->n_occupied;
2092
47.5k
            s->aux_block = calloc(ntags*2, sizeof(*s->aux_block));
2093
47.5k
            if (!s->aux_block)
2094
0
                return -1;
2095
2096
47.5k
            khint_t k;
2097
2098
47.5k
            s->naux_block = 0;
2099
538k
            for (k = kh_begin(c->tags_used); k != kh_end(c->tags_used); k++) {
2100
490k
                if (!kh_exist(c->tags_used, k))
2101
252k
                    continue;
2102
2103
238k
                cram_tag_map *tm = kh_val(c->tags_used, k);
2104
238k
                if (!tm) goto_err;
2105
238k
                if (!tm->blk) continue;
2106
238k
                s->aux_block[s->naux_block++] = tm->blk;
2107
238k
                tm->blk = NULL;
2108
238k
                if (!tm->blk2) continue;
2109
0
                s->aux_block[s->naux_block++] = tm->blk2;
2110
0
                tm->blk2 = NULL;
2111
0
            }
2112
47.5k
            assert(s->naux_block <= 2*c->tags_used->n_occupied);
2113
47.5k
        }
2114
53.0k
    }
2115
2116
53.0k
    if (c->multi_seq && !no_ref) {
2117
54
        if (c->ref_seq_id >= 0)
2118
0
            cram_ref_decr(fd->refs, c->ref_seq_id);
2119
54
    }
2120
2121
    /* Link our bams[] array onto the spare bam list for reuse */
2122
53.0k
    spares = malloc(sizeof(*spares));
2123
53.0k
    if (!spares) goto_err;
2124
53.0k
    pthread_mutex_lock(&fd->bam_list_lock);
2125
53.0k
    spares->bams = c->bams;
2126
53.0k
    spares->next = fd->bl;
2127
53.0k
    fd->bl = spares;
2128
53.0k
    pthread_mutex_unlock(&fd->bam_list_lock);
2129
53.0k
    c->bams = NULL;
2130
2131
    /* Detect if a multi-seq container */
2132
53.0k
    cram_stats_encoding(fd, c->stats[DS_RI]);
2133
53.0k
    multi_ref = c->stats[DS_RI]->nvals > 1;
2134
53.0k
    pthread_mutex_lock(&fd->metrics_lock);
2135
53.0k
    fd->last_RI_count = c->stats[DS_RI]->nvals;
2136
53.0k
    pthread_mutex_unlock(&fd->metrics_lock);
2137
2138
2139
53.0k
    if (multi_ref) {
2140
310
        hts_log_info("Multi-ref container");
2141
310
        c->ref_seq_id = -2;
2142
310
        c->ref_seq_start = 0;
2143
310
        c->ref_seq_span = 0;
2144
310
    }
2145
2146
2147
    /* Compute MD5s */
2148
53.0k
    no_ref = c->no_ref;
2149
53.0k
    int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0;
2150
2151
106k
    for (i = 0; i < c->curr_slice; i++) {
2152
53.0k
        cram_slice *s = c->slices[i];
2153
2154
53.0k
        if (CRAM_MAJOR_VERS(fd->version) != 1) {
2155
53.0k
            if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) {
2156
24.8k
                hts_md5_context *md5 = hts_md5_init();
2157
24.8k
                if (!md5)
2158
0
                    return -1;
2159
24.8k
                hts_md5_update(md5,
2160
24.8k
                               c->ref + s->hdr->ref_seq_start - c->ref_start,
2161
24.8k
                               s->hdr->ref_seq_span);
2162
24.8k
                hts_md5_final(s->hdr->md5, md5);
2163
24.8k
                hts_md5_destroy(md5);
2164
28.2k
            } else {
2165
28.2k
                memset(s->hdr->md5, 0, 16);
2166
28.2k
            }
2167
53.0k
        }
2168
53.0k
    }
2169
2170
53.0k
    c->num_records = 0;
2171
53.0k
    c->num_blocks = 1; // cram_block_compression_hdr
2172
53.0k
    c->length = 0;
2173
2174
    //fprintf(stderr, "=== BF ===\n");
2175
53.0k
    h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]),
2176
53.0k
                                         c->stats[DS_BF], E_INT, NULL,
2177
53.0k
                                         fd->version, &fd->vv);
2178
53.0k
    if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err;
2179
2180
    //fprintf(stderr, "=== CF ===\n");
2181
53.0k
    h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]),
2182
53.0k
                                         c->stats[DS_CF], E_INT, NULL,
2183
53.0k
                                         fd->version, &fd->vv);
2184
53.0k
    if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err;
2185
2186
    //fprintf(stderr, "=== RN ===\n");
2187
    //h->codecs[DS_RN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RN]),
2188
    //                                     c->stats[DS_RN], E_BYTE_ARRAY, NULL,
2189
    //                                     fd->version);
2190
2191
    //fprintf(stderr, "=== AP ===\n");
2192
53.0k
    if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) {
2193
52.0k
        if (c->pos_sorted)
2194
52.0k
            h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]),
2195
52.0k
                                                 c->stats[DS_AP],
2196
52.0k
                                                 is_v4 ? E_LONG : E_INT,
2197
52.0k
                                                 NULL, fd->version, &fd->vv);
2198
0
        else
2199
            // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT.
2200
0
            h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED
2201
0
                                                       : E_EXTERNAL,
2202
0
                                                 NULL,
2203
0
                                                 is_v4 ? E_LONG : E_INT,
2204
0
                                                 NULL, fd->version, &fd->vv);
2205
52.0k
    } else {
2206
        // Removed BETA in v4.0.
2207
        // Should we consider dropping use of it for 3.0 too?
2208
1.04k
        hts_pos_t p[2] = {0, c->max_apos};
2209
1.04k
        h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL,
2210
1.04k
                                             is_v4 ? E_LONG : E_INT,
2211
1.04k
                                             p, fd->version, &fd->vv);
2212
//      cram_xdelta_encoder e;
2213
//      e.word_size = is_v4 ? 8 : 4;
2214
//      e.sub_encoding = E_EXTERNAL;
2215
//      e.sub_codec_dat = (void *)DS_AP;
2216
//
2217
//      h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL,
2218
//                                           is_v4 ? E_LONG : E_INT,
2219
//                                           &e, fd->version, &fd->vv);
2220
1.04k
    }
2221
53.0k
    if (!h->codecs[DS_AP]) goto_err;
2222
2223
    //fprintf(stderr, "=== RG ===\n");
2224
52.9k
    h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]),
2225
52.9k
                                         c->stats[DS_RG],
2226
52.9k
                                         E_INT,
2227
52.9k
                                         NULL,
2228
52.9k
                                         fd->version, &fd->vv);
2229
52.9k
    if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err;
2230
2231
    //fprintf(stderr, "=== MQ ===\n");
2232
52.9k
    h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]),
2233
52.9k
                                         c->stats[DS_MQ], E_INT, NULL,
2234
52.9k
                                         fd->version, &fd->vv);
2235
52.9k
    if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err;
2236
2237
    //fprintf(stderr, "=== NS ===\n");
2238
52.9k
    h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]),
2239
52.9k
                                         c->stats[DS_NS], E_INT, NULL,
2240
52.9k
                                         fd->version, &fd->vv);
2241
52.9k
    if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err;
2242
2243
    //fprintf(stderr, "=== MF ===\n");
2244
52.9k
    h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]),
2245
52.9k
                                         c->stats[DS_MF], E_INT, NULL,
2246
52.9k
                                         fd->version, &fd->vv);
2247
52.9k
    if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err;
2248
2249
    //fprintf(stderr, "=== TS ===\n");
2250
52.9k
    h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]),
2251
52.9k
                                         c->stats[DS_TS],
2252
52.9k
                                         is_v4 ? E_LONG : E_INT,
2253
52.9k
                                         NULL, fd->version, &fd->vv);
2254
52.9k
    if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err;
2255
2256
    //fprintf(stderr, "=== NP ===\n");
2257
52.9k
    h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]),
2258
52.9k
                                         c->stats[DS_NP],
2259
52.9k
                                         is_v4 ? E_LONG : E_INT,
2260
52.9k
                                         NULL, fd->version, &fd->vv);
2261
52.9k
    if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err;
2262
2263
    //fprintf(stderr, "=== NF ===\n");
2264
52.9k
    h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]),
2265
52.9k
                                         c->stats[DS_NF], E_INT, NULL,
2266
52.9k
                                         fd->version, &fd->vv);
2267
52.9k
    if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err;
2268
2269
    //fprintf(stderr, "=== RL ===\n");
2270
52.9k
    h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]),
2271
52.9k
                                         c->stats[DS_RL], E_INT, NULL,
2272
52.9k
                                         fd->version, &fd->vv);
2273
52.9k
    if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err;
2274
2275
    //fprintf(stderr, "=== FN ===\n");
2276
52.9k
    h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]),
2277
52.9k
                                         c->stats[DS_FN], E_INT, NULL,
2278
52.9k
                                         fd->version, &fd->vv);
2279
52.9k
    if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err;
2280
2281
    //fprintf(stderr, "=== FC ===\n");
2282
52.9k
    h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]),
2283
52.9k
                                         c->stats[DS_FC], E_BYTE, NULL,
2284
52.9k
                                         fd->version, &fd->vv);
2285
52.9k
    if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err;
2286
2287
    //fprintf(stderr, "=== FP ===\n");
2288
52.9k
    h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]),
2289
52.9k
                                         c->stats[DS_FP], E_INT, NULL,
2290
52.9k
                                         fd->version, &fd->vv);
2291
52.9k
    if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err;
2292
2293
    //fprintf(stderr, "=== DL ===\n");
2294
52.9k
    h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]),
2295
52.9k
                                         c->stats[DS_DL], E_INT, NULL,
2296
52.9k
                                         fd->version, &fd->vv);
2297
52.9k
    if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err;
2298
2299
    //fprintf(stderr, "=== BA ===\n");
2300
52.9k
    h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]),
2301
52.9k
                                         c->stats[DS_BA], E_BYTE, NULL,
2302
52.9k
                                         fd->version, &fd->vv);
2303
52.9k
    if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err;
2304
2305
52.9k
    if (CRAM_MAJOR_VERS(fd->version) >= 3) {
2306
52.9k
        cram_byte_array_len_encoder e;
2307
2308
52.9k
        e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2309
52.9k
            ? E_VARINT_UNSIGNED
2310
52.9k
            : E_EXTERNAL;
2311
52.9k
        e.len_dat = (void *)DS_BB_len;
2312
        //e.len_dat = (void *)DS_BB;
2313
2314
52.9k
        e.val_encoding = E_EXTERNAL;
2315
52.9k
        e.val_dat = (void *)DS_BB;
2316
2317
52.9k
        h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2318
52.9k
                                             E_BYTE_ARRAY, (void *)&e,
2319
52.9k
                                             fd->version, &fd->vv);
2320
52.9k
        if (!h->codecs[DS_BB]) goto_err;
2321
52.9k
    } else {
2322
0
        h->codecs[DS_BB] = NULL;
2323
0
    }
2324
2325
    //fprintf(stderr, "=== BS ===\n");
2326
52.9k
    h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]),
2327
52.9k
                                         c->stats[DS_BS], E_BYTE, NULL,
2328
52.9k
                                         fd->version, &fd->vv);
2329
52.9k
    if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err;
2330
2331
52.9k
    if (CRAM_MAJOR_VERS(fd->version) == 1) {
2332
0
        h->codecs[DS_TL] = NULL;
2333
0
        h->codecs[DS_RI] = NULL;
2334
0
        h->codecs[DS_RS] = NULL;
2335
0
        h->codecs[DS_PD] = NULL;
2336
0
        h->codecs[DS_HC] = NULL;
2337
0
        h->codecs[DS_SC] = NULL;
2338
2339
        //fprintf(stderr, "=== TC ===\n");
2340
0
        h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]),
2341
0
                                             c->stats[DS_TC], E_BYTE, NULL,
2342
0
                                             fd->version, &fd->vv);
2343
0
        if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err;
2344
2345
        //fprintf(stderr, "=== TN ===\n");
2346
0
        h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]),
2347
0
                                             c->stats[DS_TN], E_INT, NULL,
2348
0
                                             fd->version, &fd->vv);
2349
0
        if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err;
2350
52.9k
    } else {
2351
52.9k
        h->codecs[DS_TC] = NULL;
2352
52.9k
        h->codecs[DS_TN] = NULL;
2353
2354
        //fprintf(stderr, "=== TL ===\n");
2355
52.9k
        h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]),
2356
52.9k
                                             c->stats[DS_TL], E_INT, NULL,
2357
52.9k
                                             fd->version, &fd->vv);
2358
52.9k
        if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err;
2359
2360
2361
        //fprintf(stderr, "=== RI ===\n");
2362
52.9k
        h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]),
2363
52.9k
                                             c->stats[DS_RI], E_INT, NULL,
2364
52.9k
                                             fd->version, &fd->vv);
2365
52.9k
        if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err;
2366
2367
        //fprintf(stderr, "=== RS ===\n");
2368
52.9k
        h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]),
2369
52.9k
                                             c->stats[DS_RS], E_INT, NULL,
2370
52.9k
                                             fd->version, &fd->vv);
2371
52.9k
        if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err;
2372
2373
        //fprintf(stderr, "=== PD ===\n");
2374
52.9k
        h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]),
2375
52.9k
                                             c->stats[DS_PD], E_INT, NULL,
2376
52.9k
                                             fd->version, &fd->vv);
2377
52.9k
        if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err;
2378
2379
        //fprintf(stderr, "=== HC ===\n");
2380
52.9k
        h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]),
2381
52.9k
                                             c->stats[DS_HC], E_INT, NULL,
2382
52.9k
                                             fd->version, &fd->vv);
2383
52.9k
        if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err;
2384
2385
        //fprintf(stderr, "=== SC ===\n");
2386
52.9k
        if (1) {
2387
52.9k
            int i2[2] = {0, DS_SC};
2388
2389
52.9k
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2390
52.9k
                                                 E_BYTE_ARRAY, (void *)i2,
2391
52.9k
                                                 fd->version, &fd->vv);
2392
52.9k
        } else {
2393
            // Appears to be no practical benefit to using this method,
2394
            // but it may work better if we start mixing SC, IN and BB
2395
            // elements into the same external block.
2396
0
            cram_byte_array_len_encoder e;
2397
2398
0
            e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
2399
0
                ? E_VARINT_UNSIGNED
2400
0
                : E_EXTERNAL;
2401
0
            e.len_dat = (void *)DS_SC_len;
2402
2403
0
            e.val_encoding = E_EXTERNAL;
2404
0
            e.val_dat = (void *)DS_SC;
2405
2406
0
            h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
2407
0
                                                 E_BYTE_ARRAY, (void *)&e,
2408
0
                                                 fd->version, &fd->vv);
2409
0
        }
2410
52.9k
        if (!h->codecs[DS_SC]) goto_err;
2411
52.9k
    }
2412
2413
    //fprintf(stderr, "=== IN ===\n");
2414
52.9k
    {
2415
52.9k
        int i2[2] = {0, DS_IN};
2416
52.9k
        h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2417
52.9k
                                             E_BYTE_ARRAY, (void *)i2,
2418
52.9k
                                             fd->version, &fd->vv);
2419
52.9k
        if (!h->codecs[DS_IN]) goto_err;
2420
52.9k
    }
2421
2422
52.9k
    h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE,
2423
52.9k
                                         (void *)DS_QS,
2424
52.9k
                                         fd->version, &fd->vv);
2425
52.9k
    if (!h->codecs[DS_QS]) goto_err;
2426
52.9k
    {
2427
52.9k
        int i2[2] = {0, DS_RN};
2428
52.9k
        h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2429
52.9k
                                             E_BYTE_ARRAY, (void *)i2,
2430
52.9k
                                             fd->version, &fd->vv);
2431
52.9k
        if (!h->codecs[DS_RN]) goto_err;
2432
52.9k
    }
2433
2434
2435
    /* Encode slices */
2436
105k
    for (i = 0; i < c->curr_slice; i++) {
2437
52.9k
        hts_log_info("Encode slice %d", i);
2438
2439
52.9k
        int local_embed_ref =
2440
52.9k
            embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0;
2441
52.9k
        if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0)
2442
194
            return -1;
2443
52.9k
    }
2444
2445
    /* Create compression header */
2446
52.7k
    {
2447
52.7k
        h->ref_seq_id    = c->ref_seq_id;
2448
52.7k
        h->ref_seq_start = c->ref_seq_start;
2449
52.7k
        h->ref_seq_span  = c->ref_seq_span;
2450
52.7k
        h->num_records   = c->num_records;
2451
52.7k
        h->qs_seq_orient = c->qs_seq_orient;
2452
        // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1)
2453
52.7k
        h->AP_delta      = c->pos_sorted;
2454
52.7k
        memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20);
2455
2456
52.7k
        if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref)))
2457
0
            return -1;
2458
52.7k
    }
2459
2460
    /* Compute landmarks */
2461
    /* Fill out slice landmarks */
2462
52.7k
    c->num_landmarks = c->curr_slice;
2463
52.7k
    c->landmark = malloc(c->num_landmarks * sizeof(*c->landmark));
2464
52.7k
    if (!c->landmark)
2465
0
        return -1;
2466
2467
    /*
2468
     * Slice offset starts after the first block, so we need to simulate
2469
     * writing it to work out the correct offset
2470
     */
2471
52.7k
    {
2472
52.7k
        slice_offset = c_hdr->method == RAW
2473
52.7k
            ? c_hdr->uncomp_size
2474
52.7k
            : c_hdr->comp_size;
2475
52.7k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2476
52.7k
            fd->vv.varint_size(c_hdr->content_id) +
2477
52.7k
            fd->vv.varint_size(c_hdr->comp_size) +
2478
52.7k
            fd->vv.varint_size(c_hdr->uncomp_size);
2479
52.7k
    }
2480
2481
52.7k
    c->ref_seq_id    = c->slices[0]->hdr->ref_seq_id;
2482
52.7k
    if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) {
2483
        // Spec states span=0, but it broke our range queries.
2484
        // See commit message for this and prior.
2485
26.6k
        c->ref_seq_start = 0;
2486
26.6k
        c->ref_seq_span  = 0;
2487
26.6k
    } else {
2488
26.1k
        c->ref_seq_start = c->slices[0]->hdr->ref_seq_start;
2489
26.1k
        c->ref_seq_span  = c->slices[0]->hdr->ref_seq_span;
2490
26.1k
    }
2491
105k
    for (i = 0; i < c->curr_slice; i++) {
2492
52.7k
        cram_slice *s = c->slices[i];
2493
2494
52.7k
        c->num_blocks += s->hdr->num_blocks + 1; // slice header
2495
52.7k
        c->landmark[i] = slice_offset;
2496
2497
52.7k
        if (s->hdr->ref_seq_start + s->hdr->ref_seq_span >
2498
52.7k
            c->ref_seq_start + c->ref_seq_span) {
2499
0
            c->ref_seq_span = s->hdr->ref_seq_start + s->hdr->ref_seq_span
2500
0
                - c->ref_seq_start;
2501
0
        }
2502
2503
52.7k
        slice_offset += s->hdr_block->method == RAW
2504
52.7k
            ? s->hdr_block->uncomp_size
2505
52.7k
            : s->hdr_block->comp_size;
2506
2507
52.7k
        slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2508
52.7k
            fd->vv.varint_size(s->hdr_block->content_id) +
2509
52.7k
            fd->vv.varint_size(s->hdr_block->comp_size) +
2510
52.7k
            fd->vv.varint_size(s->hdr_block->uncomp_size);
2511
2512
478k
        for (j = 0; j < s->hdr->num_blocks; j++) {
2513
425k
            slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) +
2514
425k
                fd->vv.varint_size(s->block[j]->content_id) +
2515
425k
                fd->vv.varint_size(s->block[j]->comp_size) +
2516
425k
                fd->vv.varint_size(s->block[j]->uncomp_size);
2517
2518
425k
            slice_offset += s->block[j]->method == RAW
2519
425k
                ? s->block[j]->uncomp_size
2520
425k
                : s->block[j]->comp_size;
2521
425k
        }
2522
52.7k
    }
2523
52.7k
    c->length += slice_offset; // just past the final slice
2524
2525
52.7k
    c->comp_hdr_block = c_hdr;
2526
2527
52.7k
    if (c->ref_seq_id >= 0) {
2528
25.4k
        if (c->ref_free) {
2529
25.3k
            free(c->ref);
2530
25.3k
            c->ref = NULL;
2531
25.3k
        } else {
2532
70
            cram_ref_decr(fd->refs, c->ref_seq_id);
2533
70
        }
2534
25.4k
    }
2535
2536
    /* Cache references up-front if we have unsorted access patterns */
2537
52.7k
    if (!no_ref && c->refs_used) {
2538
103
        for (i = 0; i < fd->refs->nref; i++) {
2539
49
            if (c->refs_used[i])
2540
0
                cram_ref_decr(fd->refs, i);
2541
49
        }
2542
54
    }
2543
2544
52.7k
    return 0;
2545
2546
128
 err:
2547
128
    return -1;
2548
52.7k
}
2549
2550
2551
/*
2552
 * Adds a feature code to a read within a slice. For purposes of minimising
2553
 * memory allocations and fragmentation we have one array of features for all
2554
 * reads within the slice. We return the index into this array for this new
2555
 * feature.
2556
 *
2557
 * Returns feature index on success
2558
 *         -1 on failure.
2559
 */
2560
static int cram_add_feature(cram_container *c, cram_slice *s,
2561
205k
                            cram_record *r, cram_feature *f) {
2562
205k
    if (s->nfeatures >= s->afeatures) {
2563
25.1k
        s->afeatures = s->afeatures ? s->afeatures*2 : 1024;
2564
25.1k
        s->features = realloc(s->features, s->afeatures*sizeof(*s->features));
2565
25.1k
        if (!s->features)
2566
0
            return -1;
2567
25.1k
    }
2568
2569
205k
    if (!r->nfeature++) {
2570
101k
        r->feature = s->nfeatures;
2571
101k
        if (cram_stats_add(c->stats[DS_FP], f->X.pos) < 0)
2572
0
            return -1;
2573
104k
    } else {
2574
104k
        if (cram_stats_add(c->stats[DS_FP],
2575
104k
                           f->X.pos - s->features[r->feature + r->nfeature-2].X.pos) < 0)
2576
0
            return -1;
2577
2578
104k
    }
2579
205k
    if (cram_stats_add(c->stats[DS_FC], f->X.code) < 0)
2580
0
        return -1;
2581
2582
205k
    s->features[s->nfeatures++] = *f;
2583
2584
205k
    return 0;
2585
205k
}
2586
2587
static int cram_add_substitution(cram_fd *fd, cram_container *c,
2588
                                 cram_slice *s, cram_record *r,
2589
6.64k
                                 int pos, char base, char qual, char ref) {
2590
6.64k
    cram_feature f;
2591
2592
    // seq=ACGTN vs ref=ACGT or seq=ACGT vs ref=ACGTN
2593
6.64k
    if (fd->L2[(uc)base]<4 || (fd->L2[(uc)base]<5 && fd->L2[(uc)ref]<4)) {
2594
6.25k
        f.X.pos = pos+1;
2595
6.25k
        f.X.code = 'X';
2596
6.25k
        f.X.base = fd->cram_sub_matrix[ref&0x1f][base&0x1f];
2597
6.25k
        if (cram_stats_add(c->stats[DS_BS], f.X.base) < 0)
2598
0
            return -1;
2599
6.25k
    } else {
2600
390
        f.B.pos = pos+1;
2601
390
        f.B.code = 'B';
2602
390
        f.B.base = base;
2603
390
        f.B.qual = qual;
2604
390
        if (cram_stats_add(c->stats[DS_BA], f.B.base) < 0) return -1;
2605
390
        if (cram_stats_add(c->stats[DS_QS], f.B.qual) < 0) return -1;
2606
390
        BLOCK_APPEND_CHAR(s->qual_blk, qual);
2607
390
    }
2608
6.64k
    return cram_add_feature(c, s, r, &f);
2609
2610
0
 block_err:
2611
0
    return -1;
2612
6.64k
}
2613
2614
static int cram_add_bases(cram_fd *fd, cram_container *c,
2615
                          cram_slice *s, cram_record *r,
2616
1.12k
                          int pos, int len, char *base) {
2617
1.12k
    cram_feature f;
2618
2619
1.12k
    f.b.pos = pos+1;
2620
1.12k
    f.b.code = 'b';
2621
1.12k
    f.b.seq_idx = base - (char *)BLOCK_DATA(s->seqs_blk);
2622
1.12k
    f.b.len = len;
2623
2624
1.12k
    return cram_add_feature(c, s, r, &f);
2625
1.12k
}
2626
2627
static int cram_add_base(cram_fd *fd, cram_container *c,
2628
                         cram_slice *s, cram_record *r,
2629
1.82k
                         int pos, char base, char qual) {
2630
1.82k
    cram_feature f;
2631
1.82k
    f.B.pos = pos+1;
2632
1.82k
    f.B.code = 'B';
2633
1.82k
    f.B.base = base;
2634
1.82k
    f.B.qual = qual;
2635
1.82k
    if (cram_stats_add(c->stats[DS_BA], base) < 0) return -1;
2636
1.82k
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2637
1.82k
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2638
1.82k
    return cram_add_feature(c, s, r, &f);
2639
2640
0
 block_err:
2641
0
    return -1;
2642
1.82k
}
2643
2644
static int cram_add_quality(cram_fd *fd, cram_container *c,
2645
                            cram_slice *s, cram_record *r,
2646
63
                            int pos, char qual) {
2647
63
    cram_feature f;
2648
63
    f.Q.pos = pos+1;
2649
63
    f.Q.code = 'Q';
2650
63
    f.Q.qual = qual;
2651
63
    if (cram_stats_add(c->stats[DS_QS], qual) < 0) return -1;
2652
63
    BLOCK_APPEND_CHAR(s->qual_blk, qual);
2653
63
    return cram_add_feature(c, s, r, &f);
2654
2655
0
 block_err:
2656
0
    return -1;
2657
63
}
2658
2659
static int cram_add_deletion(cram_container *c, cram_slice *s, cram_record *r,
2660
84.1k
                             int pos, int len, char *base) {
2661
84.1k
    cram_feature f;
2662
84.1k
    f.D.pos = pos+1;
2663
84.1k
    f.D.code = 'D';
2664
84.1k
    f.D.len = len;
2665
84.1k
    if (cram_stats_add(c->stats[DS_DL], len) < 0) return -1;
2666
84.1k
    return cram_add_feature(c, s, r, &f);
2667
84.1k
}
2668
2669
static int cram_add_softclip(cram_container *c, cram_slice *s, cram_record *r,
2670
67.5k
                             int pos, int len, char *base, int version) {
2671
67.5k
    cram_feature f;
2672
67.5k
    f.S.pos = pos+1;
2673
67.5k
    f.S.code = 'S';
2674
67.5k
    f.S.len = len;
2675
67.5k
    switch (CRAM_MAJOR_VERS(version)) {
2676
0
    case 1:
2677
0
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2678
0
        BLOCK_APPEND(s->base_blk, base, len);
2679
0
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2680
0
        break;
2681
2682
0
    case 2:
2683
67.5k
    default:
2684
67.5k
        f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2685
67.5k
        if (base) {
2686
24.8k
            BLOCK_APPEND(s->soft_blk, base, len);
2687
42.6k
        } else {
2688
42.6k
            int i;
2689
230k
            for (i = 0; i < len; i++)
2690
188k
                BLOCK_APPEND_CHAR(s->soft_blk, 'N');
2691
42.6k
        }
2692
67.5k
        BLOCK_APPEND_CHAR(s->soft_blk, '\0');
2693
67.5k
        break;
2694
2695
        //default:
2696
        //    // v3.0 onwards uses BB data-series
2697
        //    f.S.seq_idx = BLOCK_SIZE(s->soft_blk);
2698
67.5k
    }
2699
67.5k
    return cram_add_feature(c, s, r, &f);
2700
2701
0
 block_err:
2702
0
    return -1;
2703
67.5k
}
2704
2705
static int cram_add_hardclip(cram_container *c, cram_slice *s, cram_record *r,
2706
42.9k
                             int pos, int len, char *base) {
2707
42.9k
    cram_feature f;
2708
42.9k
    f.S.pos = pos+1;
2709
42.9k
    f.S.code = 'H';
2710
42.9k
    f.S.len = len;
2711
42.9k
    if (cram_stats_add(c->stats[DS_HC], len) < 0) return -1;
2712
42.9k
    return cram_add_feature(c, s, r, &f);
2713
42.9k
}
2714
2715
static int cram_add_skip(cram_container *c, cram_slice *s, cram_record *r,
2716
527
                         int pos, int len, char *base) {
2717
527
    cram_feature f;
2718
527
    f.S.pos = pos+1;
2719
527
    f.S.code = 'N';
2720
527
    f.S.len = len;
2721
527
    if (cram_stats_add(c->stats[DS_RS], len) < 0) return -1;
2722
527
    return cram_add_feature(c, s, r, &f);
2723
527
}
2724
2725
static int cram_add_pad(cram_container *c, cram_slice *s, cram_record *r,
2726
490
                        int pos, int len, char *base) {
2727
490
    cram_feature f;
2728
490
    f.S.pos = pos+1;
2729
490
    f.S.code = 'P';
2730
490
    f.S.len = len;
2731
490
    if (cram_stats_add(c->stats[DS_PD], len) < 0) return -1;
2732
490
    return cram_add_feature(c, s, r, &f);
2733
490
}
2734
2735
static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r,
2736
611
                              int pos, int len, char *base) {
2737
611
    cram_feature f;
2738
611
    f.I.pos = pos+1;
2739
611
    if (len == 1) {
2740
134
        char b = base ? *base : 'N';
2741
134
        f.i.code = 'i';
2742
134
        f.i.base = b;
2743
134
        if (cram_stats_add(c->stats[DS_BA], b) < 0) return -1;
2744
477
    } else {
2745
477
        f.I.code = 'I';
2746
477
        f.I.len = len;
2747
477
        f.S.seq_idx = BLOCK_SIZE(s->base_blk);
2748
477
        if (base) {
2749
210
            BLOCK_APPEND(s->base_blk, base, len);
2750
267
        } else {
2751
267
            int i;
2752
14.1M
            for (i = 0; i < len; i++)
2753
14.1M
                BLOCK_APPEND_CHAR(s->base_blk, 'N');
2754
267
        }
2755
477
        BLOCK_APPEND_CHAR(s->base_blk, '\0');
2756
477
    }
2757
611
    return cram_add_feature(c, s, r, &f);
2758
2759
0
 block_err:
2760
0
    return -1;
2761
611
}
2762
2763
/*
2764
 * Encodes auxiliary data. Largely duplicated from above, but done so to
2765
 * keep it simple and avoid a myriad of version ifs.
2766
 *
2767
 * Returns the RG header line pointed to by the BAM aux fields on success,
2768
 *         NULL on failure or no rg present, also sets "*err" to non-zero
2769
 */
2770
static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b,
2771
                                      cram_container *c,
2772
                                      cram_slice *s, cram_record *cr,
2773
                                      int verbatim_NM, int verbatim_MD,
2774
                                      int NM, kstring_t *MD, int cf_tag,
2775
6.47M
                                      int no_ref, int *err) {
2776
6.47M
    char *aux, *orig;
2777
6.47M
    sam_hrec_rg_t *brg = NULL;
2778
6.47M
    int aux_size = bam_get_l_aux(b);
2779
6.47M
    const char *aux_end = bam_data_end(b);
2780
6.47M
    cram_block *td_b = c->comp_hdr->TD_blk;
2781
6.47M
    int TD_blk_size = BLOCK_SIZE(td_b), new;
2782
6.47M
    char *key;
2783
6.47M
    khint_t k;
2784
2785
6.47M
    if (err) *err = 1;
2786
2787
6.47M
    orig = aux = (char *)bam_aux(b);
2788
2789
2790
    // cF:i  => Extra CRAM bit flags.
2791
    // 1:  Don't auto-decode MD (may be invalid)
2792
    // 2:  Don't auto-decode NM (may be invalid)
2793
6.47M
    if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) {
2794
        // Temporary copy of aux so we can ammend it.
2795
94.2k
        aux = malloc(aux_size+4);
2796
94.2k
        if (!aux)
2797
0
            return NULL;
2798
2799
94.2k
        memcpy(aux, orig, aux_size);
2800
94.2k
        aux[aux_size++] = 'c';
2801
94.2k
        aux[aux_size++] = 'F';
2802
94.2k
        aux[aux_size++] = 'C';
2803
94.2k
        aux[aux_size++] = cf_tag;
2804
94.2k
        orig = aux;
2805
94.2k
        aux_end = aux + aux_size;
2806
94.2k
    }
2807
2808
    // Copy aux keys to td_b and aux values to slice aux blocks
2809
7.36M
    while (aux_end - aux >= 1 && aux[0] != 0) {
2810
890k
        int r;
2811
2812
        // Room for code + type + at least 1 byte of data
2813
890k
        if (aux - orig >= aux_size - 3)
2814
5
            goto err;
2815
2816
        // RG:Z
2817
890k
        if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') {
2818
1.82k
            char *rg = &aux[3];
2819
1.82k
            aux = rg;
2820
434k
            while (aux < aux_end && *aux++);
2821
1.82k
            if (aux == aux_end && aux[-1] != '\0') {
2822
2
                hts_log_error("Unterminated RG:Z tag for read \"%s\"",
2823
2
                              bam_get_qname(b));
2824
2
                goto err;
2825
2
            }
2826
1.82k
            brg = sam_hrecs_find_rg(fd->header->hrecs, rg);
2827
1.82k
            if (brg) {
2828
269
                if (CRAM_MAJOR_VERS(fd->version) >= 4)
2829
0
                    BLOCK_APPEND(td_b, "RG*", 3);
2830
269
                continue;
2831
1.55k
            } else {
2832
                // RG:Z tag will be stored verbatim
2833
1.55k
                hts_log_warning("Missing @RG header for RG \"%s\"", rg);
2834
1.55k
                aux = rg - 3;
2835
1.55k
            }
2836
1.82k
        }
2837
2838
        // MD:Z
2839
889k
        if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') {
2840
65.0k
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) {
2841
27.4k
                if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) {
2842
0
                    while (aux < aux_end && *aux++);
2843
0
                    if (aux == aux_end && aux[-1] != '\0') {
2844
0
                        hts_log_error("Unterminated MD:Z tag for read \"%s\"",
2845
0
                                      bam_get_qname(b));
2846
0
                        goto err;
2847
0
                    }
2848
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2849
0
                        BLOCK_APPEND(td_b, "MD*", 3);
2850
0
                    continue;
2851
0
                }
2852
27.4k
            }
2853
65.0k
        }
2854
2855
        // NM:i
2856
889k
        if (aux[0] == 'N' && aux[1] == 'M') {
2857
114
            if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) {
2858
1
                int NM_ = bam_aux2i_end((uint8_t *)aux+2, (uint8_t *)aux_end);
2859
1
                if (NM_ == NM) {
2860
0
                    switch(aux[2]) {
2861
0
                    case 'A': case 'C': case 'c': aux+=4; break;
2862
0
                    case 'S': case 's':           aux+=5; break;
2863
0
                    case 'I': case 'i': case 'f': aux+=7; break;
2864
0
                    default:
2865
0
                        hts_log_error("Unhandled type code for NM tag");
2866
0
                        goto err;
2867
0
                    }
2868
0
                    if (CRAM_MAJOR_VERS(fd->version) >= 4)
2869
0
                        BLOCK_APPEND(td_b, "NM*", 3);
2870
0
                    continue;
2871
0
                }
2872
1
            }
2873
114
        }
2874
2875
889k
        BLOCK_APPEND(td_b, aux, 3);
2876
2877
        // Container level tags_used, for TD series
2878
        // Maps integer key ('X0i') to cram_tag_map struct.
2879
889k
        int key = (((unsigned char *) aux)[0]<<16 |
2880
889k
                   ((unsigned char *) aux)[1]<<8  |
2881
889k
                   ((unsigned char *) aux)[2]);
2882
889k
        k = kh_put(m_tagmap, c->tags_used, key, &r);
2883
889k
        if (-1 == r)
2884
0
            goto err;
2885
889k
        else if (r != 0)
2886
239k
            kh_val(c->tags_used, k) = NULL;
2887
2888
889k
        if (r == 1) {
2889
239k
            khint_t k_global;
2890
2891
            // Global tags_used for cram_metrics support
2892
239k
            pthread_mutex_lock(&fd->metrics_lock);
2893
239k
            k_global = kh_put(m_metrics, fd->tags_used, key, &r);
2894
239k
            if (-1 == r) {
2895
0
                pthread_mutex_unlock(&fd->metrics_lock);
2896
0
                goto err;
2897
0
            }
2898
239k
            if (r >= 1) {
2899
13.4k
                kh_val(fd->tags_used, k_global) = cram_new_metrics();
2900
13.4k
                if (!kh_val(fd->tags_used, k_global)) {
2901
0
                    kh_del(m_metrics, fd->tags_used, k_global);
2902
0
                    pthread_mutex_unlock(&fd->metrics_lock);
2903
0
                    goto err;
2904
0
                }
2905
13.4k
            }
2906
2907
239k
            pthread_mutex_unlock(&fd->metrics_lock);
2908
2909
239k
            int i2[2] = {'\t',key};
2910
239k
            size_t sk = key;
2911
239k
            cram_tag_map *m = calloc(1, sizeof(*m));
2912
239k
            if (!m)
2913
0
                goto_err;
2914
239k
            kh_val(c->tags_used, k) = m;
2915
2916
239k
            cram_codec *c;
2917
2918
            // Use a block content id based on the tag id.
2919
            // Codec type depends on tag data type.
2920
239k
            switch(aux[2]) {
2921
115k
            case 'Z': case 'H':
2922
                // string as byte_array_stop
2923
115k
                c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL,
2924
115k
                                      E_BYTE_ARRAY, (void *)i2,
2925
115k
                                      fd->version, &fd->vv);
2926
115k
                break;
2927
2928
99.9k
            case 'A': case 'c': case 'C': {
2929
                // byte array len, 1 byte
2930
99.9k
                cram_byte_array_len_encoder e;
2931
99.9k
                cram_stats st;
2932
2933
99.9k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2934
99.9k
                    e.len_encoding = E_HUFFMAN;
2935
99.9k
                    e.len_dat = NULL; // will get codes from st
2936
99.9k
                } else {
2937
0
                    e.len_encoding = E_CONST_INT;
2938
0
                    e.len_dat = NULL; // will get codes from st
2939
0
                }
2940
99.9k
                memset(&st, 0, sizeof(st));
2941
99.9k
                if (cram_stats_add(&st, 1) < 0) goto block_err;
2942
99.9k
                cram_stats_encoding(fd, &st);
2943
2944
99.9k
                e.val_encoding = E_EXTERNAL;
2945
99.9k
                e.val_dat = (void *)sk;
2946
2947
99.9k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2948
99.9k
                                      E_BYTE_ARRAY, (void *)&e,
2949
99.9k
                                      fd->version, &fd->vv);
2950
99.9k
                break;
2951
99.9k
            }
2952
2953
6.69k
            case 's': case 'S': {
2954
                // byte array len, 2 byte
2955
6.69k
                cram_byte_array_len_encoder e;
2956
6.69k
                cram_stats st;
2957
2958
6.69k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2959
6.69k
                    e.len_encoding = E_HUFFMAN;
2960
6.69k
                    e.len_dat = NULL; // will get codes from st
2961
6.69k
                } else {
2962
0
                    e.len_encoding = E_CONST_INT;
2963
0
                    e.len_dat = NULL; // will get codes from st
2964
0
                }
2965
6.69k
                memset(&st, 0, sizeof(st));
2966
6.69k
                if (cram_stats_add(&st, 2) < 0) goto block_err;
2967
6.69k
                cram_stats_encoding(fd, &st);
2968
2969
6.69k
                e.val_encoding = E_EXTERNAL;
2970
6.69k
                e.val_dat = (void *)sk;
2971
2972
6.69k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2973
6.69k
                                      E_BYTE_ARRAY, (void *)&e,
2974
6.69k
                                      fd->version, &fd->vv);
2975
6.69k
                break;
2976
6.69k
            }
2977
16.1k
            case 'i': case 'I': case 'f': {
2978
                // byte array len, 4 byte
2979
16.1k
                cram_byte_array_len_encoder e;
2980
16.1k
                cram_stats st;
2981
2982
16.1k
                if (CRAM_MAJOR_VERS(fd->version) <= 3) {
2983
16.1k
                    e.len_encoding = E_HUFFMAN;
2984
16.1k
                    e.len_dat = NULL; // will get codes from st
2985
16.1k
                } else {
2986
0
                    e.len_encoding = E_CONST_INT;
2987
0
                    e.len_dat = NULL; // will get codes from st
2988
0
                }
2989
16.1k
                memset(&st, 0, sizeof(st));
2990
16.1k
                if (cram_stats_add(&st, 4) < 0) goto block_err;
2991
16.1k
                cram_stats_encoding(fd, &st);
2992
2993
16.1k
                e.val_encoding = E_EXTERNAL;
2994
16.1k
                e.val_dat = (void *)sk;
2995
2996
16.1k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st,
2997
16.1k
                                      E_BYTE_ARRAY, (void *)&e,
2998
16.1k
                                      fd->version, &fd->vv);
2999
16.1k
                break;
3000
16.1k
            }
3001
3002
1.05k
            case 'B': {
3003
                // Byte array of variable size, but we generate our tag
3004
                // byte stream at the wrong stage (during reading and not
3005
                // after slice header construction). So we use
3006
                // BYTE_ARRAY_LEN with the length codec being external
3007
                // too.
3008
1.05k
                cram_byte_array_len_encoder e;
3009
3010
1.05k
                e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4
3011
1.05k
                    ? E_VARINT_UNSIGNED
3012
1.05k
                    : E_EXTERNAL;
3013
1.05k
                e.len_dat = (void *)sk; // or key+128 for len?
3014
3015
1.05k
                e.val_encoding = E_EXTERNAL;
3016
1.05k
                e.val_dat = (void *)sk;
3017
3018
1.05k
                c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL,
3019
1.05k
                                      E_BYTE_ARRAY, (void *)&e,
3020
1.05k
                                      fd->version, &fd->vv);
3021
1.05k
                break;
3022
16.1k
            }
3023
3024
112
            default:
3025
112
                hts_log_error("Unsupported SAM aux type '%c'", aux[2]);
3026
112
                c = NULL;
3027
239k
            }
3028
3029
239k
            if (!c)
3030
112
                goto_err;
3031
3032
239k
            m->codec = c;
3033
3034
            // Link to fd-global tag metrics
3035
239k
            pthread_mutex_lock(&fd->metrics_lock);
3036
239k
            m->m = k_global ? (cram_metrics *)kh_val(fd->tags_used, k_global) : NULL;
3037
239k
            pthread_mutex_unlock(&fd->metrics_lock);
3038
239k
        }
3039
3040
889k
        cram_tag_map *tm = (cram_tag_map *)kh_val(c->tags_used, k);
3041
889k
        if (!tm) goto_err;
3042
889k
        cram_codec *codec = tm->codec;
3043
889k
        if (!tm->codec) goto_err;
3044
3045
889k
        switch(aux[2]) {
3046
407k
        case 'A': case 'C': case 'c':
3047
407k
            if (aux_end - aux < 3+1)
3048
0
                goto err;
3049
3050
407k
            if (!tm->blk) {
3051
99.9k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3052
0
                    goto err;
3053
99.9k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3054
99.9k
            }
3055
3056
407k
            aux+=3;
3057
            //codec->encode(s, codec, aux, 1);
3058
            // Functionally equivalent, but less code.
3059
407k
            BLOCK_APPEND_CHAR(tm->blk, *aux);
3060
407k
            aux++;
3061
407k
            break;
3062
3063
25.5k
        case 'S': case 's':
3064
25.5k
            if (aux_end - aux < 3+2)
3065
1
                goto err;
3066
3067
25.5k
            if (!tm->blk) {
3068
6.69k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3069
0
                    goto err;
3070
6.69k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3071
6.69k
            }
3072
3073
25.5k
            aux+=3;
3074
            //codec->encode(s, codec, aux, 2);
3075
25.5k
            BLOCK_APPEND(tm->blk, aux, 2);
3076
25.5k
            aux+=2;
3077
25.5k
            break;
3078
3079
98.0k
        case 'I': case 'i': case 'f':
3080
98.0k
            if (aux_end - aux < 3+4)
3081
0
                goto err;
3082
3083
98.0k
            if (!tm->blk) {
3084
16.1k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3085
0
                    goto err;
3086
16.1k
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3087
16.1k
            }
3088
3089
98.0k
            aux+=3;
3090
            //codec->encode(s, codec, aux, 4);
3091
98.0k
            BLOCK_APPEND(tm->blk, aux, 4);
3092
98.0k
            aux+=4;
3093
98.0k
            break;
3094
3095
0
        case 'd':
3096
0
            if (aux_end - aux < 3+8)
3097
0
                goto err;
3098
3099
0
            if (!tm->blk) {
3100
0
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3101
0
                    goto err;
3102
0
                codec->u.e_byte_array_len.val_codec->out = tm->blk;
3103
0
            }
3104
3105
0
            aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++;
3106
            //codec->encode(s, codec, aux, 8);
3107
0
            BLOCK_APPEND(tm->blk, aux, 8);
3108
0
            aux+=8;
3109
0
            break;
3110
3111
324k
        case 'Z': case 'H': {
3112
324k
            if (aux_end - aux < 3)
3113
0
                goto err;
3114
3115
324k
            if (!tm->blk) {
3116
115k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3117
0
                    goto err;
3118
115k
                codec->out = tm->blk;
3119
115k
            }
3120
3121
324k
            char *aux_s;
3122
324k
            aux += 3;
3123
324k
            aux_s = aux;
3124
13.9M
            while (aux < aux_end && *aux++);
3125
324k
            if (aux == aux_end && aux[-1] != '\0') {
3126
2
                hts_log_error("Unterminated %c%c:%c tag for read \"%s\"",
3127
2
                              aux_s[-3], aux_s[-2], aux_s[-1],
3128
2
                              bam_get_qname(b));
3129
2
                goto err;
3130
2
            }
3131
324k
            if (codec->encode(s, codec, aux_s, aux - aux_s) < 0)
3132
0
                goto err;
3133
324k
            break;
3134
324k
        }
3135
3136
324k
        case 'B': {
3137
34.1k
            if (aux_end - aux < 4+4)
3138
0
                goto err;
3139
3140
34.1k
            int type = aux[3];
3141
34.1k
            uint64_t count = (((uint64_t)((unsigned char *)aux)[4]) << 0 |
3142
34.1k
                              ((uint64_t)((unsigned char *)aux)[5]) << 8 |
3143
34.1k
                              ((uint64_t)((unsigned char *)aux)[6]) <<16 |
3144
34.1k
                              ((uint64_t)((unsigned char *)aux)[7]) <<24);
3145
34.1k
            uint64_t blen;
3146
34.1k
            if (!tm->blk) {
3147
1.05k
                if (!(tm->blk = cram_new_block(EXTERNAL, key)))
3148
0
                    goto err;
3149
1.05k
                if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) {
3150
0
                    if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128)))
3151
0
                        goto err;
3152
0
                    codec->u.e_byte_array_len.len_codec->out = tm->blk2;
3153
0
                    codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk;
3154
1.05k
                } else {
3155
1.05k
                    codec->u.e_byte_array_len.len_codec->out = tm->blk;
3156
1.05k
                    codec->u.e_byte_array_len.val_codec->out = tm->blk;
3157
1.05k
                }
3158
1.05k
            }
3159
3160
            // skip TN field
3161
34.1k
            aux+=3;
3162
3163
            // We use BYTE_ARRAY_LEN with external length, so store that first
3164
34.1k
            switch (type) {
3165
16.2k
            case 'c': case 'C':
3166
16.2k
                blen = count;
3167
16.2k
                break;
3168
4.39k
            case 's': case 'S':
3169
4.39k
                blen = 2*count;
3170
4.39k
                break;
3171
13.4k
            case 'i': case 'I': case 'f':
3172
13.4k
                blen = 4*count;
3173
13.4k
                break;
3174
12
            default:
3175
12
                hts_log_error("Unknown sub-type '%c' for aux type 'B'", type);
3176
12
                goto err;
3177
34.1k
            }
3178
3179
34.1k
            blen += 5; // sub-type & length
3180
34.1k
            if (aux_end - aux < blen || blen > INT_MAX)
3181
6
                goto err;
3182
3183
34.1k
            if (codec->encode(s, codec, aux, (int) blen) < 0)
3184
0
                goto err;
3185
34.1k
            aux += blen;
3186
34.1k
            break;
3187
34.1k
        }
3188
0
        default:
3189
0
            hts_log_error("Unknown aux type '%c'", aux_end - aux < 2 ? '?' : aux[2]);
3190
0
            goto err;
3191
889k
        }
3192
889k
        tm->blk->m = tm->m;
3193
889k
    }
3194
3195
    // FIXME: sort BLOCK_DATA(td_b) by char[3] triples
3196
3197
    // And and increment TD hash entry
3198
6.47M
    BLOCK_APPEND_CHAR(td_b, 0);
3199
3200
    // Duplicate key as BLOCK_DATA() can be realloced to a new pointer.
3201
6.47M
    key = string_ndup(c->comp_hdr->TD_keys,
3202
6.47M
                      (char *)BLOCK_DATA(td_b) + TD_blk_size,
3203
6.47M
                      BLOCK_SIZE(td_b) - TD_blk_size);
3204
6.47M
    if (!key)
3205
0
        goto block_err;
3206
6.47M
    k = kh_put(m_s2i, c->comp_hdr->TD_hash, key, &new);
3207
6.47M
    if (new < 0) {
3208
0
        goto err;
3209
6.47M
    } else if (new == 0) {
3210
6.40M
        BLOCK_SIZE(td_b) = TD_blk_size;
3211
6.40M
    } else {
3212
74.1k
        kh_val(c->comp_hdr->TD_hash, k) = c->comp_hdr->nTL;
3213
74.1k
        c->comp_hdr->nTL++;
3214
74.1k
    }
3215
3216
6.47M
    cr->TL = kh_val(c->comp_hdr->TD_hash, k);
3217
6.47M
    if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0)
3218
0
        goto block_err;
3219
3220
6.47M
    if (orig != (char *)bam_aux(b))
3221
94.2k
        free(orig);
3222
3223
6.47M
    if (err) *err = 0;
3224
3225
6.47M
    return brg;
3226
3227
140
 err:
3228
140
 block_err:
3229
140
    if (orig != (char *)bam_aux(b))
3230
25
        free(orig);
3231
140
    return NULL;
3232
140
}
3233
3234
/*
3235
 * During cram_next_container or before the final flush at end of
3236
 * file, we update the current slice headers and increment the slice
3237
 * number to the next slice.
3238
 *
3239
 * See cram_next_container() and cram_close().
3240
 */
3241
53.4k
void cram_update_curr_slice(cram_container *c, int version) {
3242
53.4k
    cram_slice *s = c->slice;
3243
53.4k
    if (c->multi_seq) {
3244
723
        s->hdr->ref_seq_id    = -2;
3245
723
        s->hdr->ref_seq_start = 0;
3246
723
        s->hdr->ref_seq_span  = 0;
3247
52.6k
    } else if (c->curr_ref == -1 && CRAM_ge31(version)) {
3248
        // Spec states span=0, but it broke our range queries.
3249
        // See commit message for this and prior.
3250
26.8k
        s->hdr->ref_seq_id    = -1;
3251
26.8k
        s->hdr->ref_seq_start = 0;
3252
26.8k
        s->hdr->ref_seq_span  = 0;
3253
26.8k
    } else {
3254
25.8k
        s->hdr->ref_seq_id    = c->curr_ref;
3255
25.8k
        s->hdr->ref_seq_start = c->first_base;
3256
25.8k
        s->hdr->ref_seq_span  = MAX(0, c->last_base - c->first_base + 1);
3257
25.8k
    }
3258
53.4k
    s->hdr->num_records   = c->curr_rec;
3259
3260
53.4k
    if (c->curr_slice == 0) {
3261
53.4k
        if (c->ref_seq_id != s->hdr->ref_seq_id)
3262
27.5k
            c->ref_seq_id  = s->hdr->ref_seq_id;
3263
53.4k
        c->ref_seq_start = c->first_base;
3264
53.4k
    }
3265
3266
53.4k
    c->curr_slice++;
3267
53.4k
}
3268
3269
/*
3270
 * Handles creation of a new container or new slice, flushing any
3271
 * existing containers when appropriate.
3272
 *
3273
 * Really this is next slice, which may or may not lead to a new container.
3274
 *
3275
 * Returns cram_container pointer on success
3276
 *         NULL on failure.
3277
 */
3278
53.5k
static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) {
3279
53.5k
    cram_container *c = fd->ctr;
3280
53.5k
    int i;
3281
3282
    /* First occurrence */
3283
53.5k
    if (c->curr_ref == -2)
3284
3.57k
        c->curr_ref = bam_ref(b);
3285
3286
53.5k
    if (c->slice)
3287
49.9k
        cram_update_curr_slice(c, fd->version);
3288
3289
    /* Flush container */
3290
53.5k
    if (c->curr_slice == c->max_slice ||
3291
49.9k
        (bam_ref(b) != c->curr_ref && !c->multi_seq)) {
3292
49.9k
        c->ref_seq_span = fd->last_base - c->ref_seq_start + 1;
3293
49.9k
        hts_log_info("Flush container %d/%"PRId64"..%"PRId64,
3294
49.9k
                     c->ref_seq_id, c->ref_seq_start,
3295
49.9k
                     c->ref_seq_start + c->ref_seq_span -1);
3296
3297
        /* Encode slices */
3298
49.9k
        if (-1 == cram_flush_container_mt(fd, c))
3299
112
            return NULL;
3300
49.8k
        if (!fd->pool) {
3301
            // Move to sep func, as we need cram_flush_container for
3302
            // the closing phase to flush the partial container.
3303
99.6k
            for (i = 0; i < c->max_slice; i++) {
3304
49.8k
                cram_free_slice(c->slices[i]);
3305
49.8k
                c->slices[i] = NULL;
3306
49.8k
            }
3307
3308
49.8k
            c->slice = NULL;
3309
49.8k
            c->curr_slice = 0;
3310
3311
            /* Easy approach for purposes of freeing stats */
3312
49.8k
            cram_free_container(c);
3313
49.8k
        }
3314
3315
49.8k
        c = fd->ctr = cram_new_container(fd->seqs_per_slice,
3316
49.8k
                                         fd->slices_per_container);
3317
49.8k
        if (!c)
3318
0
            return NULL;
3319
3320
49.8k
        pthread_mutex_lock(&fd->ref_lock);
3321
49.8k
        c->no_ref = fd->no_ref;
3322
49.8k
        c->embed_ref = fd->embed_ref;
3323
49.8k
        c->record_counter = fd->record_counter;
3324
49.8k
        pthread_mutex_unlock(&fd->ref_lock);
3325
49.8k
        c->curr_ref = bam_ref(b);
3326
49.8k
    }
3327
3328
53.4k
    c->last_pos = c->first_base = c->last_base = bam_pos(b)+1;
3329
3330
    /* New slice */
3331
53.4k
    c->slice = c->slices[c->curr_slice] =
3332
53.4k
        cram_new_slice(MAPPED_SLICE, c->max_rec);
3333
53.4k
    if (!c->slice)
3334
0
        return NULL;
3335
3336
53.4k
    if (c->multi_seq) {
3337
0
        c->slice->hdr->ref_seq_id = -2;
3338
0
        c->slice->hdr->ref_seq_start = 0;
3339
0
        c->slice->last_apos = 1;
3340
53.4k
    } else {
3341
53.4k
        c->slice->hdr->ref_seq_id = bam_ref(b);
3342
        // wrong for unsorted data, will fix during encoding.
3343
53.4k
        c->slice->hdr->ref_seq_start = bam_pos(b)+1;
3344
53.4k
        c->slice->last_apos = bam_pos(b)+1;
3345
53.4k
    }
3346
3347
53.4k
    c->curr_rec = 0;
3348
53.4k
    c->s_num_bases = 0;
3349
53.4k
    c->n_mapped = 0;
3350
3351
    // QO field: 0 implies original orientation, 1 implies sequence orientation
3352
    // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff.
3353
    // Conversely other data sets it's often better than 1% saving for 0.
3354
    // Short of trying both and learning, for now we use use 0 for V4, 1 for V3.
3355
53.4k
    c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1;
3356
3357
53.4k
    return c;
3358
53.4k
}
3359
3360
3361
/*
3362
 * Converts a single bam record into a cram record.
3363
 * Possibly used within a thread.
3364
 *
3365
 * Returns 0 on success;
3366
 *        -1 on failure
3367
 */
3368
static int process_one_read(cram_fd *fd, cram_container *c,
3369
                            cram_slice *s, cram_record *cr,
3370
                            bam_seq_t *b, int rnum, kstring_t *MD,
3371
6.47M
                            int embed_ref, int no_ref) {
3372
6.47M
    int i, fake_qual = -1, NM = 0;
3373
6.47M
    char *cp;
3374
6.47M
    char *ref, *seq, *qual;
3375
3376
    // Any places with N in seq and/or reference can lead to ambiguous
3377
    // interpretation of the SAM NM:i tag.  So we store these verbatim
3378
    // to ensure valid data round-trips the same regardless of who
3379
    // defines it as valid.
3380
    // Similarly when alignments go beyond end of the reference.
3381
6.47M
    int verbatim_NM = fd->store_nm;
3382
6.47M
    int verbatim_MD = fd->store_md;
3383
3384
    // FIXME: multi-ref containers
3385
3386
6.47M
    cr->flags       = bam_flag(b);
3387
6.47M
    cr->len         = bam_seq_len(b);
3388
6.47M
    uint8_t *md;
3389
6.47M
    if (!(md = bam_aux_get(b, "MD")))
3390
6.41M
        MD = NULL;
3391
64.9k
    else
3392
64.9k
        MD->l = 0;
3393
3394
6.47M
    int cf_tag = 0;
3395
3396
6.47M
    if (embed_ref == 2) {
3397
94.3k
        cf_tag  = MD ? 0 : 1;                   // No MD
3398
94.3k
        cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM
3399
94.3k
    }
3400
3401
    //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg);
3402
3403
6.47M
    ref = c->ref ? c->ref - (c->ref_start-1) : NULL;
3404
6.47M
    cr->ref_id      = bam_ref(b);
3405
6.47M
    if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0)
3406
0
        goto block_err;
3407
6.47M
    if (cram_stats_add(c->stats[DS_BF], fd->cram_flag_swap[cr->flags & 0xfff]) < 0)
3408
0
        goto block_err;
3409
3410
    // Non reference based encoding means storing the bases verbatim as features, which in
3411
    // turn means every base also has a quality already stored.
3412
6.47M
    if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3)
3413
6.47M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3414
3415
6.47M
    if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3)
3416
6.22M
        cr->cram_flags |= CRAM_FLAG_NO_SEQ;
3417
    //cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK);
3418
3419
6.47M
    c->num_bases   += cr->len;
3420
6.47M
    cr->apos        = bam_pos(b)+1;
3421
6.47M
    if (cr->apos < 0 || cr->apos > INT64_MAX/2)
3422
19
        goto err;
3423
6.47M
    if (c->pos_sorted) {
3424
6.45M
        if (cr->apos < s->last_apos && !fd->ap_delta) {
3425
330
            c->pos_sorted = 0;
3426
6.45M
        } else {
3427
6.45M
            if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0)
3428
0
                goto block_err;
3429
6.45M
            s->last_apos = cr->apos;
3430
6.45M
        }
3431
6.45M
    } else {
3432
        //cram_stats_add(c->stats[DS_AP], cr->apos);
3433
17.4k
    }
3434
6.47M
    c->max_apos += (cr->apos > c->max_apos) * (cr->apos - c->max_apos);
3435
3436
    /*
3437
     * This seqs_ds is largely pointless and it could reuse the same memory
3438
     * over and over.
3439
     * s->base_blk is what we need for encoding.
3440
     */
3441
6.47M
    cr->seq         = BLOCK_SIZE(s->seqs_blk);
3442
6.47M
    cr->qual        = BLOCK_SIZE(s->qual_blk);
3443
6.47M
    BLOCK_GROW(s->seqs_blk, cr->len+1);
3444
6.47M
    BLOCK_GROW(s->qual_blk, cr->len);
3445
3446
    // Convert BAM nibble encoded sequence to string of base pairs
3447
6.47M
    seq = cp = (char *)BLOCK_END(s->seqs_blk);
3448
6.47M
    *seq = 0;
3449
6.47M
    nibble2base(bam_seq(b), cp, cr->len);
3450
6.47M
    BLOCK_SIZE(s->seqs_blk) += cr->len;
3451
3452
6.47M
    qual = cp = (char *)bam_qual(b);
3453
3454
3455
    /* Copy and parse */
3456
6.47M
    if (!(cr->flags & BAM_FUNMAP)) {
3457
101k
        uint32_t *cig_to, *cig_from;
3458
101k
        int64_t apos = cr->apos-1, spos = 0;
3459
101k
        int64_t MD_last = apos; // last position of edit in MD tag
3460
3461
101k
        if (apos < 0) {
3462
0
            hts_log_error("Mapped read with position <= 0 is disallowed");
3463
0
            return -1;
3464
0
        }
3465
3466
101k
        cr->cigar       = s->ncigar;
3467
101k
        cr->ncigar      = bam_cigar_len(b);
3468
101k
        while (cr->cigar + cr->ncigar >= s->cigar_alloc) {
3469
170
            s->cigar_alloc = s->cigar_alloc ? s->cigar_alloc*2 : 1024;
3470
170
            s->cigar = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
3471
170
            if (!s->cigar)
3472
0
                return -1;
3473
170
        }
3474
3475
101k
        cig_to = (uint32_t *)s->cigar;
3476
101k
        cig_from = (uint32_t *)bam_cigar(b);
3477
3478
101k
        cr->feature = 0;
3479
101k
        cr->nfeature = 0;
3480
326k
        for (i = 0; i < cr->ncigar; i++) {
3481
225k
            enum cigar_op cig_op = cig_from[i] & BAM_CIGAR_MASK;
3482
225k
            uint32_t cig_len = cig_from[i] >> BAM_CIGAR_SHIFT;
3483
225k
            cig_to[i] = cig_from[i];
3484
3485
            /* Can also generate events from here for CRAM diffs */
3486
3487
225k
            switch (cig_op) {
3488
0
                int l;
3489
3490
                // Don't trust = and X ops to be correct.
3491
22.1k
            case BAM_CMATCH:
3492
22.3k
            case BAM_CBASE_MATCH:
3493
28.7k
            case BAM_CBASE_MISMATCH:
3494
                //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n",
3495
                //      cig_len, &ref[apos], cig_len, &seq[spos]);
3496
28.7k
                l = 0;
3497
28.7k
                if (!no_ref && cr->len) {
3498
19.7k
                    int end = cig_len+apos < c->ref_end
3499
19.7k
                        ? cig_len : c->ref_end - apos;
3500
19.7k
                    char *sp = &seq[spos];
3501
19.7k
                    char *rp = &ref[apos];
3502
19.7k
                    char *qp = &qual[spos];
3503
19.7k
                    if (end > cr->len) {
3504
0
                        hts_log_error("CIGAR and query sequence are of different length");
3505
0
                        return -1;
3506
0
                    }
3507
32.0k
                    for (l = 0; l < end; l++) {
3508
                        // This case is just too disputed and different tools
3509
                        // interpret these in different ways.  We give up and
3510
                        // store verbatim.
3511
12.2k
                        if (rp[l] == 'N' && sp[l] == 'N')
3512
367
                            verbatim_NM = verbatim_MD = 1;
3513
12.2k
                        if (rp[l] != sp[l]) {
3514
                            // Build our own MD tag if one is on the sequence, so
3515
                            // we can ensure it matches and thus can be discarded.
3516
6.64k
                            if (MD && ref) {
3517
4.98k
                                if (kputuw(apos+l - MD_last, MD) < 0) goto err;
3518
4.98k
                                if (kputc(rp[l], MD) < 0) goto err;
3519
4.98k
                                MD_last = apos+l+1;
3520
4.98k
                            }
3521
6.64k
                            NM++;
3522
6.64k
                            if (!sp[l])
3523
0
                                break;
3524
6.64k
                            if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) {
3525
#if 0
3526
                                // Disabled for the time being as it doesn't
3527
                                // seem to gain us much.
3528
                                int ol=l;
3529
                                while (l<end && rp[l] != sp[l])
3530
                                    l++;
3531
                                if (l-ol > 1) {
3532
                                    if (cram_add_bases(fd, c, s, cr, spos+ol,
3533
                                                       l-ol, &seq[spos+ol]))
3534
                                        return -1;
3535
                                    l--;
3536
                                } else {
3537
                                    l = ol;
3538
                                    if (cram_add_substitution(fd, c, s, cr,
3539
                                                              spos+l, sp[l],
3540
                                                              qp[l], rp[l]))
3541
                                        return -1;
3542
                                }
3543
#else
3544
                                // With urmap pushed to the limit and lots
3545
                                // of unaligned data (should be soft-clipped)
3546
                                // this saves ~2-7%. Worth it?
3547
0
                                int nl = l;
3548
0
                                int max_end = nl, max_score = 0, score = 0;
3549
0
                                while (nl < end) {
3550
0
                                    if (rp[nl] != sp[nl]) {
3551
0
                                        score += 3;
3552
0
                                        if (max_score < score) {
3553
0
                                            max_score = score;
3554
0
                                            max_end = nl;
3555
0
                                        }
3556
0
                                    } else {
3557
0
                                        score--;
3558
0
                                        if (score < -2 ||
3559
0
                                            max_score - score > 7)
3560
0
                                            break;
3561
0
                                    }
3562
0
                                    nl++;
3563
0
                                }
3564
0
                                if (max_score > 20) {
3565
0
                                    cram_add_bases(fd, c, s, cr, spos+l,
3566
0
                                                   max_end-l, &seq[spos+l]);
3567
0
                                    l = max_end-1;
3568
0
                                } else {
3569
0
                                    while (l < nl) {
3570
0
                                        if (rp[l] != sp[l])
3571
0
                                            cram_add_substitution(fd, c, s,
3572
0
                                                                  cr, spos+l,
3573
0
                                                                  sp[l], qp[l],
3574
0
                                                                  rp[l]);
3575
0
                                        l++;
3576
0
                                    }
3577
0
                                    l--;
3578
0
                                }
3579
0
#endif
3580
6.64k
                            } else {
3581
6.64k
                                if (cram_add_substitution(fd, c, s, cr, spos+l,
3582
6.64k
                                                          sp[l], qp[l], rp[l]))
3583
0
                                    return -1;
3584
6.64k
                            }
3585
6.64k
                        }
3586
12.2k
                    }
3587
19.7k
                    spos += l;
3588
19.7k
                    apos += l;
3589
19.7k
                }
3590
3591
28.7k
                if (l < cig_len && cr->len) {
3592
2.93k
                    if (no_ref) {
3593
1.12k
                        if (CRAM_MAJOR_VERS(fd->version) == 3) {
3594
1.12k
                            if (cram_add_bases(fd, c, s, cr, spos,
3595
1.12k
                                               cig_len-l, &seq[spos]))
3596
0
                                return -1;
3597
1.12k
                            spos += cig_len-l;
3598
1.12k
                        } else {
3599
0
                            for (; l < cig_len && seq[spos]; l++, spos++) {
3600
0
                                if (cram_add_base(fd, c, s, cr, spos,
3601
0
                                                  seq[spos], qual[spos]))
3602
0
                                    return -1;
3603
0
                            }
3604
0
                        }
3605
1.80k
                    } else {
3606
                        /* off end of sequence or non-ref based output */
3607
1.80k
                        verbatim_NM = verbatim_MD = 1;
3608
3.63k
                        for (; l < cig_len && seq[spos]; l++, spos++) {
3609
1.82k
                            if (cram_add_base(fd, c, s, cr, spos,
3610
1.82k
                                              seq[spos], qual[spos]))
3611
0
                                return -1;
3612
1.82k
                        }
3613
1.80k
                    }
3614
2.93k
                    apos += cig_len;
3615
25.8k
                } else if (!cr->len) {
3616
                    /* Seq "*" */
3617
7.42k
                    verbatim_NM = verbatim_MD = 1;
3618
7.42k
                    apos += cig_len;
3619
7.42k
                    spos += cig_len;
3620
7.42k
                }
3621
28.7k
                break;
3622
3623
84.1k
            case BAM_CDEL:
3624
84.1k
                if (MD && ref) {
3625
44.1k
                    if (kputuw(apos - MD_last, MD) < 0) goto err;
3626
44.1k
                    if (apos < c->ref_end) {
3627
27.3k
                        if (kputc_('^', MD) < 0) goto err;
3628
27.3k
                        if (kputsn(&ref[apos], MIN(c->ref_end - apos, cig_len), MD) < 0)
3629
0
                            goto err;
3630
27.3k
                    }
3631
44.1k
                }
3632
84.1k
                NM += cig_len;
3633
3634
84.1k
                if (cram_add_deletion(c, s, cr, spos, cig_len, &seq[spos]))
3635
0
                    return -1;
3636
84.1k
                apos += cig_len;
3637
84.1k
                MD_last = apos;
3638
84.1k
                break;
3639
3640
527
            case BAM_CREF_SKIP:
3641
527
                if (cram_add_skip(c, s, cr, spos, cig_len, &seq[spos]))
3642
0
                    return -1;
3643
527
                apos += cig_len;
3644
527
                MD_last += cig_len;
3645
527
                break;
3646
3647
611
            case BAM_CINS:
3648
611
                if (cram_add_insertion(c, s, cr, spos, cig_len,
3649
611
                                       cr->len ? &seq[spos] : NULL))
3650
0
                    return -1;
3651
611
                if (no_ref && cr->len) {
3652
178
                    for (l = 0; l < cig_len; l++, spos++) {
3653
63
                        cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3654
63
                    }
3655
496
                } else {
3656
496
                    spos += cig_len;
3657
496
                }
3658
611
                NM += cig_len;
3659
611
                break;
3660
3661
67.5k
            case BAM_CSOFT_CLIP:
3662
67.5k
                if (cram_add_softclip(c, s, cr, spos, cig_len,
3663
67.5k
                                      cr->len ? &seq[spos] : NULL,
3664
67.5k
                                      fd->version))
3665
0
                    return -1;
3666
3667
67.5k
                if (no_ref &&
3668
7.23k
                    !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) {
3669
0
                    if (cr->len) {
3670
0
                        for (l = 0; l < cig_len; l++, spos++) {
3671
0
                            cram_add_quality(fd, c, s, cr, spos, qual[spos]);
3672
0
                        }
3673
0
                    } else {
3674
0
                        for (l = 0; l < cig_len; l++, spos++) {
3675
0
                            cram_add_quality(fd, c, s, cr, spos, -1);
3676
0
                        }
3677
0
                    }
3678
67.5k
                } else {
3679
67.5k
                    spos += cig_len;
3680
67.5k
                }
3681
67.5k
                break;
3682
3683
42.9k
            case BAM_CHARD_CLIP:
3684
42.9k
                if (cram_add_hardclip(c, s, cr, spos, cig_len, &seq[spos]))
3685
0
                    return -1;
3686
42.9k
                break;
3687
3688
42.9k
            case BAM_CPAD:
3689
490
                if (cram_add_pad(c, s, cr, spos, cig_len, &seq[spos]))
3690
0
                    return -1;
3691
490
                break;
3692
3693
490
            default:
3694
128
                hts_log_error("Unknown CIGAR op code %d", cig_op);
3695
128
                return -1;
3696
225k
            }
3697
225k
        }
3698
101k
        if (cr->len && spos != cr->len) {
3699
1
            hts_log_error("CIGAR and query sequence are of different length");
3700
1
            return -1;
3701
1
        }
3702
101k
        fake_qual = spos;
3703
        // Protect against negative length refs (fuzz 382922241)
3704
101k
        cr->aend = no_ref ? apos : MIN(apos, MAX(0, c->ref_end));
3705
101k
        if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0)
3706
0
            goto block_err;
3707
3708
101k
        if (MD && ref)
3709
46.4k
            if (kputuw(apos - MD_last, MD) < 0) goto err;
3710
6.37M
    } else {
3711
        // Unmapped
3712
6.37M
        cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES;
3713
6.37M
        cr->cigar  = 0;
3714
6.37M
        cr->ncigar = 0;
3715
6.37M
        cr->nfeature = 0;
3716
6.37M
        cr->aend = MIN(cr->apos, c->ref_end);
3717
447M
        for (i = 0; i < cr->len; i++)
3718
441M
            if (cram_stats_add(c->stats[DS_BA], seq[i]) < 0)
3719
0
                goto block_err;
3720
6.37M
        fake_qual = 0;
3721
6.37M
    }
3722
3723
6.47M
    cr->ntags      = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags);
3724
6.47M
    int err = 0;
3725
6.47M
    sam_hrec_rg_t *brg =
3726
6.47M
        cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD,
3727
6.47M
                        cf_tag, no_ref, &err);
3728
6.47M
    if (err)
3729
140
        goto block_err;
3730
3731
    /* Read group, identified earlier */
3732
6.47M
    if (brg) {
3733
269
        cr->rg = brg->id;
3734
6.47M
    } else if (CRAM_MAJOR_VERS(fd->version) == 1) {
3735
0
        sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN");
3736
0
        if (!brg) goto block_err;
3737
0
        cr->rg = brg->id;
3738
6.47M
    } else {
3739
6.47M
        cr->rg = -1;
3740
6.47M
    }
3741
6.47M
    if (cram_stats_add(c->stats[DS_RG], cr->rg) < 0)
3742
0
        goto block_err;
3743
3744
    /*
3745
     * Append to the qual block now. We do this here as
3746
     * cram_add_substitution() can generate BA/QS events which need to
3747
     * be in the qual block before we append the rest of the data.
3748
     */
3749
6.47M
    if (cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES) {
3750
        /* Special case of seq "*" */
3751
6.47M
        if (cr->len == 0) {
3752
6.22M
            cr->len = fake_qual;
3753
6.22M
            BLOCK_GROW(s->qual_blk, cr->len);
3754
6.22M
            cp = (char *)BLOCK_END(s->qual_blk);
3755
6.22M
            memset(cp, 255, cr->len);
3756
6.22M
        } else {
3757
248k
            BLOCK_GROW(s->qual_blk, cr->len);
3758
248k
            cp = (char *)BLOCK_END(s->qual_blk);
3759
248k
            char *from = (char *)&bam_qual(b)[0];
3760
248k
            char *to = &cp[0];
3761
248k
            memcpy(to, from, cr->len);
3762
3763
            // Store quality in original orientation for better compression.
3764
248k
            if (!c->qs_seq_orient) {
3765
0
                if (cr->flags & BAM_FREVERSE) {
3766
0
                    int i, j;
3767
0
                    for (i = 0, j = cr->len-1; i < j; i++, j--) {
3768
0
                        unsigned char c;
3769
0
                        c = to[i];
3770
0
                        to[i] = to[j];
3771
0
                        to[j] = c;
3772
0
                    }
3773
0
                }
3774
0
            }
3775
248k
        }
3776
6.47M
        BLOCK_SIZE(s->qual_blk) += cr->len;
3777
6.47M
    } else {
3778
0
        if (cr->len == 0)
3779
0
            cr->len = fake_qual >= 0 ? fake_qual : cr->aend - cr->apos + 1;
3780
0
    }
3781
3782
6.47M
    if (cram_stats_add(c->stats[DS_RL], cr->len) < 0)
3783
0
        goto block_err;
3784
3785
    /* Now we know apos and aend both, update mate-pair information */
3786
6.47M
    {
3787
6.47M
        int new;
3788
6.47M
        khint_t k;
3789
6.47M
        int sec = (cr->flags & BAM_FSECONDARY) ? 1 : 0;
3790
3791
        //fprintf(stderr, "Checking %"PRId64"/%.*s\t", rnum,
3792
        //      cr->name_len, DSTRING_STR(s->name_ds)+cr->name);
3793
6.47M
        if (cr->flags & BAM_FPAIRED) {
3794
40.5k
            char *key = string_ndup(s->pair_keys, bam_name(b), bam_name_len(b));
3795
40.5k
            if (!key)
3796
0
                return -1;
3797
3798
40.5k
            k = kh_put(m_s2i, s->pair[sec], key, &new);
3799
40.5k
            if (-1 == new)
3800
0
                return -1;
3801
40.5k
            else if (new > 0)
3802
7.33k
                kh_val(s->pair[sec], k) = rnum
3803
7.33k
                    | ((unsigned)((cr->flags & BAM_FREAD1)!=0)<<30)
3804
7.33k
                    | ((unsigned)((cr->flags & BAM_FREAD2)!=0)<<31);
3805
6.43M
        } else {
3806
6.43M
            new = 1;
3807
6.43M
            k = 0; // Prevents false-positive warning from gcc -Og
3808
6.43M
        }
3809
3810
6.47M
        if (new == 0) {
3811
33.1k
            cram_record *p = &s->crecs[kh_val(s->pair[sec], k) & ((1<<30)-1)];
3812
33.1k
            int64_t aleft, aright;
3813
33.1k
            int sign;
3814
3815
33.1k
            aleft = MIN(cr->apos, p->apos);
3816
33.1k
            aright = MAX(cr->aend, p->aend);
3817
33.1k
            if (cr->apos < p->apos) {
3818
772
                sign = 1;
3819
32.4k
            } else if (cr->apos > p->apos) {
3820
509
                sign = -1;
3821
31.9k
            } else if (cr->flags & BAM_FREAD1) {
3822
15.9k
                sign = 1;
3823
16.0k
            } else {
3824
16.0k
                sign = -1;
3825
16.0k
            }
3826
3827
            // Multiple sets of secondary reads means we cannot tell which is
3828
            // which, so we store TLEN etc verbatim.
3829
33.1k
            int has_r1 = kh_val(s->pair[sec], k) & (1<<30);
3830
33.1k
            unsigned int has_r2 = kh_val(s->pair[sec], k) & (1u<<31);
3831
33.1k
            if ((has_r1 && (cr->flags & BAM_FREAD1)) ||
3832
17.7k
                (has_r2 && (cr->flags & BAM_FREAD2)))
3833
18.9k
                goto detached;
3834
3835
            // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/-
3836
            // a small amount, if appropriate options set.
3837
14.2k
            if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) &&
3838
13.4k
                !(fd->tlen_zero && bam_mate_pos(b) == 0))
3839
13.4k
                goto detached;
3840
3841
732
            if (((bam_flag(b) & BAM_FMUNMAP) != 0) !=
3842
732
                ((p->flags & BAM_FUNMAP) != 0))
3843
175
                goto detached;
3844
3845
557
            if (((bam_flag(b) & BAM_FMREVERSE) != 0) !=
3846
557
                ((p->flags & BAM_FREVERSE) != 0))
3847
28
                goto detached;
3848
3849
3850
            // p vs this: tlen, matepos, flags
3851
529
            if (p->ref_id != cr->ref_id &&
3852
19
                !(fd->tlen_zero && p->ref_id == -1))
3853
19
                goto detached;
3854
3855
510
            if (p->mate_pos != cr->apos &&
3856
23
                !(fd->tlen_zero && p->mate_pos == 0))
3857
23
                goto detached;
3858
3859
487
            if (((p->flags & BAM_FMUNMAP) != 0) !=
3860
487
                ((p->mate_flags & CRAM_M_UNMAP) != 0))
3861
3
                goto detached;
3862
3863
484
            if (((p->flags & BAM_FMREVERSE) != 0) !=
3864
484
                ((p->mate_flags & CRAM_M_REVERSE) != 0))
3865
2
                goto detached;
3866
3867
            // Supplementary reads are just too ill defined
3868
482
            if ((cr->flags & BAM_FSUPPLEMENTARY) ||
3869
482
                (p->flags & BAM_FSUPPLEMENTARY))
3870
0
                goto detached;
3871
3872
            // When in lossy name mode, if a read isn't detached we
3873
            // cannot store the name.  The corollary is that when we
3874
            // must store the name, it must be detached (inefficient).
3875
482
            if (fd->lossy_read_names &&
3876
0
                (!(cr->cram_flags & CRAM_FLAG_DISCARD_NAME) ||
3877
0
                 !((p->cram_flags & CRAM_FLAG_DISCARD_NAME))))
3878
0
                goto detached;
3879
3880
            // Now check TLEN.  We do this last as sometimes it's the
3881
            // only thing that differs.  In CRAM4 we have a better way
3882
            // of handling this that doesn't break detached status
3883
482
            int explicit_tlen = 0;
3884
482
            int tflag1 = ((bam_ins_size(b) &&
3885
23
                           llabs(bam_ins_size(b) - sign*(aright-aleft+1))
3886
23
                           > fd->tlen_approx)
3887
465
                          || (!bam_ins_size(b) && !fd->tlen_zero));
3888
3889
482
            int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1))
3890
23
                           > fd->tlen_approx)
3891
467
                          || (!p->tlen && !fd->tlen_zero));
3892
3893
482
            if (tflag1 || tflag2) {
3894
480
                if (CRAM_MAJOR_VERS(fd->version) >= 4) {
3895
0
                    explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN;
3896
480
                } else {
3897
                    // Stil do detached for unmapped data in CRAM4 as this
3898
                    // also impacts RNEXT calculation.
3899
480
                    goto detached;
3900
480
                }
3901
480
            }
3902
3903
            /*
3904
             * The fields below are unused when encoding this read as it is
3905
             * no longer detached.  In theory they may get referred to when
3906
             * processing a 3rd or 4th read in this template?, so we set them
3907
             * here just to be sure.
3908
             *
3909
             * They do not need cram_stats_add() calls those as they are
3910
             * not emitted.
3911
             */
3912
2
            cr->mate_pos = p->apos;
3913
2
            cram_stats_add(c->stats[DS_NP], cr->mate_pos);
3914
2
            cr->tlen = explicit_tlen ? bam_ins_size(b) : sign*(aright-aleft+1);
3915
2
            cram_stats_add(c->stats[DS_TS], cr->tlen);
3916
2
            cr->mate_flags =
3917
2
                ((p->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)   * CRAM_M_UNMAP +
3918
2
                ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE;
3919
3920
            // Decrement statistics aggregated earlier
3921
2
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3922
2
                cram_stats_del(c->stats[DS_NP], p->mate_pos);
3923
2
                cram_stats_del(c->stats[DS_MF], p->mate_flags);
3924
2
                if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN))
3925
2
                    cram_stats_del(c->stats[DS_TS], p->tlen);
3926
2
                cram_stats_del(c->stats[DS_NS], p->mate_ref_id);
3927
2
            }
3928
3929
            /* Similarly we could correct the p-> values too, but these will no
3930
             * longer have any code that refers back to them as the new 'p'
3931
             * for this template is our current 'cr'.
3932
             */
3933
            //p->mate_pos = cr->apos;
3934
            //p->mate_flags =
3935
            //  ((cr->flags & BAM_FMUNMAP)   == BAM_FMUNMAP)  * CRAM_M_UNMAP +
3936
            //  ((cr->flags & BAM_FMREVERSE) == BAM_FMREVERSE)* CRAM_M_REVERSE;
3937
            //p->tlen = p->apos - cr->aend;
3938
3939
            // Clear detached from cr flags
3940
2
            cr->cram_flags &= ~CRAM_FLAG_DETACHED;
3941
2
            cr->cram_flags |= explicit_tlen;
3942
2
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3943
0
                goto block_err;
3944
3945
            // Clear detached from p flags and set downstream
3946
2
            if (p->cram_flags & CRAM_FLAG_STATS_ADDED) {
3947
2
                cram_stats_del(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK);
3948
2
                p->cram_flags &= ~CRAM_FLAG_STATS_ADDED;
3949
2
            }
3950
3951
2
            p->cram_flags  &= ~CRAM_FLAG_DETACHED;
3952
2
            p->cram_flags  |=  CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;;
3953
2
            if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0)
3954
0
                goto block_err;
3955
3956
2
            p->mate_line = rnum - ((kh_val(s->pair[sec], k) & ((1<<30)-1)) + 1);
3957
2
            if (cram_stats_add(c->stats[DS_NF], p->mate_line) < 0)
3958
0
                goto block_err;
3959
3960
2
            int r12_flags = kh_val(s->pair[sec], k) & (3u<<30);
3961
2
            kh_val(s->pair[sec], k) = (unsigned int)rnum | r12_flags
3962
2
                | (((cr->flags & BAM_FREAD1)!=0)<<30)
3963
2
                | ((unsigned)((cr->flags & BAM_FREAD2)!=0)<<31);
3964
6.44M
        } else {
3965
6.47M
        detached:
3966
            //fprintf(stderr, "unpaired\n");
3967
3968
            /* Derive mate flags from this flag */
3969
6.47M
            cr->mate_flags = 0;
3970
6.47M
            if (bam_flag(b) & BAM_FMUNMAP)
3971
22.1k
                cr->mate_flags |= CRAM_M_UNMAP;
3972
6.47M
            if (bam_flag(b) & BAM_FMREVERSE)
3973
145
                cr->mate_flags |= CRAM_M_REVERSE;
3974
3975
6.47M
            if (cram_stats_add(c->stats[DS_MF], cr->mate_flags) < 0)
3976
0
                goto block_err;
3977
3978
6.47M
            cr->mate_pos    = MAX(bam_mate_pos(b)+1, 0);
3979
6.47M
            if (cram_stats_add(c->stats[DS_NP], cr->mate_pos) < 0)
3980
0
                goto block_err;
3981
3982
6.47M
            cr->tlen        = bam_ins_size(b);
3983
6.47M
            if (cram_stats_add(c->stats[DS_TS], cr->tlen) < 0)
3984
0
                goto block_err;
3985
3986
6.47M
            cr->cram_flags |= CRAM_FLAG_DETACHED;
3987
6.47M
            if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0)
3988
0
                goto block_err;
3989
6.47M
            if (cram_stats_add(c->stats[DS_NS], bam_mate_ref(b)) < 0)
3990
0
                goto block_err;
3991
3992
6.47M
            cr->cram_flags |= CRAM_FLAG_STATS_ADDED;
3993
6.47M
        }
3994
6.47M
    }
3995
3996
6.47M
    cr->mqual       = bam_map_qual(b);
3997
6.47M
    if (cram_stats_add(c->stats[DS_MQ], cr->mqual) < 0)
3998
0
        goto block_err;
3999
4000
6.47M
    cr->mate_ref_id = bam_mate_ref(b);
4001
4002
6.47M
    if (!(bam_flag(b) & BAM_FUNMAP)) {
4003
101k
        if (c->first_base > cr->apos)
4004
174
            c->first_base = cr->apos;
4005
4006
101k
        if (c->last_base < cr->aend)
4007
4.77k
            c->last_base = cr->aend;
4008
101k
    }
4009
4010
6.47M
    return 0;
4011
4012
140
 block_err:
4013
159
 err:
4014
159
    return -1;
4015
140
}
4016
4017
/*
4018
 * Write iterator: put BAM format sequences into a CRAM file.
4019
 * We buffer up a containers worth of data at a time.
4020
 *
4021
 * Returns 0 on success
4022
 *        -1 on failure
4023
 */
4024
6.47M
int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) {
4025
6.47M
    cram_container *c;
4026
4027
6.47M
    if (!fd->ctr) {
4028
3.57k
        fd->ctr = cram_new_container(fd->seqs_per_slice,
4029
3.57k
                                     fd->slices_per_container);
4030
3.57k
        if (!fd->ctr)
4031
0
            return -1;
4032
3.57k
        fd->ctr->record_counter = fd->record_counter;
4033
4034
3.57k
        pthread_mutex_lock(&fd->ref_lock);
4035
3.57k
        fd->ctr->no_ref = fd->no_ref;
4036
3.57k
        fd->ctr->embed_ref = fd->embed_ref;
4037
3.57k
        pthread_mutex_unlock(&fd->ref_lock);
4038
3.57k
    }
4039
6.47M
    c = fd->ctr;
4040
4041
6.47M
    int embed_ref = c->embed_ref;
4042
4043
6.47M
    if (!c->slice || c->curr_rec == c->max_rec ||
4044
6.47M
        (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) ||
4045
6.41M
        (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) {
4046
60.7k
        int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1;
4047
60.7k
        int curr_ref = c->slice ? c->curr_ref : bam_ref(b);
4048
4049
        /*
4050
         * Start packing slices when we routinely have under 1/4tr full.
4051
         *
4052
         * This option isn't available if we choose to embed references
4053
         * since we can only have one per slice.
4054
         *
4055
         * The multi_seq var here refers to our intention for the next slice.
4056
         * This slice has already been encoded so we output as-is.
4057
         */
4058
60.7k
        if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 &&
4059
52.6k
            fd->last_slice && fd->last_slice < c->max_rec/4+10 &&
4060
48.0k
            embed_ref<=0) {
4061
641
            if (!c->multi_seq)
4062
460
                hts_log_info("Multi-ref enabled for next container");
4063
641
            multi_seq = 1;
4064
60.1k
        } else if (fd->multi_seq == 1) {
4065
7.49k
            pthread_mutex_lock(&fd->metrics_lock);
4066
7.49k
            if (fd->last_RI_count <= c->max_slice && fd->multi_seq_user != 1) {
4067
398
                multi_seq = 0;
4068
398
                hts_log_info("Multi-ref disabled for next container");
4069
398
            }
4070
7.49k
            pthread_mutex_unlock(&fd->metrics_lock);
4071
7.49k
        }
4072
4073
60.7k
        slice_rec = c->slice_rec;
4074
60.7k
        curr_rec  = c->curr_rec;
4075
4076
60.7k
        if (CRAM_MAJOR_VERS(fd->version) == 1 ||
4077
60.7k
            c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice ||
4078
53.5k
            c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) {
4079
53.5k
            if (NULL == (c = cram_next_container(fd, b))) {
4080
112
                if (fd->ctr) {
4081
                    // prevent cram_close attempting to flush
4082
112
                    fd->ctr_mt = fd->ctr; // delay free when threading
4083
112
                    fd->ctr = NULL;
4084
112
                }
4085
112
                return -1;
4086
112
            }
4087
53.5k
        }
4088
4089
        /*
4090
         * Due to our processing order, some things we've already done we
4091
         * cannot easily undo. So when we first notice we should be packing
4092
         * multiple sequences per container we emit the small partial
4093
         * container as-is and then start a fresh one in a different mode.
4094
         */
4095
60.6k
        if (multi_seq == 0 && fd->multi_seq == 1 && fd->multi_seq_user == -1) {
4096
            // User selected auto-mode, we're currently using multi-seq, but
4097
            // have detected we don't need to.  Switch back to auto.
4098
397
            fd->multi_seq = -1;
4099
60.2k
        } else if (multi_seq) {
4100
            // We detected we need multi-seq
4101
7.73k
            fd->multi_seq = 1;
4102
7.73k
            c->multi_seq = 1;
4103
7.73k
            c->pos_sorted = 0;
4104
4105
            // Cram_next_container may end up flushing an existing one and
4106
            // triggering fd->embed_ref=2 if no reference is found.
4107
            // Embedded refs are incompatible with multi-seq, so we bail
4108
            // out and switch to no_ref in this scenario.  We do this
4109
            // within the container only, as multi_seq may be temporary
4110
            // and we switch back away from it again.
4111
7.73k
            pthread_mutex_lock(&fd->ref_lock);
4112
7.73k
            if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) {
4113
109
                hts_log_warning("Changing from embed_ref to no_ref mode");
4114
                // Should we update fd->embed_ref and no_ref here too?
4115
                // Doing so means if we go into multi-seq and back out
4116
                // again, eg due a cluster of tiny refs in the middle of
4117
                // much larger ones, then we bake in no-ref mode.
4118
                //
4119
                // However for unsorted data we're realistically not
4120
                // going to switch back.
4121
109
                c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
4122
109
                c->no_ref = fd->no_ref = 1;
4123
109
            }
4124
7.73k
            pthread_mutex_unlock(&fd->ref_lock);
4125
4126
7.73k
            if (!c->refs_used) {
4127
723
                pthread_mutex_lock(&fd->ref_lock);
4128
723
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4129
723
                pthread_mutex_unlock(&fd->ref_lock);
4130
723
                if (!c->refs_used)
4131
0
                    return -1;
4132
723
            }
4133
7.73k
        }
4134
4135
60.6k
        fd->last_slice = curr_rec - slice_rec;
4136
60.6k
        c->slice_rec = c->curr_rec;
4137
4138
        // Have we seen this reference before?
4139
60.6k
        if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref &&
4140
0
            embed_ref<=0 && !fd->unsorted && multi_seq) {
4141
4142
0
            if (!c->refs_used) {
4143
0
                pthread_mutex_lock(&fd->ref_lock);
4144
0
                c->refs_used = calloc(fd->refs->nref, sizeof(int));
4145
0
                pthread_mutex_unlock(&fd->ref_lock);
4146
0
                if (!c->refs_used)
4147
0
                    return -1;
4148
0
            } else if (c->refs_used && c->refs_used[bam_ref(b)]) {
4149
0
                pthread_mutex_lock(&fd->ref_lock);
4150
0
                fd->unsorted = 1;
4151
0
                fd->multi_seq = 1;
4152
0
                pthread_mutex_unlock(&fd->ref_lock);
4153
0
            }
4154
0
        }
4155
4156
60.6k
        c->curr_ref = bam_ref(b);
4157
60.6k
        if (c->refs_used && c->curr_ref >= 0) c->refs_used[c->curr_ref]++;
4158
60.6k
    }
4159
4160
6.47M
    if (!c->bams) {
4161
        /* First time through, allocate a set of bam pointers */
4162
53.4k
        pthread_mutex_lock(&fd->bam_list_lock);
4163
53.4k
        if (fd->bl) {
4164
49.8k
            spare_bams *spare = fd->bl;
4165
49.8k
            c->bams = spare->bams;
4166
49.8k
            fd->bl = spare->next;
4167
49.8k
            free(spare);
4168
49.8k
        } else {
4169
3.57k
            c->bams = calloc(c->max_c_rec, sizeof(bam_seq_t *));
4170
3.57k
            if (!c->bams) {
4171
0
                pthread_mutex_unlock(&fd->bam_list_lock);
4172
0
                return -1;
4173
0
            }
4174
3.57k
        }
4175
53.4k
        pthread_mutex_unlock(&fd->bam_list_lock);
4176
53.4k
    }
4177
4178
    /* Copy or alloc+copy the bam record, for later encoding */
4179
6.47M
    if (c->bams[c->curr_c_rec]) {
4180
5.19M
        if (bam_copy1(c->bams[c->curr_c_rec], b) == NULL)
4181
0
            return -1;
4182
5.19M
    } else {
4183
1.28M
        c->bams[c->curr_c_rec] = bam_dup1(b);
4184
1.28M
        if (c->bams[c->curr_c_rec] == NULL)
4185
0
            return -1;
4186
1.28M
    }
4187
6.47M
    if (bam_seq_len(b)) {
4188
248k
        c->s_num_bases += bam_seq_len(b);
4189
6.22M
    } else {
4190
        // No sequence in BAM record.  CRAM doesn't directly support this
4191
        // case, it ends up being stored as a string of N's for each query
4192
        // consuming CIGAR operation.  As this can become very inefficient
4193
        // in time and memory, data where the query length is excessively
4194
        // long are rejected.
4195
6.22M
        hts_pos_t qlen = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
4196
6.22M
        if (qlen > 100000000) {
4197
64
            hts_log_error("CIGAR query length %"PRIhts_pos
4198
64
                          " for read \"%s\" is too long",
4199
64
                          qlen, bam_get_qname(b));
4200
64
            return -1;
4201
64
        }
4202
6.22M
        c->s_num_bases += qlen;
4203
6.22M
    }
4204
6.47M
    c->curr_rec++;
4205
6.47M
    c->curr_c_rec++;
4206
6.47M
    c->s_aux_bytes += bam_get_l_aux(b);
4207
6.47M
    c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1;
4208
6.47M
    fd->record_counter++;
4209
4210
6.47M
    return 0;
4211
6.47M
}