Coverage Report

Created: 2025-11-11 06:39

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/cram/cram_structs.h
Line
Count
Source
1
/*
2
Copyright (c) 2012-2016, 2018-2020, 2023 Genome Research Ltd.
3
Author: James Bonfield <jkb@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
#ifndef HTSLIB_CRAM_STRUCTS_H
32
#define HTSLIB_CRAM_STRUCTS_H
33
34
/*
35
 * Defines in-memory structs for the basic file-format objects in the
36
 * CRAM format.
37
 *
38
 * The basic file format is:
39
 *     File-def SAM-hdr Container Container ...
40
 *
41
 * Container:
42
 *     Service-block data-block data-block ...
43
 *
44
 * Multiple blocks in a container are grouped together as slices,
45
 * also sometimes referred to as landmarks in the spec.
46
 */
47
48
49
#include <pthread.h>
50
#include <stdint.h>
51
#include <sys/types.h>
52
53
#include "../htslib/thread_pool.h"
54
#include "../htslib/cram.h"
55
#include "string_alloc.h"
56
#include "mFILE.h"
57
#include "../htslib/khash.h"
58
59
#ifdef __cplusplus
60
extern "C" {
61
#endif
62
63
// Generic hash-map integer -> integer
64
KHASH_MAP_INIT_INT64(m_i2i, int)
65
66
// Generic hash-set integer -> (existence)
67
KHASH_SET_INIT_INT(s_i2i)
68
69
// For brevity
70
typedef unsigned char uc;
71
72
/*
73
 * A union for the preservation map. Required for khash.
74
 */
75
typedef union {
76
    int i;
77
    char *p;
78
} pmap_t;
79
80
// Generates static functions here which isn't ideal, but we have no way
81
// currently to declare the kh_map_t structure here without also declaring a
82
// duplicate in the .c files due to the nature of the KHASH macros.
83
KHASH_MAP_INIT_STR(map, pmap_t)
84
85
struct hFILE;
86
87
40.4k
#define SEQS_PER_SLICE 10000
88
20.2k
#define BASES_PER_SLICE (SEQS_PER_SLICE*500)
89
20.2k
#define SLICE_PER_CNT  1
90
91
481k
#define CRAM_SUBST_MATRIX "CGTNGTANCATNGCANACGT"
92
93
2.35G
#define MAX_STAT_VAL 1024
94
//#define MAX_STAT_VAL 16
95
typedef struct cram_stats {
96
    int freqs[MAX_STAT_VAL];
97
    khash_t(m_i2i) *h;
98
    int nsamp; // total number of values added
99
    int nvals; // total number of unique values added
100
    int64_t min_val, max_val;
101
} cram_stats;
102
103
/* NB: matches java impl, not the spec */
104
enum cram_encoding {
105
    E_NULL               = 0,
106
    E_EXTERNAL           = 1,  // Only for BYTE type in CRAM 4
107
    E_GOLOMB             = 2,  // Not in CRAM 4
108
    E_HUFFMAN            = 3,  // Not in CRAM 4
109
    E_BYTE_ARRAY_LEN     = 4,
110
    E_BYTE_ARRAY_STOP    = 5,
111
    E_BETA               = 6,  // Not in CRAM 4
112
    E_SUBEXP             = 7,  // Not in CRAM 4
113
    E_GOLOMB_RICE        = 8,  // Not in CRAM 4
114
    E_GAMMA              = 9,  // Not in CRAM 4
115
116
    // CRAM 4 specific codecs
117
    E_VARINT_UNSIGNED    = 41, // Specialisation of EXTERNAL
118
    E_VARINT_SIGNED      = 42, // Specialisation of EXTERNAL
119
    E_CONST_BYTE         = 43, // Alternative to HUFFMAN with 1 symbol
120
    E_CONST_INT          = 44, // Alternative to HUFFMAN with 1 symbol
121
122
    // More experimental ideas, not documented in spec yet
123
    E_XHUFFMAN           = 50, // To external block
124
    E_XPACK              = 51, // Transform to sub-codec
125
    E_XRLE               = 52, // Transform to sub-codec
126
    E_XDELTA             = 53, // Transform to sub-codec
127
128
    // Total number of codecs, not a real one.
129
    E_NUM_CODECS,
130
};
131
132
enum cram_external_type {
133
    E_INT                = 1,
134
    E_LONG               = 2,
135
    E_BYTE               = 3,
136
    E_BYTE_ARRAY         = 4,
137
    E_BYTE_ARRAY_BLOCK   = 5,
138
    E_SINT               = 6, // signed INT
139
    E_SLONG              = 7, // signed LONG
140
};
141
142
/* External IDs used by this implementation (only assumed during writing) */
143
enum cram_DS_ID {
144
    DS_CORE   = 0,
145
    DS_aux    = 1, // aux_blk
146
    DS_aux_OQ = 2,
147
    DS_aux_BQ = 3,
148
    DS_aux_BD = 4,
149
    DS_aux_BI = 5,
150
    DS_aux_FZ = 6, // also ZM:B
151
    DS_aux_oq = 7, // other qualities
152
    DS_aux_os = 8, // other sequences
153
    DS_aux_oz = 9, // other strings
154
    DS_ref,
155
    DS_RN, // name_blk
156
    DS_QS, // qual_blk
157
    DS_IN, // base_blk
158
    DS_SC, // soft_blk
159
160
    DS_BF, // start loop
161
    DS_CF,
162
    DS_AP,
163
    DS_RG,
164
    DS_MQ,
165
    DS_NS,
166
    DS_MF,
167
    DS_TS,
168
    DS_NP,
169
    DS_NF,
170
    DS_RL,
171
    DS_FN,
172
    DS_FC,
173
    DS_FP,
174
    DS_DL,
175
    DS_BA,
176
    DS_BS,
177
    DS_TL,
178
    DS_RI,
179
    DS_RS,
180
    DS_PD,
181
    DS_HC,
182
    DS_BB,
183
    DS_QQ,
184
185
    DS_TN, // end loop
186
187
    DS_RN_len,
188
    DS_SC_len,
189
    DS_BB_len,
190
    DS_QQ_len,
191
192
    DS_TC, // CRAM v1.0 tags
193
    DS_TM, // test
194
    DS_TV, // test
195
196
    DS_END,
197
};
198
199
/* "File Definition Structure" */
200
struct cram_file_def {
201
    char    magic[4];
202
    uint8_t major_version;
203
    uint8_t minor_version;
204
    char    file_id[20] HTS_NONSTRING; // Filename or SHA1 checksum
205
};
206
207
55.3M
#define CRAM_MAJOR_VERS(v) ((v) >> 8)
208
20.7k
#define CRAM_MINOR_VERS(v) ((v) & 0xff)
209
210
struct cram_slice;
211
212
// Internal version of htslib/cram.h enum.
213
// Note these have to match the laout of methmap and methcost in
214
// cram_io.c:cram_compress_block2
215
enum cram_block_method_int {
216
    // Public methods as defined in the CRAM spec.
217
    BM_ERROR = -1,
218
219
    // CRAM 2.x and 3.0
220
    RAW      = 0,
221
    GZIP     = 1,
222
    BZIP2    = 2,
223
    LZMA     = 3,
224
    RANS     = 4, RANS0 = RANS,
225
226
    // CRAM 3.1 onwards
227
    RANSPR   = 5, RANS_PR0  = RANSPR,
228
    ARITH    = 6, ARITH_PR0 = ARITH,
229
    FQZ      = 7,
230
    TOK3     = 8,
231
    // BSC = 9, ZSTD = 10
232
233
    // Methods not externalised, but used in metrics.
234
    // Externally they become one of the above methods.
235
    GZIP_RLE = 11,
236
    GZIP_1,      // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM
237
238
    FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods
239
240
  //RANS0,       // Order 0
241
    RANS1,
242
243
  //RANS_PR0,    // Order 0
244
    RANS_PR1,    // Order 1
245
    RANS_PR64,   // O0 + RLE
246
    RANS_PR9,    // O1 + X4
247
    RANS_PR128,  // O0 + Pack
248
    RANS_PR129,  // O1 + Pack
249
    RANS_PR192,  // O0 + RLE + pack
250
    RANS_PR193,  // O1 + RLE + pack
251
252
  //TOK3,   // tok+rans
253
    TOKA,   // tok+arith
254
255
  //ARITH_PR0,   // Order 0
256
    ARITH_PR1,   // Order 1
257
    ARITH_PR64,  // O0 + RLE
258
    ARITH_PR9,   // O1 + X4
259
    ARITH_PR128, // O0 + Pack
260
    ARITH_PR129, // O1 + Pack
261
    ARITH_PR192, // O0 + RLE + pack
262
    ARITH_PR193, // O1 + RLE + pack
263
264
    // NB: must end on no more than 31 unless we change to a
265
    // 64-bit method type.
266
};
267
268
/* Now in htslib/cram.h
269
enum cram_content_type {
270
    CT_ERROR           = -1,
271
    FILE_HEADER        = 0,
272
    COMPRESSION_HEADER = 1,
273
    MAPPED_SLICE       = 2,
274
    UNMAPPED_SLICE     = 3, // CRAM V1.0 only
275
    EXTERNAL           = 4,
276
    CORE               = 5,
277
};
278
*/
279
280
/* Maximum simultaneous codecs allowed, 1 per bit */
281
6.10M
#define CRAM_MAX_METHOD 32
282
283
/* Compression metrics */
284
struct cram_metrics {
285
    // number of trials and time to next trial
286
    int trial;
287
    int next_trial;
288
    int consistency;
289
290
    // aggregate sizes during trials
291
    int sz[CRAM_MAX_METHOD];
292
    int input_avg_sz, input_avg_delta;
293
294
    // resultant method from trials
295
    int method, revised_method;
296
    int strat;
297
298
    // Revisions of method, to allow culling of continually failing ones.
299
    int cnt[CRAM_MAX_METHOD];
300
301
    double extra[CRAM_MAX_METHOD];
302
303
    // Not amenable to rANS bit-packing techniques; cardinality > 16
304
    int unpackable;
305
};
306
307
// Hash aux key (XX:i) to cram_metrics
308
KHASH_MAP_INIT_INT(m_metrics, cram_metrics*)
309
310
311
/* Block */
312
struct cram_block {
313
    enum cram_block_method_int  method, orig_method;
314
    enum cram_content_type  content_type;
315
    int32_t  content_id;
316
    int32_t  comp_size;
317
    int32_t  uncomp_size;
318
    uint32_t crc32;
319
    int32_t  idx; /* offset into data */
320
    unsigned char    *data;
321
322
    // For bit I/O
323
    size_t alloc;
324
    size_t byte;
325
    int bit;
326
327
    // To aid compression
328
    cram_metrics *m; // used to track aux block compression only
329
330
    int crc32_checked;
331
    uint32_t crc_part;
332
};
333
334
struct cram_codec; /* defined in cram_codecs.h */
335
struct cram_map;
336
337
3.39M
#define CRAM_MAP_HASH 32
338
76.1k
#define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1))
339
340
/* Compression header block */
341
struct cram_block_compression_hdr {
342
    int32_t ref_seq_id;
343
    int64_t ref_seq_start;
344
    int64_t ref_seq_span;
345
    int32_t num_records;
346
    int32_t num_landmarks;
347
    int32_t *landmark;
348
349
    /* Flags from preservation map */
350
    int read_names_included;
351
    int AP_delta;
352
    // indexed by ref-base and subst. code
353
    char substitution_matrix[5][4];
354
    int no_ref;
355
    int qs_seq_orient; // 1 => same as seq. 0 => original orientation
356
357
    // TD Dictionary as a concatenated block
358
    cram_block *TD_blk;          // Tag Dictionary
359
    int nTL;                     // number of TL entries in TD
360
    unsigned char **TL;          // array of size nTL, pointer into TD_blk.
361
    khash_t(m_s2i) *TD_hash;     // Keyed on TD strings, map to TL[] indices
362
    string_alloc_t *TD_keys;     // Pooled keys for TD hash.
363
364
    khash_t(map) *preservation_map;
365
    struct cram_map *rec_encoding_map[CRAM_MAP_HASH];
366
    struct cram_map *tag_encoding_map[CRAM_MAP_HASH];
367
368
    struct cram_codec *codecs[DS_END];
369
370
    char *uncomp; // A single block of uncompressed data
371
    size_t uncomp_size, uncomp_alloc;
372
373
    // Total codec count, used for index to block_by_id for transforms
374
    int ncodecs;
375
};
376
377
typedef struct cram_map {
378
    int key;    /* 0xe0 + 3 bytes */
379
    enum cram_encoding encoding;
380
    int offset; /* Offset into a single block of memory */
381
    int size;   /* Size */
382
    struct cram_codec *codec;
383
    struct cram_map *next; // for noddy internal hash
384
} cram_map;
385
386
typedef struct cram_tag_map {
387
    struct cram_codec *codec;
388
    cram_block *blk;
389
    cram_block *blk2;
390
    cram_metrics *m;
391
} cram_tag_map;
392
393
// Hash aux key (XX:i) to cram_tag_map
394
KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*)
395
396
/* Mapped or unmapped slice header block */
397
struct cram_block_slice_hdr {
398
    enum cram_content_type content_type;
399
    int32_t ref_seq_id;     /* if content_type == MAPPED_SLICE */
400
    int64_t ref_seq_start;  /* if content_type == MAPPED_SLICE */
401
    int64_t ref_seq_span;   /* if content_type == MAPPED_SLICE */
402
    int32_t num_records;
403
    int64_t record_counter;
404
    int32_t num_blocks;
405
    int32_t num_content_ids;
406
    int32_t *block_content_ids;
407
    int32_t ref_base_id;    /* if content_type == MAPPED_SLICE */
408
    unsigned char md5[16];
409
};
410
411
struct ref_entry;
412
413
/*
414
 * Container.
415
 *
416
 * Conceptually a container is split into slices, and slices into blocks.
417
 * However on disk it's just a list of blocks and we need to query the
418
 * block types to identify the start/end points of the slices.
419
 *
420
 * OR... are landmarks the start/end points of slices?
421
 */
422
struct cram_container {
423
    int32_t  length;
424
    int32_t  ref_seq_id;
425
    int64_t  ref_seq_start;
426
    int64_t  ref_seq_span;
427
    int64_t  record_counter;
428
    int64_t  num_bases;
429
    int32_t  num_records;
430
    int32_t  num_blocks;
431
    int32_t  num_landmarks;
432
    int32_t *landmark;
433
434
    /* Size of container header above */
435
    size_t   offset;
436
437
    /* Compression header is always the first block? */
438
    cram_block_compression_hdr *comp_hdr;
439
    cram_block *comp_hdr_block;
440
441
    /* For construction purposes */
442
    int max_slice, curr_slice;   // maximum number of slices
443
    int curr_slice_mt;           // Curr_slice when reading ahead (via threads)
444
    int max_rec, curr_rec;       // current and max recs per slice
445
    int max_c_rec, curr_c_rec;   // current and max recs per container
446
    int slice_rec;               // rec no. for start of this slice
447
    int curr_ref;                // current ref ID. -2 for no previous
448
    int64_t last_pos;                // last record position
449
    struct cram_slice **slices, *slice;
450
    int pos_sorted;              // boolean, 1=>position sorted data
451
    int64_t max_apos;                // maximum position, used if pos_sorted==0
452
    int last_slice;              // number of reads in last slice (0 for 1st)
453
    int multi_seq;               // true if packing multi seqs per cont/slice
454
    int unsorted;                // true is AP_delta is 0.
455
    int qs_seq_orient;           // 1 => same as seq. 0 => original orientation
456
457
    /* Copied from fd before encoding, to allow multi-threading */
458
    int ref_id;
459
    hts_pos_t ref_start, first_base, last_base, ref_end;
460
    char *ref;
461
    int embed_ref;               // 1 if embedding ref, 2 if embedding cons
462
    int no_ref;                  // true if referenceless
463
    //struct ref_entry *ref;
464
465
    /* For multi-threading */
466
    bam_seq_t **bams;
467
468
    /* Statistics for encoding */
469
    cram_stats *stats[DS_END];
470
471
    khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map
472
    int *refs_used;       // array of frequency of ref seq IDs
473
474
    uint32_t crc32;       // CRC32
475
476
    uint64_t s_num_bases; // number of bases in this slice
477
    uint64_t s_aux_bytes; // number of bytes of aux in BAM
478
479
    uint32_t n_mapped;    // Number of mapped reads
480
    int ref_free;         // whether 'ref' is owned by us and must be freed.
481
};
482
483
/*
484
 * A single cram record
485
 */
486
typedef struct cram_record {
487
    struct cram_slice *s; // Filled out by cram_decode only
488
489
    int32_t ref_id;       // fixed for all recs in slice?
490
    int32_t flags;        // BF
491
    int32_t cram_flags;   // CF
492
    int32_t len;          // RL
493
    int64_t apos;         // AP
494
    int32_t rg;           // RG
495
    int32_t name;         // RN; idx to s->names_blk
496
    int32_t name_len;
497
    int32_t mate_line;    // index to another cram_record
498
    int32_t mate_ref_id;
499
    int64_t mate_pos;     // NP
500
    int64_t tlen;         // TS
501
    int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing
502
503
    // Auxiliary data
504
    int32_t ntags;        // TC
505
    uint32_t aux;         // idx to s->aux_blk
506
    uint32_t aux_size;    // total size of packed ntags in aux_blk
507
#ifndef TN_external
508
    int32_t TN_idx;       // TN; idx to s->TN;
509
#else
510
    int32_t tn;           // idx to s->tn_blk
511
#endif
512
    int     TL;
513
514
    uint32_t seq;         // idx to s->seqs_blk
515
    uint32_t qual;        // idx to s->qual_blk
516
    uint32_t cigar;       // idx to s->cigar
517
    int32_t ncigar;
518
    int64_t aend;         // alignment end
519
    int32_t mqual;        // MQ
520
521
    uint32_t feature;     // idx to s->feature
522
    uint32_t nfeature;    // number of features
523
    int32_t mate_flags;   // MF
524
} cram_record;
525
526
// Accessor macros as an analogue of the bam ones
527
#define cram_qname(c)    (&(c)->s->name_blk->data[(c)->name])
528
#define cram_seq(c)      (&(c)->s->seqs_blk->data[(c)->seq])
529
#define cram_qual(c)     (&(c)->s->qual_blk->data[(c)->qual])
530
#define cram_aux(c)      (&(c)->s->aux_blk->data[(c)->aux])
531
#define cram_seqi(c,i)   (cram_seq((c))[(i)])
532
#define cram_name_len(c) ((c)->name_len)
533
#define cram_strand(c)   (((c)->flags & BAM_FREVERSE) != 0)
534
#define cram_mstrand(c)  (((c)->flags & BAM_FMREVERSE) != 0)
535
#define cram_cigar(c)    (&((cr)->s->cigar)[(c)->cigar])
536
537
/*
538
 * A feature is a base difference, used for the sequence reference encoding.
539
 * (We generate these internally when writing CRAM.)
540
 */
541
typedef union cram_feature {
542
    struct {
543
        int pos;
544
        int code;
545
        int base;    // substitution code
546
    } X;
547
    struct {
548
        int pos;
549
        int code;
550
        int base;    // actual base & qual
551
        int qual;
552
    } B;
553
    struct {
554
        int pos;
555
        int code;
556
        int seq_idx; // index to s->seqs_blk
557
        int len;
558
    } b;
559
    struct {
560
        int pos;
561
        int code;
562
        int qual;
563
    } Q;
564
    struct {
565
        int pos;
566
        int code;
567
        int len;
568
        int seq_idx; // soft-clip multiple bases
569
    } S;
570
    struct {
571
        int pos;
572
        int code;
573
        int len;
574
        int seq_idx; // insertion multiple bases
575
    } I;
576
    struct {
577
        int pos;
578
        int code;
579
        int base; // insertion single base
580
    } i;
581
    struct {
582
        int pos;
583
        int code;
584
        int len;
585
    } D;
586
    struct {
587
        int pos;
588
        int code;
589
        int len;
590
    } N;
591
    struct {
592
        int pos;
593
        int code;
594
        int len;
595
    } P;
596
    struct {
597
        int pos;
598
        int code;
599
        int len;
600
    } H;
601
} cram_feature;
602
603
/*
604
 * A slice is really just a set of blocks, but it
605
 * is the logical unit for decoding a number of
606
 * sequences.
607
 */
608
struct cram_slice {
609
    cram_block_slice_hdr *hdr;
610
    cram_block *hdr_block;
611
    cram_block **block;
612
    cram_block **block_by_id;
613
614
    /* State used during encoding/decoding */
615
    int64_t last_apos, max_apos;
616
617
    /* Array of decoded cram records */
618
    cram_record *crecs;
619
620
    /* An dynamically growing buffers for data pointed
621
     * to by crecs[] array.
622
     */
623
    uint32_t  *cigar;
624
    uint32_t   cigar_alloc;
625
    uint32_t   ncigar;
626
627
    cram_feature *features;
628
    uint32_t      nfeatures;
629
    uint32_t      afeatures; // allocated size of features
630
631
#ifndef TN_external
632
    // TN field (Tag Name)
633
    uint32_t      *TN;
634
    int           nTN, aTN;  // used and allocated size for TN[]
635
#else
636
    cram_block *tn_blk;
637
    int tn_id;
638
#endif
639
640
    // For variable sized elements which are always external blocks.
641
    cram_block *name_blk;
642
    cram_block *seqs_blk;
643
    cram_block *qual_blk;
644
    cram_block *base_blk;
645
    cram_block *soft_blk;
646
    cram_block *aux_blk;       // BAM aux block, created while decoding CRAM
647
648
    string_alloc_t *pair_keys; // Pooled keys for pair hash.
649
    khash_t(m_s2i) *pair[2];   // for identifying read-pairs in this slice.
650
651
    char *ref;                 // slice of current reference
652
    hts_pos_t ref_start;       // start position of current reference;
653
    hts_pos_t ref_end;         // end position of current reference;
654
    int ref_id;
655
656
    // For going from BAM to CRAM; an array of auxiliary blocks per type
657
    int naux_block;
658
    cram_block **aux_block;
659
660
    unsigned int data_series; // See cram_fields enum
661
    int decode_md;
662
663
    int max_rec, curr_rec;       // current and max recs per slice
664
    int slice_num;               // To be copied into c->curr_slice in decode
665
};
666
667
/*-----------------------------------------------------------------------------
668
 * Consider moving reference handling to cram_refs.[ch]
669
 */
670
// from fa.fai / samtools faidx files
671
typedef struct ref_entry {
672
    char *name;
673
    char *fn;
674
    int64_t length;        // if 0 this indicates we haven't loaded it yet
675
    int64_t LN_length;     // @SQ LN length, used to trim consensus ref
676
    int64_t offset;
677
    int bases_per_line;
678
    int line_length;
679
    int64_t count;         // for shared references so we know to dealloc seq
680
    char *seq;
681
    mFILE *mf;
682
    int is_md5;            // Reference comes from a raw seq found by MD5
683
    int validated_md5;
684
} ref_entry;
685
686
KHASH_MAP_INIT_STR(refs, ref_entry*)
687
688
// References structure.
689
struct refs_t {
690
    string_alloc_t *pool;  // String pool for holding filenames and SN vals
691
692
    khash_t(refs) *h_meta; // ref_entry*, index by name
693
    ref_entry **ref_id;    // ref_entry*, index by ID
694
    int nref;              // number of ref_entry
695
696
    char *fn;              // current file opened
697
    BGZF *fp;              // and the hFILE* to go with it.
698
699
    int count;             // how many cram_fd sharing this refs struct
700
701
    pthread_mutex_t lock;  // Mutex for multi-threaded updating
702
    ref_entry *last;       // Last queried sequence
703
    int last_id;           // Used in cram_ref_decr_locked to delay free
704
};
705
706
/*-----------------------------------------------------------------------------
707
 * CRAM index
708
 *
709
 * Detect format by number of entries per line.
710
 * 5 => 1.0 (refid, start, nseq, C offset, slice)
711
 * 6 => 1.1 (refid, start, span, C offset, S offset, S size)
712
 *
713
 * Indices are stored in a nested containment list, which is trivial to set
714
 * up as the indices are on sorted data so we're appending to the nclist
715
 * in sorted order. Basically if a slice entirely fits within a previous
716
 * slice then we append to that slices list. This is done recursively.
717
 *
718
 * Lists are sorted on two dimensions: ref id + slice coords.
719
 */
720
typedef struct cram_index {
721
    int nslice, nalloc;   // total number of slices
722
    struct cram_index *e; // array of size nslice
723
724
    int     refid;  // 1.0                 1.1
725
    int     start;  // 1.0                 1.1
726
    int     end;    //                     1.1
727
    int     nseq;   // 1.0 - undocumented
728
    int     slice;  // 1.0 landmark index, 1.1 landmark value
729
    int     len;    //                     1.1 - size of slice in bytes
730
    int64_t offset; // 1.0                 1.1
731
732
    // Linked list of cram_index entries. Used to convert recursive
733
    // NCList back to a linear list.
734
    struct cram_index *e_next;
735
} cram_index;
736
737
typedef struct {
738
    int refid;
739
    int64_t start;
740
    int64_t end;
741
} cram_range;
742
743
/*-----------------------------------------------------------------------------
744
 */
745
/* CRAM File handle */
746
747
typedef struct spare_bams {
748
    bam_seq_t **bams;
749
    struct spare_bams *next;
750
} spare_bams;
751
752
struct cram_fd;
753
typedef struct varint_vec {
754
    // Returns number of bytes decoded from fd, 0 on error
755
    int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc);
756
    int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc);
757
    int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc);
758
759
    // Returns the value and increments *cp.  Sets err to 1 iff an error occurs.
760
    // NOTE: Does not set err to 0 on success.
761
    int64_t (*varint_get32) (char **cp, const char *endp, int *err);
762
    int64_t (*varint_get32s)(char **cp, const char *endp, int *err);
763
    int64_t (*varint_get64) (char **cp, const char *endp, int *err);
764
    int64_t (*varint_get64s)(char **cp, const char *endp, int *err);
765
766
    // Returns the number of bytes written, <= 0 on error.
767
    int (*varint_put32) (char *cp, char *endp, int32_t val_p);
768
    int (*varint_put32s)(char *cp, char *endp, int32_t val_p);
769
    int (*varint_put64) (char *cp, char *endp, int64_t val_p);
770
    int (*varint_put64s)(char *cp, char *endp, int64_t val_p);
771
772
    // Returns the number of bytes written, <= 0 on error.
773
    int (*varint_put32_blk) (cram_block *blk, int32_t val_p);
774
    int (*varint_put32s_blk)(cram_block *blk, int32_t val_p);
775
    int (*varint_put64_blk) (cram_block *blk, int64_t val_p);
776
    int (*varint_put64s_blk)(cram_block *blk, int64_t val_p);
777
778
    // Returns number of bytes needed to encode 'val'
779
    int (*varint_size)(int64_t val);
780
} varint_vec;
781
782
struct cram_fd {
783
    struct hFILE  *fp;
784
    int            mode;     // 'r' or 'w'
785
    int            version;
786
    cram_file_def *file_def;
787
    sam_hdr_t     *header;
788
789
    char          *prefix;
790
    int64_t        record_counter;
791
    int            err;
792
793
    // Most recent compression header decoded
794
    //cram_block_compression_hdr *comp_hdr;
795
    //cram_block_slice_hdr       *slice_hdr;
796
797
    // Current container being processed
798
    cram_container *ctr;
799
800
    // Current container used for decoder threads
801
    cram_container *ctr_mt;
802
803
    // positions for encoding or decoding
804
    int first_base, last_base; // copied to container
805
806
    // cached reference portion
807
    refs_t   *refs;                // ref meta-data structure
808
    char     *ref, *ref_free;      // current portion held in memory
809
    int       ref_id;              // copied to container
810
    hts_pos_t ref_start;           // copied to container
811
    hts_pos_t ref_end;             // copied to container
812
    char     *ref_fn;              // reference fasta filename
813
814
    // compression level and metrics
815
    int level;
816
    cram_metrics *m[DS_END];
817
    khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use.
818
819
    // options
820
    int decode_md; // Whether to export MD and NM tags
821
    int seqs_per_slice;
822
    int bases_per_slice;
823
    int slices_per_container;
824
    int embed_ref; // copied to container
825
    int no_ref;    // copied to container
826
    int no_ref_counter; // decide if permanent switch
827
    int ignore_md5;
828
    int use_bz2;
829
    int use_rans;
830
    int use_lzma;
831
    int use_fqz;
832
    int use_tok;
833
    int use_arith;
834
    int shared_ref;
835
    unsigned int required_fields;
836
    int store_md;
837
    int store_nm;
838
    cram_range range;
839
840
    // lookup tables, stored here so we can be trivially multi-threaded
841
    unsigned int bam_flag_swap[0x1000]; // cram -> bam flags
842
    unsigned int cram_flag_swap[0x1000];// bam -> cram flags
843
    unsigned char L1[256];              // ACGT{*} ->0123{4}
844
    unsigned char L2[256];              // ACGTN{*}->01234{5}
845
    char cram_sub_matrix[32][32];       // base substitution codes
846
847
    int         index_sz;
848
    cram_index *index;                  // array, sizeof index_sz
849
    off_t first_container;
850
    off_t curr_position;
851
    int eof;
852
    int last_slice;                     // number of recs encoded in last slice
853
    int last_RI_count;                  // number of references encoded in last container
854
    int multi_seq;                      // -1 is auto, 0 is one ref per container, 1 is multi...
855
    int multi_seq_user;                 // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE)
856
    int unsorted;
857
    int last_mapped;                    // number of mapped reads in last container
858
    int empty_container;                // Marker for EOF block
859
860
    // thread pool
861
    int own_pool;
862
    hts_tpool *pool;
863
    hts_tpool_process *rqueue;
864
    pthread_mutex_t metrics_lock;
865
    pthread_mutex_t ref_lock;
866
    pthread_mutex_t range_lock;
867
    spare_bams *bl;
868
    pthread_mutex_t bam_list_lock;
869
    void *job_pending;
870
    int ooc;                            // out of containers.
871
872
    int lossy_read_names;               // boolean
873
    int tlen_approx;                    // max TLEN calculation offset.
874
    int tlen_zero;                      // If true, permit tlen 0 (=> tlen calculated)
875
876
    BGZF *idxfp;                        // File pointer for on-the-fly index creation
877
878
    // variable integer decoding callbacks.
879
    // This changed in CRAM4.0 to a data-size agnostic encoding.
880
    varint_vec vv;
881
882
    // Force AP delta even on non positional sorted data.
883
    // This can be beneficial for pairs where pairs are nearby each other.
884
    // We suffer with delta to unrelated things (previous pair), but gain
885
    // in delta between them.  (Ideal would be a per read setting.)
886
    int ap_delta;
887
};
888
889
// Translation of required fields to cram data series
890
enum cram_fields {
891
    CRAM_BF = 0x00000001,
892
    CRAM_AP = 0x00000002,
893
    CRAM_FP = 0x00000004,
894
    CRAM_RL = 0x00000008,
895
    CRAM_DL = 0x00000010,
896
    CRAM_NF = 0x00000020,
897
    CRAM_BA = 0x00000040,
898
    CRAM_QS = 0x00000080,
899
    CRAM_FC = 0x00000100,
900
    CRAM_FN = 0x00000200,
901
    CRAM_BS = 0x00000400,
902
    CRAM_IN = 0x00000800,
903
    CRAM_RG = 0x00001000,
904
    CRAM_MQ = 0x00002000,
905
    CRAM_TL = 0x00004000,
906
    CRAM_RN = 0x00008000,
907
    CRAM_NS = 0x00010000,
908
    CRAM_NP = 0x00020000,
909
    CRAM_TS = 0x00040000,
910
    CRAM_MF = 0x00080000,
911
    CRAM_CF = 0x00100000,
912
    CRAM_RI = 0x00200000,
913
    CRAM_RS = 0x00400000,
914
    CRAM_PD = 0x00800000,
915
    CRAM_HC = 0x01000000,
916
    CRAM_SC = 0x02000000,
917
    CRAM_BB = 0x04000000,
918
    CRAM_BB_len = 0x08000000,
919
    CRAM_QQ = 0x10000000,
920
    CRAM_QQ_len = 0x20000000,
921
    CRAM_aux= 0x40000000,
922
    CRAM_ALL= 0x7fffffff,
923
};
924
925
// A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may
926
// encode a base difference, but we don't need to know what it is for CIGAR.
927
// If we have a soft-clip or insertion, we do need SC/IN though to know how
928
// long that array is.
929
0
#define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \
930
0
                    CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF)
931
932
0
#define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \
933
0
                  CRAM_RL    | CRAM_AP | CRAM_BB)
934
935
0
#define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ)
936
937
/* BF bitfields */
938
/* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */
939
23.7M
#define CRAM_FPAIRED      256
940
23.7M
#define CRAM_FPROPER_PAIR 128
941
23.7M
#define CRAM_FUNMAP        64
942
23.7M
#define CRAM_FREVERSE      32
943
23.7M
#define CRAM_FREAD1        16
944
23.7M
#define CRAM_FREAD2         8
945
23.7M
#define CRAM_FSECONDARY     4
946
23.7M
#define CRAM_FQCFAIL        2
947
23.7M
#define CRAM_FDUP           1
948
949
#define DS_aux_S "\001"
950
#define DS_aux_OQ_S "\002"
951
#define DS_aux_BQ_S "\003"
952
#define DS_aux_BD_S "\004"
953
#define DS_aux_BI_S "\005"
954
#define DS_aux_FZ_S "\006"
955
#define DS_aux_oq_S "\007"
956
#define DS_aux_os_S "\010"
957
#define DS_aux_oz_S "\011"
958
959
382
#define CRAM_M_REVERSE  1
960
17.3k
#define CRAM_M_UNMAP    2
961
962
963
/* CF bitfields */
964
22.3M
#define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0)
965
22.3M
#define CRAM_FLAG_DETACHED             (1<<1)
966
27
#define CRAM_FLAG_MATE_DOWNSTREAM      (1<<2)
967
7.23M
#define CRAM_FLAG_NO_SEQ               (1<<3)
968
27
#define CRAM_FLAG_EXPLICIT_TLEN        (1<<4)
969
14.8M
#define CRAM_FLAG_MASK                 ((1<<5)-1)
970
971
/* Internal only */
972
7.44M
#define CRAM_FLAG_STATS_ADDED          (1<<30)
973
0
#define CRAM_FLAG_DISCARD_NAME         (1U<<31)
974
975
#ifdef __cplusplus
976
}
977
#endif
978
979
#endif /* HTSLIB_CRAM_STRUCTS_H */