/src/htslib/cram/cram_structs.h
Line | Count | Source |
1 | | /* |
2 | | Copyright (c) 2012-2016, 2018-2020, 2023 Genome Research Ltd. |
3 | | Author: James Bonfield <jkb@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | #ifndef HTSLIB_CRAM_STRUCTS_H |
32 | | #define HTSLIB_CRAM_STRUCTS_H |
33 | | |
34 | | /* |
35 | | * Defines in-memory structs for the basic file-format objects in the |
36 | | * CRAM format. |
37 | | * |
38 | | * The basic file format is: |
39 | | * File-def SAM-hdr Container Container ... |
40 | | * |
41 | | * Container: |
42 | | * Service-block data-block data-block ... |
43 | | * |
44 | | * Multiple blocks in a container are grouped together as slices, |
45 | | * also sometimes referred to as landmarks in the spec. |
46 | | */ |
47 | | |
48 | | |
49 | | #include <pthread.h> |
50 | | #include <stdint.h> |
51 | | #include <sys/types.h> |
52 | | |
53 | | #include "../htslib/thread_pool.h" |
54 | | #include "../htslib/cram.h" |
55 | | #include "string_alloc.h" |
56 | | #include "mFILE.h" |
57 | | #include "../htslib/khash.h" |
58 | | |
59 | | #ifdef __cplusplus |
60 | | extern "C" { |
61 | | #endif |
62 | | |
63 | | // Generic hash-map integer -> integer |
64 | | KHASH_MAP_INIT_INT64(m_i2i, int) |
65 | | |
66 | | // Generic hash-set integer -> (existence) |
67 | | KHASH_SET_INIT_INT(s_i2i) |
68 | | |
69 | | // For brevity |
70 | | typedef unsigned char uc; |
71 | | |
72 | | /* |
73 | | * A union for the preservation map. Required for khash. |
74 | | */ |
75 | | typedef union { |
76 | | int i; |
77 | | char *p; |
78 | | } pmap_t; |
79 | | |
80 | | // Generates static functions here which isn't ideal, but we have no way |
81 | | // currently to declare the kh_map_t structure here without also declaring a |
82 | | // duplicate in the .c files due to the nature of the KHASH macros. |
83 | | KHASH_MAP_INIT_STR(map, pmap_t) |
84 | | |
85 | | struct hFILE; |
86 | | |
87 | 40.4k | #define SEQS_PER_SLICE 10000 |
88 | 20.2k | #define BASES_PER_SLICE (SEQS_PER_SLICE*500) |
89 | 20.2k | #define SLICE_PER_CNT 1 |
90 | | |
91 | 481k | #define CRAM_SUBST_MATRIX "CGTNGTANCATNGCANACGT" |
92 | | |
93 | 2.35G | #define MAX_STAT_VAL 1024 |
94 | | //#define MAX_STAT_VAL 16 |
95 | | typedef struct cram_stats { |
96 | | int freqs[MAX_STAT_VAL]; |
97 | | khash_t(m_i2i) *h; |
98 | | int nsamp; // total number of values added |
99 | | int nvals; // total number of unique values added |
100 | | int64_t min_val, max_val; |
101 | | } cram_stats; |
102 | | |
103 | | /* NB: matches java impl, not the spec */ |
104 | | enum cram_encoding { |
105 | | E_NULL = 0, |
106 | | E_EXTERNAL = 1, // Only for BYTE type in CRAM 4 |
107 | | E_GOLOMB = 2, // Not in CRAM 4 |
108 | | E_HUFFMAN = 3, // Not in CRAM 4 |
109 | | E_BYTE_ARRAY_LEN = 4, |
110 | | E_BYTE_ARRAY_STOP = 5, |
111 | | E_BETA = 6, // Not in CRAM 4 |
112 | | E_SUBEXP = 7, // Not in CRAM 4 |
113 | | E_GOLOMB_RICE = 8, // Not in CRAM 4 |
114 | | E_GAMMA = 9, // Not in CRAM 4 |
115 | | |
116 | | // CRAM 4 specific codecs |
117 | | E_VARINT_UNSIGNED = 41, // Specialisation of EXTERNAL |
118 | | E_VARINT_SIGNED = 42, // Specialisation of EXTERNAL |
119 | | E_CONST_BYTE = 43, // Alternative to HUFFMAN with 1 symbol |
120 | | E_CONST_INT = 44, // Alternative to HUFFMAN with 1 symbol |
121 | | |
122 | | // More experimental ideas, not documented in spec yet |
123 | | E_XHUFFMAN = 50, // To external block |
124 | | E_XPACK = 51, // Transform to sub-codec |
125 | | E_XRLE = 52, // Transform to sub-codec |
126 | | E_XDELTA = 53, // Transform to sub-codec |
127 | | |
128 | | // Total number of codecs, not a real one. |
129 | | E_NUM_CODECS, |
130 | | }; |
131 | | |
132 | | enum cram_external_type { |
133 | | E_INT = 1, |
134 | | E_LONG = 2, |
135 | | E_BYTE = 3, |
136 | | E_BYTE_ARRAY = 4, |
137 | | E_BYTE_ARRAY_BLOCK = 5, |
138 | | E_SINT = 6, // signed INT |
139 | | E_SLONG = 7, // signed LONG |
140 | | }; |
141 | | |
142 | | /* External IDs used by this implementation (only assumed during writing) */ |
143 | | enum cram_DS_ID { |
144 | | DS_CORE = 0, |
145 | | DS_aux = 1, // aux_blk |
146 | | DS_aux_OQ = 2, |
147 | | DS_aux_BQ = 3, |
148 | | DS_aux_BD = 4, |
149 | | DS_aux_BI = 5, |
150 | | DS_aux_FZ = 6, // also ZM:B |
151 | | DS_aux_oq = 7, // other qualities |
152 | | DS_aux_os = 8, // other sequences |
153 | | DS_aux_oz = 9, // other strings |
154 | | DS_ref, |
155 | | DS_RN, // name_blk |
156 | | DS_QS, // qual_blk |
157 | | DS_IN, // base_blk |
158 | | DS_SC, // soft_blk |
159 | | |
160 | | DS_BF, // start loop |
161 | | DS_CF, |
162 | | DS_AP, |
163 | | DS_RG, |
164 | | DS_MQ, |
165 | | DS_NS, |
166 | | DS_MF, |
167 | | DS_TS, |
168 | | DS_NP, |
169 | | DS_NF, |
170 | | DS_RL, |
171 | | DS_FN, |
172 | | DS_FC, |
173 | | DS_FP, |
174 | | DS_DL, |
175 | | DS_BA, |
176 | | DS_BS, |
177 | | DS_TL, |
178 | | DS_RI, |
179 | | DS_RS, |
180 | | DS_PD, |
181 | | DS_HC, |
182 | | DS_BB, |
183 | | DS_QQ, |
184 | | |
185 | | DS_TN, // end loop |
186 | | |
187 | | DS_RN_len, |
188 | | DS_SC_len, |
189 | | DS_BB_len, |
190 | | DS_QQ_len, |
191 | | |
192 | | DS_TC, // CRAM v1.0 tags |
193 | | DS_TM, // test |
194 | | DS_TV, // test |
195 | | |
196 | | DS_END, |
197 | | }; |
198 | | |
199 | | /* "File Definition Structure" */ |
200 | | struct cram_file_def { |
201 | | char magic[4]; |
202 | | uint8_t major_version; |
203 | | uint8_t minor_version; |
204 | | char file_id[20] HTS_NONSTRING; // Filename or SHA1 checksum |
205 | | }; |
206 | | |
207 | 55.3M | #define CRAM_MAJOR_VERS(v) ((v) >> 8) |
208 | 20.7k | #define CRAM_MINOR_VERS(v) ((v) & 0xff) |
209 | | |
210 | | struct cram_slice; |
211 | | |
212 | | // Internal version of htslib/cram.h enum. |
213 | | // Note these have to match the laout of methmap and methcost in |
214 | | // cram_io.c:cram_compress_block2 |
215 | | enum cram_block_method_int { |
216 | | // Public methods as defined in the CRAM spec. |
217 | | BM_ERROR = -1, |
218 | | |
219 | | // CRAM 2.x and 3.0 |
220 | | RAW = 0, |
221 | | GZIP = 1, |
222 | | BZIP2 = 2, |
223 | | LZMA = 3, |
224 | | RANS = 4, RANS0 = RANS, |
225 | | |
226 | | // CRAM 3.1 onwards |
227 | | RANSPR = 5, RANS_PR0 = RANSPR, |
228 | | ARITH = 6, ARITH_PR0 = ARITH, |
229 | | FQZ = 7, |
230 | | TOK3 = 8, |
231 | | // BSC = 9, ZSTD = 10 |
232 | | |
233 | | // Methods not externalised, but used in metrics. |
234 | | // Externally they become one of the above methods. |
235 | | GZIP_RLE = 11, |
236 | | GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM |
237 | | |
238 | | FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods |
239 | | |
240 | | //RANS0, // Order 0 |
241 | | RANS1, |
242 | | |
243 | | //RANS_PR0, // Order 0 |
244 | | RANS_PR1, // Order 1 |
245 | | RANS_PR64, // O0 + RLE |
246 | | RANS_PR9, // O1 + X4 |
247 | | RANS_PR128, // O0 + Pack |
248 | | RANS_PR129, // O1 + Pack |
249 | | RANS_PR192, // O0 + RLE + pack |
250 | | RANS_PR193, // O1 + RLE + pack |
251 | | |
252 | | //TOK3, // tok+rans |
253 | | TOKA, // tok+arith |
254 | | |
255 | | //ARITH_PR0, // Order 0 |
256 | | ARITH_PR1, // Order 1 |
257 | | ARITH_PR64, // O0 + RLE |
258 | | ARITH_PR9, // O1 + X4 |
259 | | ARITH_PR128, // O0 + Pack |
260 | | ARITH_PR129, // O1 + Pack |
261 | | ARITH_PR192, // O0 + RLE + pack |
262 | | ARITH_PR193, // O1 + RLE + pack |
263 | | |
264 | | // NB: must end on no more than 31 unless we change to a |
265 | | // 64-bit method type. |
266 | | }; |
267 | | |
268 | | /* Now in htslib/cram.h |
269 | | enum cram_content_type { |
270 | | CT_ERROR = -1, |
271 | | FILE_HEADER = 0, |
272 | | COMPRESSION_HEADER = 1, |
273 | | MAPPED_SLICE = 2, |
274 | | UNMAPPED_SLICE = 3, // CRAM V1.0 only |
275 | | EXTERNAL = 4, |
276 | | CORE = 5, |
277 | | }; |
278 | | */ |
279 | | |
280 | | /* Maximum simultaneous codecs allowed, 1 per bit */ |
281 | 6.10M | #define CRAM_MAX_METHOD 32 |
282 | | |
283 | | /* Compression metrics */ |
284 | | struct cram_metrics { |
285 | | // number of trials and time to next trial |
286 | | int trial; |
287 | | int next_trial; |
288 | | int consistency; |
289 | | |
290 | | // aggregate sizes during trials |
291 | | int sz[CRAM_MAX_METHOD]; |
292 | | int input_avg_sz, input_avg_delta; |
293 | | |
294 | | // resultant method from trials |
295 | | int method, revised_method; |
296 | | int strat; |
297 | | |
298 | | // Revisions of method, to allow culling of continually failing ones. |
299 | | int cnt[CRAM_MAX_METHOD]; |
300 | | |
301 | | double extra[CRAM_MAX_METHOD]; |
302 | | |
303 | | // Not amenable to rANS bit-packing techniques; cardinality > 16 |
304 | | int unpackable; |
305 | | }; |
306 | | |
307 | | // Hash aux key (XX:i) to cram_metrics |
308 | | KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) |
309 | | |
310 | | |
311 | | /* Block */ |
312 | | struct cram_block { |
313 | | enum cram_block_method_int method, orig_method; |
314 | | enum cram_content_type content_type; |
315 | | int32_t content_id; |
316 | | int32_t comp_size; |
317 | | int32_t uncomp_size; |
318 | | uint32_t crc32; |
319 | | int32_t idx; /* offset into data */ |
320 | | unsigned char *data; |
321 | | |
322 | | // For bit I/O |
323 | | size_t alloc; |
324 | | size_t byte; |
325 | | int bit; |
326 | | |
327 | | // To aid compression |
328 | | cram_metrics *m; // used to track aux block compression only |
329 | | |
330 | | int crc32_checked; |
331 | | uint32_t crc_part; |
332 | | }; |
333 | | |
334 | | struct cram_codec; /* defined in cram_codecs.h */ |
335 | | struct cram_map; |
336 | | |
337 | 3.39M | #define CRAM_MAP_HASH 32 |
338 | 76.1k | #define CRAM_MAP(a,b) (((a)*3+(b))&(CRAM_MAP_HASH-1)) |
339 | | |
340 | | /* Compression header block */ |
341 | | struct cram_block_compression_hdr { |
342 | | int32_t ref_seq_id; |
343 | | int64_t ref_seq_start; |
344 | | int64_t ref_seq_span; |
345 | | int32_t num_records; |
346 | | int32_t num_landmarks; |
347 | | int32_t *landmark; |
348 | | |
349 | | /* Flags from preservation map */ |
350 | | int read_names_included; |
351 | | int AP_delta; |
352 | | // indexed by ref-base and subst. code |
353 | | char substitution_matrix[5][4]; |
354 | | int no_ref; |
355 | | int qs_seq_orient; // 1 => same as seq. 0 => original orientation |
356 | | |
357 | | // TD Dictionary as a concatenated block |
358 | | cram_block *TD_blk; // Tag Dictionary |
359 | | int nTL; // number of TL entries in TD |
360 | | unsigned char **TL; // array of size nTL, pointer into TD_blk. |
361 | | khash_t(m_s2i) *TD_hash; // Keyed on TD strings, map to TL[] indices |
362 | | string_alloc_t *TD_keys; // Pooled keys for TD hash. |
363 | | |
364 | | khash_t(map) *preservation_map; |
365 | | struct cram_map *rec_encoding_map[CRAM_MAP_HASH]; |
366 | | struct cram_map *tag_encoding_map[CRAM_MAP_HASH]; |
367 | | |
368 | | struct cram_codec *codecs[DS_END]; |
369 | | |
370 | | char *uncomp; // A single block of uncompressed data |
371 | | size_t uncomp_size, uncomp_alloc; |
372 | | |
373 | | // Total codec count, used for index to block_by_id for transforms |
374 | | int ncodecs; |
375 | | }; |
376 | | |
377 | | typedef struct cram_map { |
378 | | int key; /* 0xe0 + 3 bytes */ |
379 | | enum cram_encoding encoding; |
380 | | int offset; /* Offset into a single block of memory */ |
381 | | int size; /* Size */ |
382 | | struct cram_codec *codec; |
383 | | struct cram_map *next; // for noddy internal hash |
384 | | } cram_map; |
385 | | |
386 | | typedef struct cram_tag_map { |
387 | | struct cram_codec *codec; |
388 | | cram_block *blk; |
389 | | cram_block *blk2; |
390 | | cram_metrics *m; |
391 | | } cram_tag_map; |
392 | | |
393 | | // Hash aux key (XX:i) to cram_tag_map |
394 | | KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) |
395 | | |
396 | | /* Mapped or unmapped slice header block */ |
397 | | struct cram_block_slice_hdr { |
398 | | enum cram_content_type content_type; |
399 | | int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ |
400 | | int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ |
401 | | int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ |
402 | | int32_t num_records; |
403 | | int64_t record_counter; |
404 | | int32_t num_blocks; |
405 | | int32_t num_content_ids; |
406 | | int32_t *block_content_ids; |
407 | | int32_t ref_base_id; /* if content_type == MAPPED_SLICE */ |
408 | | unsigned char md5[16]; |
409 | | }; |
410 | | |
411 | | struct ref_entry; |
412 | | |
413 | | /* |
414 | | * Container. |
415 | | * |
416 | | * Conceptually a container is split into slices, and slices into blocks. |
417 | | * However on disk it's just a list of blocks and we need to query the |
418 | | * block types to identify the start/end points of the slices. |
419 | | * |
420 | | * OR... are landmarks the start/end points of slices? |
421 | | */ |
422 | | struct cram_container { |
423 | | int32_t length; |
424 | | int32_t ref_seq_id; |
425 | | int64_t ref_seq_start; |
426 | | int64_t ref_seq_span; |
427 | | int64_t record_counter; |
428 | | int64_t num_bases; |
429 | | int32_t num_records; |
430 | | int32_t num_blocks; |
431 | | int32_t num_landmarks; |
432 | | int32_t *landmark; |
433 | | |
434 | | /* Size of container header above */ |
435 | | size_t offset; |
436 | | |
437 | | /* Compression header is always the first block? */ |
438 | | cram_block_compression_hdr *comp_hdr; |
439 | | cram_block *comp_hdr_block; |
440 | | |
441 | | /* For construction purposes */ |
442 | | int max_slice, curr_slice; // maximum number of slices |
443 | | int curr_slice_mt; // Curr_slice when reading ahead (via threads) |
444 | | int max_rec, curr_rec; // current and max recs per slice |
445 | | int max_c_rec, curr_c_rec; // current and max recs per container |
446 | | int slice_rec; // rec no. for start of this slice |
447 | | int curr_ref; // current ref ID. -2 for no previous |
448 | | int64_t last_pos; // last record position |
449 | | struct cram_slice **slices, *slice; |
450 | | int pos_sorted; // boolean, 1=>position sorted data |
451 | | int64_t max_apos; // maximum position, used if pos_sorted==0 |
452 | | int last_slice; // number of reads in last slice (0 for 1st) |
453 | | int multi_seq; // true if packing multi seqs per cont/slice |
454 | | int unsorted; // true is AP_delta is 0. |
455 | | int qs_seq_orient; // 1 => same as seq. 0 => original orientation |
456 | | |
457 | | /* Copied from fd before encoding, to allow multi-threading */ |
458 | | int ref_id; |
459 | | hts_pos_t ref_start, first_base, last_base, ref_end; |
460 | | char *ref; |
461 | | int embed_ref; // 1 if embedding ref, 2 if embedding cons |
462 | | int no_ref; // true if referenceless |
463 | | //struct ref_entry *ref; |
464 | | |
465 | | /* For multi-threading */ |
466 | | bam_seq_t **bams; |
467 | | |
468 | | /* Statistics for encoding */ |
469 | | cram_stats *stats[DS_END]; |
470 | | |
471 | | khash_t(m_tagmap) *tags_used; // set of tag types in use, for tag encoding map |
472 | | int *refs_used; // array of frequency of ref seq IDs |
473 | | |
474 | | uint32_t crc32; // CRC32 |
475 | | |
476 | | uint64_t s_num_bases; // number of bases in this slice |
477 | | uint64_t s_aux_bytes; // number of bytes of aux in BAM |
478 | | |
479 | | uint32_t n_mapped; // Number of mapped reads |
480 | | int ref_free; // whether 'ref' is owned by us and must be freed. |
481 | | }; |
482 | | |
483 | | /* |
484 | | * A single cram record |
485 | | */ |
486 | | typedef struct cram_record { |
487 | | struct cram_slice *s; // Filled out by cram_decode only |
488 | | |
489 | | int32_t ref_id; // fixed for all recs in slice? |
490 | | int32_t flags; // BF |
491 | | int32_t cram_flags; // CF |
492 | | int32_t len; // RL |
493 | | int64_t apos; // AP |
494 | | int32_t rg; // RG |
495 | | int32_t name; // RN; idx to s->names_blk |
496 | | int32_t name_len; |
497 | | int32_t mate_line; // index to another cram_record |
498 | | int32_t mate_ref_id; |
499 | | int64_t mate_pos; // NP |
500 | | int64_t tlen; // TS |
501 | | int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing |
502 | | |
503 | | // Auxiliary data |
504 | | int32_t ntags; // TC |
505 | | uint32_t aux; // idx to s->aux_blk |
506 | | uint32_t aux_size; // total size of packed ntags in aux_blk |
507 | | #ifndef TN_external |
508 | | int32_t TN_idx; // TN; idx to s->TN; |
509 | | #else |
510 | | int32_t tn; // idx to s->tn_blk |
511 | | #endif |
512 | | int TL; |
513 | | |
514 | | uint32_t seq; // idx to s->seqs_blk |
515 | | uint32_t qual; // idx to s->qual_blk |
516 | | uint32_t cigar; // idx to s->cigar |
517 | | int32_t ncigar; |
518 | | int64_t aend; // alignment end |
519 | | int32_t mqual; // MQ |
520 | | |
521 | | uint32_t feature; // idx to s->feature |
522 | | uint32_t nfeature; // number of features |
523 | | int32_t mate_flags; // MF |
524 | | } cram_record; |
525 | | |
526 | | // Accessor macros as an analogue of the bam ones |
527 | | #define cram_qname(c) (&(c)->s->name_blk->data[(c)->name]) |
528 | | #define cram_seq(c) (&(c)->s->seqs_blk->data[(c)->seq]) |
529 | | #define cram_qual(c) (&(c)->s->qual_blk->data[(c)->qual]) |
530 | | #define cram_aux(c) (&(c)->s->aux_blk->data[(c)->aux]) |
531 | | #define cram_seqi(c,i) (cram_seq((c))[(i)]) |
532 | | #define cram_name_len(c) ((c)->name_len) |
533 | | #define cram_strand(c) (((c)->flags & BAM_FREVERSE) != 0) |
534 | | #define cram_mstrand(c) (((c)->flags & BAM_FMREVERSE) != 0) |
535 | | #define cram_cigar(c) (&((cr)->s->cigar)[(c)->cigar]) |
536 | | |
537 | | /* |
538 | | * A feature is a base difference, used for the sequence reference encoding. |
539 | | * (We generate these internally when writing CRAM.) |
540 | | */ |
541 | | typedef union cram_feature { |
542 | | struct { |
543 | | int pos; |
544 | | int code; |
545 | | int base; // substitution code |
546 | | } X; |
547 | | struct { |
548 | | int pos; |
549 | | int code; |
550 | | int base; // actual base & qual |
551 | | int qual; |
552 | | } B; |
553 | | struct { |
554 | | int pos; |
555 | | int code; |
556 | | int seq_idx; // index to s->seqs_blk |
557 | | int len; |
558 | | } b; |
559 | | struct { |
560 | | int pos; |
561 | | int code; |
562 | | int qual; |
563 | | } Q; |
564 | | struct { |
565 | | int pos; |
566 | | int code; |
567 | | int len; |
568 | | int seq_idx; // soft-clip multiple bases |
569 | | } S; |
570 | | struct { |
571 | | int pos; |
572 | | int code; |
573 | | int len; |
574 | | int seq_idx; // insertion multiple bases |
575 | | } I; |
576 | | struct { |
577 | | int pos; |
578 | | int code; |
579 | | int base; // insertion single base |
580 | | } i; |
581 | | struct { |
582 | | int pos; |
583 | | int code; |
584 | | int len; |
585 | | } D; |
586 | | struct { |
587 | | int pos; |
588 | | int code; |
589 | | int len; |
590 | | } N; |
591 | | struct { |
592 | | int pos; |
593 | | int code; |
594 | | int len; |
595 | | } P; |
596 | | struct { |
597 | | int pos; |
598 | | int code; |
599 | | int len; |
600 | | } H; |
601 | | } cram_feature; |
602 | | |
603 | | /* |
604 | | * A slice is really just a set of blocks, but it |
605 | | * is the logical unit for decoding a number of |
606 | | * sequences. |
607 | | */ |
608 | | struct cram_slice { |
609 | | cram_block_slice_hdr *hdr; |
610 | | cram_block *hdr_block; |
611 | | cram_block **block; |
612 | | cram_block **block_by_id; |
613 | | |
614 | | /* State used during encoding/decoding */ |
615 | | int64_t last_apos, max_apos; |
616 | | |
617 | | /* Array of decoded cram records */ |
618 | | cram_record *crecs; |
619 | | |
620 | | /* An dynamically growing buffers for data pointed |
621 | | * to by crecs[] array. |
622 | | */ |
623 | | uint32_t *cigar; |
624 | | uint32_t cigar_alloc; |
625 | | uint32_t ncigar; |
626 | | |
627 | | cram_feature *features; |
628 | | uint32_t nfeatures; |
629 | | uint32_t afeatures; // allocated size of features |
630 | | |
631 | | #ifndef TN_external |
632 | | // TN field (Tag Name) |
633 | | uint32_t *TN; |
634 | | int nTN, aTN; // used and allocated size for TN[] |
635 | | #else |
636 | | cram_block *tn_blk; |
637 | | int tn_id; |
638 | | #endif |
639 | | |
640 | | // For variable sized elements which are always external blocks. |
641 | | cram_block *name_blk; |
642 | | cram_block *seqs_blk; |
643 | | cram_block *qual_blk; |
644 | | cram_block *base_blk; |
645 | | cram_block *soft_blk; |
646 | | cram_block *aux_blk; // BAM aux block, created while decoding CRAM |
647 | | |
648 | | string_alloc_t *pair_keys; // Pooled keys for pair hash. |
649 | | khash_t(m_s2i) *pair[2]; // for identifying read-pairs in this slice. |
650 | | |
651 | | char *ref; // slice of current reference |
652 | | hts_pos_t ref_start; // start position of current reference; |
653 | | hts_pos_t ref_end; // end position of current reference; |
654 | | int ref_id; |
655 | | |
656 | | // For going from BAM to CRAM; an array of auxiliary blocks per type |
657 | | int naux_block; |
658 | | cram_block **aux_block; |
659 | | |
660 | | unsigned int data_series; // See cram_fields enum |
661 | | int decode_md; |
662 | | |
663 | | int max_rec, curr_rec; // current and max recs per slice |
664 | | int slice_num; // To be copied into c->curr_slice in decode |
665 | | }; |
666 | | |
667 | | /*----------------------------------------------------------------------------- |
668 | | * Consider moving reference handling to cram_refs.[ch] |
669 | | */ |
670 | | // from fa.fai / samtools faidx files |
671 | | typedef struct ref_entry { |
672 | | char *name; |
673 | | char *fn; |
674 | | int64_t length; // if 0 this indicates we haven't loaded it yet |
675 | | int64_t LN_length; // @SQ LN length, used to trim consensus ref |
676 | | int64_t offset; |
677 | | int bases_per_line; |
678 | | int line_length; |
679 | | int64_t count; // for shared references so we know to dealloc seq |
680 | | char *seq; |
681 | | mFILE *mf; |
682 | | int is_md5; // Reference comes from a raw seq found by MD5 |
683 | | int validated_md5; |
684 | | } ref_entry; |
685 | | |
686 | | KHASH_MAP_INIT_STR(refs, ref_entry*) |
687 | | |
688 | | // References structure. |
689 | | struct refs_t { |
690 | | string_alloc_t *pool; // String pool for holding filenames and SN vals |
691 | | |
692 | | khash_t(refs) *h_meta; // ref_entry*, index by name |
693 | | ref_entry **ref_id; // ref_entry*, index by ID |
694 | | int nref; // number of ref_entry |
695 | | |
696 | | char *fn; // current file opened |
697 | | BGZF *fp; // and the hFILE* to go with it. |
698 | | |
699 | | int count; // how many cram_fd sharing this refs struct |
700 | | |
701 | | pthread_mutex_t lock; // Mutex for multi-threaded updating |
702 | | ref_entry *last; // Last queried sequence |
703 | | int last_id; // Used in cram_ref_decr_locked to delay free |
704 | | }; |
705 | | |
706 | | /*----------------------------------------------------------------------------- |
707 | | * CRAM index |
708 | | * |
709 | | * Detect format by number of entries per line. |
710 | | * 5 => 1.0 (refid, start, nseq, C offset, slice) |
711 | | * 6 => 1.1 (refid, start, span, C offset, S offset, S size) |
712 | | * |
713 | | * Indices are stored in a nested containment list, which is trivial to set |
714 | | * up as the indices are on sorted data so we're appending to the nclist |
715 | | * in sorted order. Basically if a slice entirely fits within a previous |
716 | | * slice then we append to that slices list. This is done recursively. |
717 | | * |
718 | | * Lists are sorted on two dimensions: ref id + slice coords. |
719 | | */ |
720 | | typedef struct cram_index { |
721 | | int nslice, nalloc; // total number of slices |
722 | | struct cram_index *e; // array of size nslice |
723 | | |
724 | | int refid; // 1.0 1.1 |
725 | | int start; // 1.0 1.1 |
726 | | int end; // 1.1 |
727 | | int nseq; // 1.0 - undocumented |
728 | | int slice; // 1.0 landmark index, 1.1 landmark value |
729 | | int len; // 1.1 - size of slice in bytes |
730 | | int64_t offset; // 1.0 1.1 |
731 | | |
732 | | // Linked list of cram_index entries. Used to convert recursive |
733 | | // NCList back to a linear list. |
734 | | struct cram_index *e_next; |
735 | | } cram_index; |
736 | | |
737 | | typedef struct { |
738 | | int refid; |
739 | | int64_t start; |
740 | | int64_t end; |
741 | | } cram_range; |
742 | | |
743 | | /*----------------------------------------------------------------------------- |
744 | | */ |
745 | | /* CRAM File handle */ |
746 | | |
747 | | typedef struct spare_bams { |
748 | | bam_seq_t **bams; |
749 | | struct spare_bams *next; |
750 | | } spare_bams; |
751 | | |
752 | | struct cram_fd; |
753 | | typedef struct varint_vec { |
754 | | // Returns number of bytes decoded from fd, 0 on error |
755 | | int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); |
756 | | int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); |
757 | | int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); |
758 | | |
759 | | // Returns the value and increments *cp. Sets err to 1 iff an error occurs. |
760 | | // NOTE: Does not set err to 0 on success. |
761 | | int64_t (*varint_get32) (char **cp, const char *endp, int *err); |
762 | | int64_t (*varint_get32s)(char **cp, const char *endp, int *err); |
763 | | int64_t (*varint_get64) (char **cp, const char *endp, int *err); |
764 | | int64_t (*varint_get64s)(char **cp, const char *endp, int *err); |
765 | | |
766 | | // Returns the number of bytes written, <= 0 on error. |
767 | | int (*varint_put32) (char *cp, char *endp, int32_t val_p); |
768 | | int (*varint_put32s)(char *cp, char *endp, int32_t val_p); |
769 | | int (*varint_put64) (char *cp, char *endp, int64_t val_p); |
770 | | int (*varint_put64s)(char *cp, char *endp, int64_t val_p); |
771 | | |
772 | | // Returns the number of bytes written, <= 0 on error. |
773 | | int (*varint_put32_blk) (cram_block *blk, int32_t val_p); |
774 | | int (*varint_put32s_blk)(cram_block *blk, int32_t val_p); |
775 | | int (*varint_put64_blk) (cram_block *blk, int64_t val_p); |
776 | | int (*varint_put64s_blk)(cram_block *blk, int64_t val_p); |
777 | | |
778 | | // Returns number of bytes needed to encode 'val' |
779 | | int (*varint_size)(int64_t val); |
780 | | } varint_vec; |
781 | | |
782 | | struct cram_fd { |
783 | | struct hFILE *fp; |
784 | | int mode; // 'r' or 'w' |
785 | | int version; |
786 | | cram_file_def *file_def; |
787 | | sam_hdr_t *header; |
788 | | |
789 | | char *prefix; |
790 | | int64_t record_counter; |
791 | | int err; |
792 | | |
793 | | // Most recent compression header decoded |
794 | | //cram_block_compression_hdr *comp_hdr; |
795 | | //cram_block_slice_hdr *slice_hdr; |
796 | | |
797 | | // Current container being processed |
798 | | cram_container *ctr; |
799 | | |
800 | | // Current container used for decoder threads |
801 | | cram_container *ctr_mt; |
802 | | |
803 | | // positions for encoding or decoding |
804 | | int first_base, last_base; // copied to container |
805 | | |
806 | | // cached reference portion |
807 | | refs_t *refs; // ref meta-data structure |
808 | | char *ref, *ref_free; // current portion held in memory |
809 | | int ref_id; // copied to container |
810 | | hts_pos_t ref_start; // copied to container |
811 | | hts_pos_t ref_end; // copied to container |
812 | | char *ref_fn; // reference fasta filename |
813 | | |
814 | | // compression level and metrics |
815 | | int level; |
816 | | cram_metrics *m[DS_END]; |
817 | | khash_t(m_metrics) *tags_used; // cram_metrics[], per tag types in use. |
818 | | |
819 | | // options |
820 | | int decode_md; // Whether to export MD and NM tags |
821 | | int seqs_per_slice; |
822 | | int bases_per_slice; |
823 | | int slices_per_container; |
824 | | int embed_ref; // copied to container |
825 | | int no_ref; // copied to container |
826 | | int no_ref_counter; // decide if permanent switch |
827 | | int ignore_md5; |
828 | | int use_bz2; |
829 | | int use_rans; |
830 | | int use_lzma; |
831 | | int use_fqz; |
832 | | int use_tok; |
833 | | int use_arith; |
834 | | int shared_ref; |
835 | | unsigned int required_fields; |
836 | | int store_md; |
837 | | int store_nm; |
838 | | cram_range range; |
839 | | |
840 | | // lookup tables, stored here so we can be trivially multi-threaded |
841 | | unsigned int bam_flag_swap[0x1000]; // cram -> bam flags |
842 | | unsigned int cram_flag_swap[0x1000];// bam -> cram flags |
843 | | unsigned char L1[256]; // ACGT{*} ->0123{4} |
844 | | unsigned char L2[256]; // ACGTN{*}->01234{5} |
845 | | char cram_sub_matrix[32][32]; // base substitution codes |
846 | | |
847 | | int index_sz; |
848 | | cram_index *index; // array, sizeof index_sz |
849 | | off_t first_container; |
850 | | off_t curr_position; |
851 | | int eof; |
852 | | int last_slice; // number of recs encoded in last slice |
853 | | int last_RI_count; // number of references encoded in last container |
854 | | int multi_seq; // -1 is auto, 0 is one ref per container, 1 is multi... |
855 | | int multi_seq_user; // Original user setting (CRAM_OPT_MULTI_SEQ_PER_SLICE) |
856 | | int unsorted; |
857 | | int last_mapped; // number of mapped reads in last container |
858 | | int empty_container; // Marker for EOF block |
859 | | |
860 | | // thread pool |
861 | | int own_pool; |
862 | | hts_tpool *pool; |
863 | | hts_tpool_process *rqueue; |
864 | | pthread_mutex_t metrics_lock; |
865 | | pthread_mutex_t ref_lock; |
866 | | pthread_mutex_t range_lock; |
867 | | spare_bams *bl; |
868 | | pthread_mutex_t bam_list_lock; |
869 | | void *job_pending; |
870 | | int ooc; // out of containers. |
871 | | |
872 | | int lossy_read_names; // boolean |
873 | | int tlen_approx; // max TLEN calculation offset. |
874 | | int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) |
875 | | |
876 | | BGZF *idxfp; // File pointer for on-the-fly index creation |
877 | | |
878 | | // variable integer decoding callbacks. |
879 | | // This changed in CRAM4.0 to a data-size agnostic encoding. |
880 | | varint_vec vv; |
881 | | |
882 | | // Force AP delta even on non positional sorted data. |
883 | | // This can be beneficial for pairs where pairs are nearby each other. |
884 | | // We suffer with delta to unrelated things (previous pair), but gain |
885 | | // in delta between them. (Ideal would be a per read setting.) |
886 | | int ap_delta; |
887 | | }; |
888 | | |
889 | | // Translation of required fields to cram data series |
890 | | enum cram_fields { |
891 | | CRAM_BF = 0x00000001, |
892 | | CRAM_AP = 0x00000002, |
893 | | CRAM_FP = 0x00000004, |
894 | | CRAM_RL = 0x00000008, |
895 | | CRAM_DL = 0x00000010, |
896 | | CRAM_NF = 0x00000020, |
897 | | CRAM_BA = 0x00000040, |
898 | | CRAM_QS = 0x00000080, |
899 | | CRAM_FC = 0x00000100, |
900 | | CRAM_FN = 0x00000200, |
901 | | CRAM_BS = 0x00000400, |
902 | | CRAM_IN = 0x00000800, |
903 | | CRAM_RG = 0x00001000, |
904 | | CRAM_MQ = 0x00002000, |
905 | | CRAM_TL = 0x00004000, |
906 | | CRAM_RN = 0x00008000, |
907 | | CRAM_NS = 0x00010000, |
908 | | CRAM_NP = 0x00020000, |
909 | | CRAM_TS = 0x00040000, |
910 | | CRAM_MF = 0x00080000, |
911 | | CRAM_CF = 0x00100000, |
912 | | CRAM_RI = 0x00200000, |
913 | | CRAM_RS = 0x00400000, |
914 | | CRAM_PD = 0x00800000, |
915 | | CRAM_HC = 0x01000000, |
916 | | CRAM_SC = 0x02000000, |
917 | | CRAM_BB = 0x04000000, |
918 | | CRAM_BB_len = 0x08000000, |
919 | | CRAM_QQ = 0x10000000, |
920 | | CRAM_QQ_len = 0x20000000, |
921 | | CRAM_aux= 0x40000000, |
922 | | CRAM_ALL= 0x7fffffff, |
923 | | }; |
924 | | |
925 | | // A CIGAR opcode, but not necessarily the implications of it. Eg FC/FP may |
926 | | // encode a base difference, but we don't need to know what it is for CIGAR. |
927 | | // If we have a soft-clip or insertion, we do need SC/IN though to know how |
928 | | // long that array is. |
929 | 0 | #define CRAM_CIGAR (CRAM_FN | CRAM_FP | CRAM_FC | CRAM_DL | CRAM_IN | \ |
930 | 0 | CRAM_SC | CRAM_HC | CRAM_PD | CRAM_RS | CRAM_RL | CRAM_BF) |
931 | | |
932 | 0 | #define CRAM_SEQ (CRAM_CIGAR | CRAM_BA | CRAM_BS | \ |
933 | 0 | CRAM_RL | CRAM_AP | CRAM_BB) |
934 | | |
935 | 0 | #define CRAM_QUAL (CRAM_CIGAR | CRAM_RL | CRAM_AP | CRAM_QS | CRAM_QQ) |
936 | | |
937 | | /* BF bitfields */ |
938 | | /* Corrected in 1.1. Use bam_flag_swap[bf] and BAM_* macros for 1.0 & 1.1 */ |
939 | 23.7M | #define CRAM_FPAIRED 256 |
940 | 23.7M | #define CRAM_FPROPER_PAIR 128 |
941 | 23.7M | #define CRAM_FUNMAP 64 |
942 | 23.7M | #define CRAM_FREVERSE 32 |
943 | 23.7M | #define CRAM_FREAD1 16 |
944 | 23.7M | #define CRAM_FREAD2 8 |
945 | 23.7M | #define CRAM_FSECONDARY 4 |
946 | 23.7M | #define CRAM_FQCFAIL 2 |
947 | 23.7M | #define CRAM_FDUP 1 |
948 | | |
949 | | #define DS_aux_S "\001" |
950 | | #define DS_aux_OQ_S "\002" |
951 | | #define DS_aux_BQ_S "\003" |
952 | | #define DS_aux_BD_S "\004" |
953 | | #define DS_aux_BI_S "\005" |
954 | | #define DS_aux_FZ_S "\006" |
955 | | #define DS_aux_oq_S "\007" |
956 | | #define DS_aux_os_S "\010" |
957 | | #define DS_aux_oz_S "\011" |
958 | | |
959 | 382 | #define CRAM_M_REVERSE 1 |
960 | 17.3k | #define CRAM_M_UNMAP 2 |
961 | | |
962 | | |
963 | | /* CF bitfields */ |
964 | 22.3M | #define CRAM_FLAG_PRESERVE_QUAL_SCORES (1<<0) |
965 | 22.3M | #define CRAM_FLAG_DETACHED (1<<1) |
966 | 27 | #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) |
967 | 7.23M | #define CRAM_FLAG_NO_SEQ (1<<3) |
968 | 27 | #define CRAM_FLAG_EXPLICIT_TLEN (1<<4) |
969 | 14.8M | #define CRAM_FLAG_MASK ((1<<5)-1) |
970 | | |
971 | | /* Internal only */ |
972 | 7.44M | #define CRAM_FLAG_STATS_ADDED (1<<30) |
973 | 0 | #define CRAM_FLAG_DISCARD_NAME (1U<<31) |
974 | | |
975 | | #ifdef __cplusplus |
976 | | } |
977 | | #endif |
978 | | |
979 | | #endif /* HTSLIB_CRAM_STRUCTS_H */ |