Coverage Report

Created: 2025-09-27 07:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/header.h
Line
Count
Source
1
/*
2
Copyright (c) 2013-2019,2025 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*! \file
32
 * SAM header parsing.
33
 *
34
 * These functions can be shared between SAM, BAM and CRAM file
35
 * formats as all three internally use the same string encoding for
36
 * header fields.
37
 */
38
39
40
#ifndef HEADER_H_
41
#define HEADER_H_
42
43
#include <stdarg.h>
44
45
#include "cram/string_alloc.h"
46
#include "cram/pooled_alloc.h"
47
48
#include "htslib/khash.h"
49
#include "htslib/kstring.h"
50
#include "htslib/sam.h"
51
#include "htslib/hts.h"
52
53
#ifdef __cplusplus
54
extern "C" {
55
#endif
56
57
/*! Make a single integer out of a two-letter type code */
58
3.87M
static inline khint32_t TYPEKEY(const char *type) {
59
3.87M
    unsigned int u0 = (unsigned char) type[0];
60
3.87M
    unsigned int u1 = (unsigned char) type[1];
61
3.87M
    return (u0 << 8) | u1;
62
3.87M
}
header.c:TYPEKEY
Line
Count
Source
58
3.87M
static inline khint32_t TYPEKEY(const char *type) {
59
3.87M
    unsigned int u0 = (unsigned char) type[0];
60
3.87M
    unsigned int u1 = (unsigned char) type[1];
61
3.87M
    return (u0 << 8) | u1;
62
3.87M
}
Unexecuted instantiation: hts.c:TYPEKEY
Unexecuted instantiation: sam.c:TYPEKEY
Unexecuted instantiation: cram_decode.c:TYPEKEY
Unexecuted instantiation: cram_encode.c:TYPEKEY
Unexecuted instantiation: cram_index.c:TYPEKEY
Unexecuted instantiation: cram_io.c:TYPEKEY
Unexecuted instantiation: cram_stats.c:TYPEKEY
Unexecuted instantiation: cram_codecs.c:TYPEKEY
63
64
/*
65
 * Proposed new SAM header parsing
66
67
1 @SQ ID:foo LN:100
68
2 @SQ ID:bar LN:200
69
3 @SQ ID:ram LN:300 UR:xyz
70
4 @RG ID:r ...
71
5 @RG ID:s ...
72
73
Hash table for 2-char @keys without dup entries.
74
If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
75
76
HASH("SQ")--\
77
            |
78
    (3) <-> 1 <-> 2 <-> 3 <-> (1)
79
80
HASH("RG")--\
81
            |
82
    (5) <-> 4 <-> 5 <-> (4)
83
84
Items stored in the hash values also form their own linked lists:
85
Ie SQ->ID(foo)->LN(100)
86
   SQ->ID(bar)->LN(200)
87
   SQ->ID(ram)->LN(300)->UR(xyz)
88
   RG->ID(r)
89
 */
90
91
/*! A single key:value pair on a header line
92
 *
93
 * These form a linked list and hold strings. The strings are
94
 * allocated from a string_alloc_t pool referenced in the master
95
 * sam_hrecs_t structure. Do not attempt to free, malloc or manipulate
96
 * these strings directly.
97
 */
98
typedef struct sam_hrec_tag_s {
99
    struct sam_hrec_tag_s *next; // next tag in the linked list
100
    const char *str;             // tag text; owned by the string_alloc_t pool, do not free
101
    int   len;                   // presumably the length of str -- TODO confirm
102
} sam_hrec_tag_t;
103
104
/*! The parsed version of the SAM header string.
105
 *
106
 * Each header type (SQ, RG, HD, etc) points to its own sam_hrec_type_t
107
 * struct via the main hash table h in the sam_hrecs_t struct.
108
 *
109
 * These in turn consist of circular bi-directional linked lists (ie
110
 * rings) to hold the multiple instances of the same header type
111
 * code. For example if we have 5 \@SQ lines the primary hash table
112
 * will key on \@SQ pointing to the first sam_hrec_type_t and that in turn
113
 * will be part of a ring of 5 elements.
114
 *
115
 * For each sam_hrec_type_t structure we also point to a sam_hrec_tag_t
116
 * structure which holds the tokenised attributes; the tab separated
117
 * key:value pairs per line.
118
 */
119
typedef struct sam_hrec_type_s {
120
    struct sam_hrec_type_s *next; // next line of the same type (circular list)
121
    struct sam_hrec_type_s *prev; // previous line of the same type (circular list)
122
    struct sam_hrec_type_s *global_next; // next line of any type (circular list of all lines)
123
    struct sam_hrec_type_s *global_prev; // previous line of any type (circular list of all lines)
124
    sam_hrec_tag_t *tag;          // first tag of this line's key:value list
125
    khint32_t type;               // Two-letter type code as an int (presumably as built by TYPEKEY() -- confirm)
126
} sam_hrec_type_t;
127
128
/*! Parsed \@SQ lines */
129
typedef struct {
130
    const char *name;     // presumably the SN: value of the line -- TODO confirm
131
    hts_pos_t len;        // presumably the LN: value of the line -- TODO confirm
132
    sam_hrec_type_t *ty;  // presumably the header line this entry was parsed from -- TODO confirm
133
} sam_hrec_sq_t;
134
135
/*! Parsed \@RG lines */
136
typedef struct {
137
    const char *name;     // presumably the ID: value of the line -- TODO confirm
138
    sam_hrec_type_t *ty;  // presumably the header line this entry was parsed from -- TODO confirm
139
    int name_len;         // presumably the length of name -- TODO confirm
140
    int id;           // numerical ID
141
} sam_hrec_rg_t;
142
143
/*! Parsed \@PG lines */
144
typedef struct {
145
    const char *name;     // presumably the ID: value of the line -- TODO confirm
146
    sam_hrec_type_t *ty;  // presumably the header line this entry was parsed from -- TODO confirm
147
    int name_len;         // presumably the length of name -- TODO confirm
148
    int id;           // numerical ID
149
    int prev_id;      // numerical ID of the previous \@PG (presumably via PP:); -1 if none
150
} sam_hrec_pg_t;
151
152
153
/*! Sort order parsed from @HD line */
154
enum sam_sort_order {
155
    ORDER_UNKNOWN  =-1, // no recognised sort order (presumably SO: absent or unrecognised -- confirm)
156
    ORDER_UNSORTED = 0, // presumably SO:unsorted -- confirm
157
    ORDER_NAME     = 1, // presumably SO:queryname -- confirm
158
    ORDER_COORD    = 2  // presumably SO:coordinate -- confirm
159
  //ORDER_COLLATE  = 3 // maybe one day!
160
};
161
162
enum sam_group_order { // Group order, as returned by sam_hrecs_group_order()
163
    ORDER_NONE      =-1, // presumably no grouping declared -- confirm
164
    ORDER_QUERY     = 0, // presumably grouped by query -- confirm
165
    ORDER_REFERENCE = 1  // presumably grouped by reference -- confirm
166
};
167
168
KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*) // khash map "sam_hrecs_t": integer key -> sam_hrec_type_t*
169
KHASH_MAP_INIT_STR(m_s2i, int) // khash map "m_s2i": string key -> int
170
171
/*! Primary structure for header manipulation
172
 *
173
 * The initial header text is held in the text kstring_t, but is also
174
 * parsed out into SQ, RG and PG arrays. These have a hash table
175
 * associated with each to allow lookup by ID or SN fields instead of
176
 * their numeric array indices. Additionally PG has an array to hold
177
 * the linked list start points (the last in a PP chain).
178
 *
179
 * Use the appropriate sam_hdr_* functions to edit the header, and
180
 * call sam_hdr_rebuild() any time the textual form needs to be
181
 * updated again.
182
 */
183
struct sam_hrecs_t {
184
    khash_t(sam_hrecs_t) *h;     //!< Maps integer type code to the first sam_hrec_type_t of that type
185
    sam_hrec_type_t *first_line; //!< First line (usually @HD)
186
    string_alloc_t *str_pool; //!< Pool of sam_hrec_tag_t->str strings
187
    pool_alloc_t   *type_pool; //!< Pool of sam_hrec_type_t structs
188
    pool_alloc_t   *tag_pool; //!< Pool of sam_hrec_tag_t structs
189
190
    // @SQ lines / references
191
    int nref;                  //!< Number of \@SQ lines
192
    int ref_sz;                //!< Number of entries available in ref[]
193
    sam_hrec_sq_t *ref;        //!< Array of parsed \@SQ lines
194
    khash_t(m_s2i) *ref_hash;  //!< Maps SQ SN field to ref[] index
195
196
    // @RG lines / read-groups
197
    int nrg;                   //!< Number of \@RG lines
198
    int rg_sz;                 //!< Number of entries available in rg[]
199
    sam_hrec_rg_t *rg;         //!< Array of parsed \@RG lines
200
    khash_t(m_s2i) *rg_hash;   //!< Maps RG ID field to rg[] index
201
202
    // @PG lines / programs
203
    int npg;                   //!< Number of \@PG lines
204
    int pg_sz;                 //!< Number of entries available in pg[]
205
    int npg_end;               //!< Number of terminating \@PG lines
206
    int npg_end_alloc;         //!< Size of pg_end field
207
    sam_hrec_pg_t *pg;         //!< Array of parsed \@PG lines
208
    khash_t(m_s2i) *pg_hash;   //!< Maps PG ID field to pg[] index
209
    int *pg_end;               //!< \@PG chain termination IDs
210
211
    // @cond internal
212
    char *ID_buf;             // temporary buffer for sam_hdr_pg_id
213
    uint32_t ID_buf_sz;       // presumably the allocated size of ID_buf -- TODO confirm
214
    int ID_cnt;               // NOTE(review): looks like a counter used with ID_buf -- confirm
215
    // @endcond
216
217
    int dirty;                // marks the header as modified, so it can be rebuilt
218
    int refs_changed;         // Index of first changed ref (-1 if unchanged)
219
    int pgs_changed;          // New PG line added
220
    int type_count;           // presumably the number of entries in type_order -- TODO confirm
221
    char (*type_order)[3];    // array of 3-char entries; presumably two-letter type codes plus NUL -- TODO confirm
222
};
223
224
/*!
225
 * Method for parsing the header text and populating the
226
 * internal hash tables. After calling this method, the
227
 * parsed representation becomes the single source of truth.
228
 *
229
 * @param bh    Header structure, previously initialised by a
230
 *              sam_hdr_init call
231
 * @return      0 on success, -1 on failure
232
 */
233
int sam_hdr_fill_hrecs(sam_hdr_t *bh);
234
235
/*!
236
 * Reconstructs the text representation of the header from
237
 * the hash table data after a change has been performed on
238
 * the header.
239
 *
240
 * @return  0 on success, -1 on failure
241
 */
242
int sam_hdr_rebuild(sam_hdr_t *bh);
243
244
/*! Creates an empty SAM header, ready to be populated.
245
 *
246
 * @return
247
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
248
 *         NULL on failure
249
 */
250
sam_hrecs_t *sam_hrecs_new(void);
251
252
/*! Produces a duplicate copy of hrecs and returns it.
253
 * @return
254
 * Returns NULL on failure
255
 */
256
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs);
257
258
/*! Update sam_hdr_t target_name and target_len arrays
259
 *
260
 *  sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup
261
 *  can use it to construct target arrays from the source header.
262
 *
263
 *  @return 0 on success; -1 on failure
264
 */
265
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
266
                                 int refs_changed);
267
268
/*! Populate sam_hdr_t from SAM file content
269
 *
270
 * @param hdr empty header struct to be filled
271
 * @param fp  File to read from
272
 * @return 0 on success
273
 *        -1 on failure
274
 *
275
 * This function is used to build the header structure when reading SAM files.
276
 * The lines read from the file are used to create sam_hrecs_t structures.
277
 * The function also populates sam_hdr_t::text, sam_hdr_t::l_text,
278
 * sam_hdr_t::target_name and sam_hdr_t::target_len.
279
 */
280
int sam_hdr_build_from_sam_file(sam_hdr_t *hdr, htsFile* fp);
281
282
/*! Reconstructs a kstring from the header hash table.
283
 *
284
 * @return
285
 * Returns 0 on success
286
 *        -1 on failure
287
 */
288
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks);
289
290
/*! Deallocates all storage used by a sam_hrecs_t struct.
291
 *
292
 * This also decrements the header reference count. If after decrementing
293
 * it is still non-zero then the header is assumed to be in use by another
294
 * caller and the free is not done.
295
 */
296
void sam_hrecs_free(sam_hrecs_t *hrecs);
297
298
/*!
299
 * @return
300
 * Returns the first header item matching 'type'. If ID_key is non-NULL it
301
 * checks for the tag named by ID_key and compares its value against ID_value.
302
 * NOTE(review): ID_key/ID_value roles inferred from the parameter names -- confirm.
303
 * Returns NULL if no type/ID is found
304
 */
305
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
306
                                     const char *ID_key, const char *ID_value);
307
308
/*! Looks up a tag by key on a single header line.
 *
 * NOTE(review): contract inferred from the signature only -- presumably
 * 'key' is the two-letter tag name and '*prev', when supplied, receives the
 * tag preceding the match. Confirm against the implementation.
 *
 * @return
 * Presumably the matching tag, or NULL if there is none -- TODO confirm
 */
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
309
                                   const char *key,
310
                                   sam_hrec_tag_t **prev);
311
312
/*! Removes the tag named 'key' from a single header line.
 *
 * NOTE(review): contract inferred from the name and signature only; the
 * meaning of the int result is not visible here. Confirm against the
 * implementation.
 */
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
313
                         sam_hrec_type_t *type,
314
                         const char *key);
315
316
/*! Looks up a read-group by name and returns a pointer to the start of the
317
 * associated tag list.
318
 *
319
 * @return
320
 * Returns NULL on failure
321
 */
322
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg);
323
324
/*! Returns the sort order from the @HD SO: field */
325
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs);
326
327
/*! Returns the group order from the @HD GO: field */
328
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs);
329
330
#ifdef __cplusplus
331
}
332
#endif
333
334
#endif /* HEADER_H_ */