Coverage Report

Created: 2025-07-12 06:16

/src/htslib/header.h
Line
Count
Source
1
/*
2
Copyright (c) 2013-2019 Genome Research Ltd.
3
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are met:
7
8
   1. Redistributions of source code must retain the above copyright notice,
9
this list of conditions and the following disclaimer.
10
11
   2. Redistributions in binary form must reproduce the above copyright notice,
12
this list of conditions and the following disclaimer in the documentation
13
and/or other materials provided with the distribution.
14
15
   3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
16
Institute nor the names of its contributors may be used to endorse or promote
17
products derived from this software without specific prior written permission.
18
19
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
20
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
23
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
*/
30
31
/*! \file
32
 * SAM header parsing.
33
 *
34
 * These functions can be shared between SAM, BAM and CRAM file
35
 * formats as all three internally use the same string encoding for
36
 * header fields.
37
 */
38
39
40
#ifndef HEADER_H_
41
#define HEADER_H_
42
43
#include <stdarg.h>
44
45
#include "cram/string_alloc.h"
46
#include "cram/pooled_alloc.h"
47
48
#include "htslib/khash.h"
49
#include "htslib/kstring.h"
50
#include "htslib/sam.h"
51
52
#ifdef __cplusplus
53
extern "C" {
54
#endif
55
56
/*! Make a single integer out of a two-letter type code */
57
486k
static inline khint32_t TYPEKEY(const char *type) {
58
486k
    unsigned int u0 = (unsigned char) type[0];
59
486k
    unsigned int u1 = (unsigned char) type[1];
60
486k
    return (u0 << 8) | u1;
61
486k
}
header.c:TYPEKEY
Line
Count
Source
57
486k
static inline khint32_t TYPEKEY(const char *type) {
58
486k
    unsigned int u0 = (unsigned char) type[0];
59
486k
    unsigned int u1 = (unsigned char) type[1];
60
486k
    return (u0 << 8) | u1;
61
486k
}
Unexecuted instantiation: hts.c:TYPEKEY
Unexecuted instantiation: sam.c:TYPEKEY
Unexecuted instantiation: cram_decode.c:TYPEKEY
Unexecuted instantiation: cram_encode.c:TYPEKEY
Unexecuted instantiation: cram_index.c:TYPEKEY
Unexecuted instantiation: cram_io.c:TYPEKEY
Unexecuted instantiation: cram_stats.c:TYPEKEY
Unexecuted instantiation: cram_codecs.c:TYPEKEY
62
63
/*
64
 * Proposed new SAM header parsing
65
66
1 @SQ ID:foo LN:100
67
2 @SQ ID:bar LN:200
68
3 @SQ ID:ram LN:300 UR:xyz
69
4 @RG ID:r ...
70
5 @RG ID:s ...
71
72
Hash table for 2-char @keys without dup entries.
73
If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
74
75
HASH("SQ")--\
76
            |
77
    (3) <-> 1 <-> 2 <-> 3 <-> (1)
78
79
HASH("RG")--\
80
            |
81
    (5) <-> 4 <-> 5 <-> (4)
82
83
Items stored in the hash values also form their own linked lists:
84
Ie SQ->ID(foo)->LN(100)
85
   SQ->ID(bar)->LN(200)
86
   SQ->ID(ram)->LN(300)->UR(xyz)
87
   RG->ID(r)
88
 */
89
90
/*! A single key:value pair on a header line
91
 *
92
 * These form a linked list and hold strings. The strings are
93
 * allocated from a string_alloc_t pool referenced in the master
94
 * sam_hrecs_t structure. Do not attempt to free, malloc or manipulate
95
 * these strings directly.
96
 */
97
typedef struct sam_hrec_tag_s {
98
    struct sam_hrec_tag_s *next;
99
    const char *str;
100
    int   len;
101
} sam_hrec_tag_t;
102
103
/*! The parsed version of the SAM header string.
104
 *
105
 * Each header type (SQ, RG, HD, etc) points to its own sam_hrec_type_t
106
 * struct via the main hash table h in the sam_hrecs_t struct.
107
 *
108
 * These in turn consist of circular bi-directional linked lists (ie
109
 * rings) to hold the multiple instances of the same header type
110
 * code. For example if we have 5 \@SQ lines the primary hash table
111
 * will key on \@SQ pointing to the first sam_hrec_type_t and that in turn
112
 * will be part of a ring of 5 elements.
113
 *
114
 * For each sam_hrec_type_t structure we also point to a sam_hrec_tag_t
115
 * structure which holds the tokenised attributes; the tab separated
116
 * key:value pairs per line.
117
 */
118
typedef struct sam_hrec_type_s {
119
    struct sam_hrec_type_s *next; // circular list of this type
120
    struct sam_hrec_type_s *prev; // circular list of this type
121
    struct sam_hrec_type_s *global_next; // circular list of all lines
122
    struct sam_hrec_type_s *global_prev; // circular list of all lines
123
    sam_hrec_tag_t *tag;          // first tag
124
    khint32_t type;               // Two-letter type code as an int
125
} sam_hrec_type_t;
126
127
/*! Parsed \@SQ lines */
128
typedef struct {
129
    const char *name;
130
    hts_pos_t len;
131
    sam_hrec_type_t *ty;
132
} sam_hrec_sq_t;
133
134
/*! Parsed \@RG lines */
135
typedef struct {
136
    const char *name;
137
    sam_hrec_type_t *ty;
138
    int name_len;
139
    int id;           // numerical ID
140
} sam_hrec_rg_t;
141
142
/*! Parsed \@PG lines */
143
typedef struct {
144
    const char *name;
145
    sam_hrec_type_t *ty;
146
    int name_len;
147
    int id;           // numerical ID
148
    int prev_id;      // -1 if none
149
} sam_hrec_pg_t;
150
151
152
/*! Sort order parsed from \@HD line */
153
enum sam_sort_order {
154
    ORDER_UNKNOWN  =-1,
155
    ORDER_UNSORTED = 0,
156
    ORDER_NAME     = 1,
157
    ORDER_COORD    = 2
158
  //ORDER_COLLATE  = 3 // maybe one day!
159
};
160
161
enum sam_group_order {
162
    ORDER_NONE      =-1,
163
    ORDER_QUERY     = 0,
164
    ORDER_REFERENCE = 1
165
};
166
167
KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*)
168
KHASH_MAP_INIT_STR(m_s2i, int)
169
170
/*! Primary structure for header manipulation
171
 *
172
 * The initial header text is held in the text kstring_t, but is also
173
 * parsed out into SQ, RG and PG arrays. These have a hash table
174
 * associated with each to allow lookup by ID or SN fields instead of
175
 * their numeric array indices. Additionally PG has an array to hold
176
 * the linked list start points (the last in a PP chain).
177
 *
178
 * Use the appropriate sam_hdr_* functions to edit the header, and
179
 * call sam_hdr_rebuild() any time the textual form needs to be
180
 * updated again.
181
 */
182
struct sam_hrecs_t {
183
    khash_t(sam_hrecs_t) *h;
184
    sam_hrec_type_t *first_line; //!< First line (usually @HD)
185
    string_alloc_t *str_pool; //!< Pool of sam_hdr_tag->str strings
186
    pool_alloc_t   *type_pool;//!< Pool of sam_hdr_type structs
187
    pool_alloc_t   *tag_pool; //!< Pool of sam_hdr_tag structs
188
189
    // @SQ lines / references
190
    int nref;                  //!< Number of \@SQ lines
191
    int ref_sz;                //!< Number of entries available in ref[]
192
    sam_hrec_sq_t *ref;        //!< Array of parsed \@SQ lines
193
    khash_t(m_s2i) *ref_hash;  //!< Maps SQ SN field to ref[] index
194
195
    // @RG lines / read-groups
196
    int nrg;                   //!< Number of \@RG lines
197
    int rg_sz;                 //!< number of entries available in rg[]
198
    sam_hrec_rg_t *rg;         //!< Array of parsed \@RG lines
199
    khash_t(m_s2i) *rg_hash;   //!< Maps RG ID field to rg[] index
200
201
    // @PG lines / programs
202
    int npg;                   //!< Number of \@PG lines
203
    int pg_sz;                //!< Number of entries available in pg[]
204
    int npg_end;               //!< Number of terminating \@PG lines
205
    int npg_end_alloc;         //!< Size of pg_end field
206
    sam_hrec_pg_t *pg;         //!< Array of parsed \@PG lines
207
    khash_t(m_s2i) *pg_hash;   //!< Maps PG ID field to pg[] index
208
    int *pg_end;               //!< \@PG chain termination IDs
209
210
    // @cond internal
211
    char *ID_buf;             // temporary buffer for sam_hdr_pg_id
212
    uint32_t ID_buf_sz;
213
    int ID_cnt;
214
    // @endcond
215
216
    int dirty;                // marks the header as modified, so it can be rebuilt
217
    int refs_changed;         // Index of first changed ref (-1 if unchanged)
218
    int pgs_changed;          // New PG line added
219
    int type_count;
220
    char (*type_order)[3];
221
};
222
223
/*!
224
 * Method for parsing the header text and populating the
225
 * internal hash tables. After calling this method, the
226
 * parsed representation becomes the single source of truth.
227
 *
228
 * @param bh    Header structure, previously initialised by a
229
 *              sam_hdr_init call
230
 * @return      0 on success, -1 on failure
231
 */
232
int sam_hdr_fill_hrecs(sam_hdr_t *bh);
233
234
/*!
235
 * Reconstructs the text representation of the header from
236
 * the hash table data after a change has been performed on
237
 * the header.
238
 *
239
 * @return  0 on success, -1 on failure
240
 */
241
int sam_hdr_rebuild(sam_hdr_t *bh);
242
243
/*! Creates an empty SAM header, ready to be populated.
244
 *
245
 * @return
246
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
247
 *         NULL on failure
248
 */
249
sam_hrecs_t *sam_hrecs_new(void);
250
251
/*! Produces a duplicate copy of hrecs and returns it.
252
 * @return
253
 * Returns NULL on failure
254
 */
255
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs);
256
257
/*! Update sam_hdr_t target_name and target_len arrays
258
 *
259
 *  sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup
260
 *  can use it to construct target arrays from the source header.
261
 *
262
 *  @return 0 on success; -1 on failure
263
 */
264
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
265
                                 int refs_changed);
266
267
/*! Reconstructs a kstring from the header hash table.
268
 *
269
 * @return
270
 * Returns 0 on success
271
 *        -1 on failure
272
 */
273
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks);
274
275
/*! Deallocates all storage used by a sam_hrecs_t struct.
276
 *
277
 * This also decrements the header reference count. If after decrementing
278
 * it is still non-zero then the header is assumed to be in use by another
279
 * caller and the free is not done.
280
 */
281
void sam_hrecs_free(sam_hrecs_t *hrecs);
282
283
/*!
284
 * @return
285
 * Returns the first header item matching 'type'. If ID_key is non-NULL it
286
 * checks for a tag with that key and compares its value against ID_value.
287
 *
288
 * Returns NULL if no type/ID is found
289
 */
290
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
291
                                     const char *ID_key, const char *ID_value);
292
293
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
294
                                   const char *key,
295
                                   sam_hrec_tag_t **prev);
296
297
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
298
                         sam_hrec_type_t *type,
299
                         const char *key);
300
301
/*! Looks up a read-group by name and returns a pointer to its parsed
302
 * sam_hrec_rg_t entry.
303
 *
304
 * @return
305
 * Returns NULL on failure
306
 */
307
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg);
308
309
/*! Returns the sort order from the \@HD SO: field */
310
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs);
311
312
/*! Returns the group order from the \@HD GO: field */
313
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs);
314
315
#ifdef __cplusplus
316
}
317
#endif
318
319
#endif /* HEADER_H_ */