Line | Count | Source |
1 | | /* |
2 | | Copyright (c) 2013-2019 Genome Research Ltd. |
3 | | Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk> |
4 | | |
5 | | Redistribution and use in source and binary forms, with or without |
6 | | modification, are permitted provided that the following conditions are met: |
7 | | |
8 | | 1. Redistributions of source code must retain the above copyright notice, |
9 | | this list of conditions and the following disclaimer. |
10 | | |
11 | | 2. Redistributions in binary form must reproduce the above copyright notice, |
12 | | this list of conditions and the following disclaimer in the documentation |
13 | | and/or other materials provided with the distribution. |
14 | | |
15 | | 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger |
16 | | Institute nor the names of its contributors may be used to endorse or promote |
17 | | products derived from this software without specific prior written permission. |
18 | | |
19 | | THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND |
20 | | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
21 | | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
22 | | DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE |
23 | | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
24 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
25 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
26 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
27 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
28 | | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
29 | | */ |
30 | | |
31 | | /*! \file |
32 | | * SAM header parsing. |
33 | | * |
34 | | * These functions can be shared between SAM, BAM and CRAM file |
35 | | * formats as all three internally use the same string encoding for |
36 | | * header fields. |
37 | | */ |
38 | | |
39 | | |
40 | | #ifndef HEADER_H_ |
41 | | #define HEADER_H_ |
42 | | |
43 | | #include <stdarg.h> |
44 | | |
45 | | #include "cram/string_alloc.h" |
46 | | #include "cram/pooled_alloc.h" |
47 | | |
48 | | #include "htslib/khash.h" |
49 | | #include "htslib/kstring.h" |
50 | | #include "htslib/sam.h" |
51 | | |
52 | | #ifdef __cplusplus |
53 | | extern "C" { |
54 | | #endif |
55 | | |
56 | | /*! Make a single integer out of a two-letter type code */ |
57 | 486k | static inline khint32_t TYPEKEY(const char *type) { |
58 | 486k | unsigned int u0 = (unsigned char) type[0]; |
59 | 486k | unsigned int u1 = (unsigned char) type[1]; |
60 | 486k | return (u0 << 8) | u1; |
61 | 486k | } Line | Count | Source | 57 | 486k | static inline khint32_t TYPEKEY(const char *type) { | 58 | 486k | unsigned int u0 = (unsigned char) type[0]; | 59 | 486k | unsigned int u1 = (unsigned char) type[1]; | 60 | 486k | return (u0 << 8) | u1; | 61 | 486k | } |
Unexecuted instantiation: hts.c:TYPEKEY Unexecuted instantiation: sam.c:TYPEKEY Unexecuted instantiation: cram_decode.c:TYPEKEY Unexecuted instantiation: cram_encode.c:TYPEKEY Unexecuted instantiation: cram_index.c:TYPEKEY Unexecuted instantiation: cram_io.c:TYPEKEY Unexecuted instantiation: cram_stats.c:TYPEKEY Unexecuted instantiation: cram_codecs.c:TYPEKEY |
62 | | |
63 | | /* |
64 | | * Proposed new SAM header parsing |
65 | | |
66 | | 1 @SQ ID:foo LN:100 |
67 | | 2 @SQ ID:bar LN:200 |
68 | | 3 @SQ ID:ram LN:300 UR:xyz |
69 | | 4 @RG ID:r ... |
70 | | 5 @RG ID:s ... |
71 | | |
72 | | Hash table for 2-char @keys without dup entries. |
73 | | If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}. |
74 | | |
75 | | HASH("SQ")--\ |
76 | | | |
77 | | (3) <-> 1 <-> 2 <-> 3 <-> (1) |
78 | | |
79 | | HASH("RG")--\ |
80 | | | |
81 | | (5) <-> 4 <-> 5 <-> (4) |
82 | | |
83 | | Items stored in the hash values also form their own linked lists: |
84 | | Ie SQ->ID(foo)->LN(100) |
85 | | SQ->ID(bar)->LN(200) |
86 | | SQ->ID(ram)->LN(300)->UR(xyz) |
87 | | RG->ID(r) |
88 | | */ |
89 | | |
90 | | /*! A single key:value pair on a header line |
91 | | * |
92 | | * These form a linked list and hold strings. The strings are |
93 | | * allocated from a string_alloc_t pool referenced in the master |
94 | | * sam_hrecs_t structure. Do not attempt to free, malloc or manipulate |
95 | | * these strings directly. |
96 | | */ |
97 | | typedef struct sam_hrec_tag_s { |
98 | | struct sam_hrec_tag_s *next; |
99 | | const char *str; |
100 | | int len; |
101 | | } sam_hrec_tag_t; |
102 | | |
103 | | /*! The parsed version of the SAM header string. |
104 | | * |
105 | | * Each header type (SQ, RG, HD, etc) points to its own sam_hdr_type |
106 | | * struct via the main hash table h in the sam_hrecs_t struct. |
107 | | * |
108 | | * These in turn consist of circular bi-directional linked lists (ie |
109 | | * rings) to hold the multiple instances of the same header type |
110 | | * code. For example if we have 5 \@SQ lines the primary hash table |
111 | | * will key on \@SQ pointing to the first sam_hdr_type and that in turn |
112 | | * will be part of a ring of 5 elements. |
113 | | * |
114 | | * For each sam_hdr_type structure we also point to a sam_hdr_tag |
115 | | * structure which holds the tokenised attributes; the tab separated |
116 | | * key:value pairs per line. |
117 | | */ |
118 | | typedef struct sam_hrec_type_s { |
119 | | struct sam_hrec_type_s *next; // circular list of this type |
120 | | struct sam_hrec_type_s *prev; // circular list of this type |
121 | | struct sam_hrec_type_s *global_next; // circular list of all lines |
122 | | struct sam_hrec_type_s *global_prev; // circular list of all lines |
123 | | sam_hrec_tag_t *tag; // first tag |
124 | | khint32_t type; // Two-letter type code as an int |
125 | | } sam_hrec_type_t; |
126 | | |
127 | | /*! Parsed \@SQ lines */ |
128 | | typedef struct { |
129 | | const char *name; |
130 | | hts_pos_t len; |
131 | | sam_hrec_type_t *ty; |
132 | | } sam_hrec_sq_t; |
133 | | |
134 | | /*! Parsed \@RG lines */ |
135 | | typedef struct { |
136 | | const char *name; |
137 | | sam_hrec_type_t *ty; |
138 | | int name_len; |
139 | | int id; // numerical ID |
140 | | } sam_hrec_rg_t; |
141 | | |
142 | | /*! Parsed \@PG lines */ |
143 | | typedef struct { |
144 | | const char *name; |
145 | | sam_hrec_type_t *ty; |
146 | | int name_len; |
147 | | int id; // numerical ID |
148 | | int prev_id; // -1 if none |
149 | | } sam_hrec_pg_t; |
150 | | |
151 | | |
152 | | /*! Sort order parsed from @HD line */ |
153 | | enum sam_sort_order { |
154 | | ORDER_UNKNOWN =-1, |
155 | | ORDER_UNSORTED = 0, |
156 | | ORDER_NAME = 1, |
157 | | ORDER_COORD = 2 |
158 | | //ORDER_COLLATE = 3 // maybe one day! |
159 | | }; |
160 | | |
161 | | enum sam_group_order { |
162 | | ORDER_NONE =-1, |
163 | | ORDER_QUERY = 0, |
164 | | ORDER_REFERENCE = 1 |
165 | | }; |
166 | | |
167 | | KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*) |
168 | | KHASH_MAP_INIT_STR(m_s2i, int) |
169 | | |
170 | | /*! Primary structure for header manipulation |
171 | | * |
172 | | * The initial header text is held in the text kstring_t, but is also |
173 | | * parsed out into SQ, RG and PG arrays. These have a hash table |
174 | | * associated with each to allow lookup by ID or SN fields instead of |
175 | | * their numeric array indices. Additionally PG has an array to hold |
176 | | * the linked list start points (the last in a PP chain). |
177 | | * |
178 | | * Use the appropriate sam_hdr_* functions to edit the header, and |
179 | | * call sam_hdr_rebuild() any time the textual form needs to be |
180 | | * updated again. |
181 | | */ |
182 | | struct sam_hrecs_t { |
183 | | khash_t(sam_hrecs_t) *h; |
184 | | sam_hrec_type_t *first_line; //!< First line (usually @HD) |
185 | | string_alloc_t *str_pool; //!< Pool of sam_hdr_tag->str strings |
186 | | pool_alloc_t *type_pool;//!< Pool of sam_hdr_type structs |
187 | | pool_alloc_t *tag_pool; //!< Pool of sam_hdr_tag structs |
188 | | |
189 | | // @SQ lines / references |
190 | | int nref; //!< Number of \@SQ lines |
191 | | int ref_sz; //!< Number of entries available in ref[] |
192 | | sam_hrec_sq_t *ref; //!< Array of parsed \@SQ lines |
193 | | khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index |
194 | | |
195 | | // @RG lines / read-groups |
196 | | int nrg; //!< Number of \@RG lines |
197 | | int rg_sz; //!< number of entries available in rg[] |
198 | | sam_hrec_rg_t *rg; //!< Array of parsed \@RG lines |
199 | | khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index |
200 | | |
201 | | // @PG lines / programs |
202 | | int npg; //!< Number of \@PG lines |
203 | | int pg_sz; //!< Number of entries available in pg[] |
204 | | int npg_end; //!< Number of terminating \@PG lines |
205 | | int npg_end_alloc; //!< Size of pg_end field |
206 | | sam_hrec_pg_t *pg; //!< Array of parsed \@PG lines |
207 | | khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index |
208 | | int *pg_end; //!< \@PG chain termination IDs |
209 | | |
210 | | // @cond internal |
211 | | char *ID_buf; // temporary buffer for sam_hdr_pg_id |
212 | | uint32_t ID_buf_sz; |
213 | | int ID_cnt; |
214 | | // @endcond |
215 | | |
216 | | int dirty; // marks the header as modified, so it can be rebuilt |
217 | | int refs_changed; // Index of first changed ref (-1 if unchanged) |
218 | | int pgs_changed; // New PG line added |
219 | | int type_count; |
220 | | char (*type_order)[3]; |
221 | | }; |
222 | | |
223 | | /*! |
224 | | * Method for parsing the header text and populating the |
225 | | * internal hash tables. After calling this method, the |
226 | | * parsed representation becomes the single source of truth. |
227 | | * |
228 | | * @param bh Header structure, previously initialised by a |
229 | | * sam_hdr_init call |
230 | | * @return 0 on success, -1 on failure |
231 | | */ |
232 | | int sam_hdr_fill_hrecs(sam_hdr_t *bh); |
233 | | |
234 | | /*! |
235 | | * Reconstructs the text representation of the header from |
236 | | * the hash table data after a change has been performed on |
237 | | * the header. |
238 | | * |
239 | | * @return 0 on success, -1 on failure |
240 | | */ |
241 | | int sam_hdr_rebuild(sam_hdr_t *bh); |
242 | | |
243 | | /*! Creates an empty SAM header, ready to be populated. |
244 | | * |
245 | | * @return |
246 | | * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free()) |
247 | | * NULL on failure |
248 | | */ |
249 | | sam_hrecs_t *sam_hrecs_new(void); |
250 | | |
251 | | /*! Produces a duplicate copy of hrecs and returns it. |
252 | | * @return |
253 | | * Returns NULL on failure |
254 | | */ |
255 | | sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs); |
256 | | |
257 | | /*! Update sam_hdr_t target_name and target_len arrays |
258 | | * |
259 | | * sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup |
260 | | * can use it to construct target arrays from the source header. |
261 | | * |
262 | | * @return 0 on success; -1 on failure |
263 | | */ |
264 | | int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, |
265 | | int refs_changed); |
266 | | |
267 | | /*! Reconstructs a kstring from the header hash table. |
268 | | * |
269 | | * @return |
270 | | * Returns 0 on success |
271 | | * -1 on failure |
272 | | */ |
273 | | int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks); |
274 | | |
275 | | /*! Deallocates all storage used by a sam_hrecs_t struct. |
276 | | * |
277 | | * This also decrements the header reference count. If after decrementing |
278 | | * it is still non-zero then the header is assumed to be in use by another |
279 | | * caller and the free is not done. |
280 | | */ |
281 | | void sam_hrecs_free(sam_hrecs_t *hrecs); |
282 | | |
283 | | /*! |
284 | | * @return |
285 | | * Returns the first header item matching 'type'. If ID is non-NULL it checks |
286 | | * for the tag ID: and compares against the specified ID. |
287 | | * |
288 | | * Returns NULL if no type/ID is found |
289 | | */ |
290 | | sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type, |
291 | | const char *ID_key, const char *ID_value); |
292 | | |
293 | | sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type, |
294 | | const char *key, |
295 | | sam_hrec_tag_t **prev); |
296 | | |
297 | | int sam_hrecs_remove_key(sam_hrecs_t *hrecs, |
298 | | sam_hrec_type_t *type, |
299 | | const char *key); |
300 | | |
301 | | /*! Looks up a read-group by name and returns a pointer to the start of the |
302 | | * associated tag list. |
303 | | * |
304 | | * @return |
305 | | * Returns NULL on failure |
306 | | */ |
307 | | sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg); |
308 | | |
309 | | /*! Returns the sort order from the @HD SO: field */ |
310 | | enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs); |
311 | | |
312 | | /*! Returns the group order from the @HD SO: field */ |
313 | | enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs); |
314 | | |
315 | | #ifdef __cplusplus |
316 | | } |
317 | | #endif |
318 | | |
319 | | #endif /* HEADER_H_ */ |