Line | Count | Source |
1 | | /* sam.c -- SAM and BAM file I/O and manipulation. |
2 | | |
3 | | Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd. |
4 | | Copyright (C) 2010, 2012, 2013 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <strings.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <errno.h> |
34 | | #include <zlib.h> |
35 | | #include <assert.h> |
36 | | #include <signal.h> |
37 | | #include <inttypes.h> |
38 | | #include <unistd.h> |
39 | | #include <regex.h> |
40 | | |
41 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
42 | | #include "fuzz_settings.h" |
43 | | #endif |
44 | | |
45 | | // Suppress deprecation message for cigar_tab, which we initialise |
46 | | #include "htslib/hts_defs.h" |
47 | | #undef HTS_DEPRECATED |
48 | | #define HTS_DEPRECATED(message) |
49 | | |
50 | | #include "htslib/sam.h" |
51 | | #include "htslib/bgzf.h" |
52 | | #include "cram/cram.h" |
53 | | #include "hts_internal.h" |
54 | | #include "sam_internal.h" |
55 | | #include "htslib/hfile.h" |
56 | | #include "htslib/hts_endian.h" |
57 | | #include "htslib/hts_expr.h" |
58 | | #include "header.h" |
59 | | |
60 | | #include "htslib/khash.h" |
61 | | KHASH_DECLARE(s2i, kh_cstr_t, int64_t) |
62 | | KHASH_SET_INIT_INT(tag) |
63 | | |
64 | | #ifndef EFTYPE |
65 | 0 | #define EFTYPE ENOEXEC |
66 | | #endif |
67 | | #ifndef EOVERFLOW |
68 | | #define EOVERFLOW ERANGE |
69 | | #endif |
70 | | |
71 | | /********************** |
72 | | *** BAM header I/O *** |
73 | | **********************/ |
74 | | |
/*
 * Lookup table indexed by an unsigned character code.  Entries for the
 * characters = B D H I M N P S X hold the matching BAM_C* operator constant;
 * every other entry is -1.
 */
HTSLIB_EXPORT
const int8_t bam_cigar_table[256] = {
    // 0 .. 47
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    // 48 .. 63 (including =, code 61)
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1,

    // 64 .. 79 (including B, D, H, I, M, N)
    -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1,
    BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1,

    // 80 .. 95 (including P, S, X)
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1,
    BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1,

    // 96 .. 127 (lower-case letters are not mapped)
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    // 128 .. 255
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
107 | | |
108 | | sam_hdr_t *sam_hdr_init(void) |
109 | 20.5k | { |
110 | 20.5k | sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); |
111 | 20.5k | if (bh == NULL) return NULL; |
112 | | |
113 | 20.5k | bh->cigar_tab = bam_cigar_table; |
114 | 20.5k | return bh; |
115 | 20.5k | } |
116 | | |
117 | | void sam_hdr_destroy(sam_hdr_t *bh) |
118 | 47.5k | { |
119 | 47.5k | int32_t i; |
120 | | |
121 | 47.5k | if (bh == NULL) return; |
122 | | |
123 | 26.5k | if (bh->ref_count > 0) { |
124 | 6.01k | --bh->ref_count; |
125 | 6.01k | return; |
126 | 6.01k | } |
127 | | |
128 | 20.5k | if (bh->target_name) { |
129 | 27.5k | for (i = 0; i < bh->n_targets; ++i) |
130 | 16.0k | free(bh->target_name[i]); |
131 | 11.4k | free(bh->target_name); |
132 | 11.4k | free(bh->target_len); |
133 | 11.4k | } |
134 | 20.5k | free(bh->text); |
135 | 20.5k | if (bh->hrecs) |
136 | 12.5k | sam_hrecs_free(bh->hrecs); |
137 | 20.5k | if (bh->sdict) |
138 | 877 | kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); |
139 | 20.5k | free(bh); |
140 | 20.5k | } |
141 | | |
142 | | // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long |
143 | | // references before sam_hdr_t::hrecs is populated |
144 | | int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) |
145 | 0 | { |
146 | 0 | const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; |
147 | 0 | khash_t(s2i) *dest_long_refs = kh_init(s2i); |
148 | 0 | int i; |
149 | 0 | if (!dest_long_refs) return -1; |
150 | | |
151 | 0 | for (i = 0; i < h->n_targets; i++) { |
152 | 0 | int ret; |
153 | 0 | khiter_t ksrc, kdest; |
154 | 0 | if (h->target_len[i] < UINT32_MAX) continue; |
155 | 0 | ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); |
156 | 0 | if (ksrc == kh_end(src_long_refs)) continue; |
157 | 0 | kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret); |
158 | 0 | if (ret < 0) { |
159 | 0 | kh_destroy(s2i, dest_long_refs); |
160 | 0 | return -1; |
161 | 0 | } |
162 | 0 | kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); |
163 | 0 | } |
164 | | |
165 | 0 | h->sdict = dest_long_refs; |
166 | 0 | return 0; |
167 | 0 | } |
168 | | |
169 | | sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) |
170 | 11.7k | { |
171 | 11.7k | if (h0 == NULL) return NULL; |
172 | 11.7k | sam_hdr_t *h; |
173 | 11.7k | if ((h = sam_hdr_init()) == NULL) return NULL; |
174 | | // copy the simple data |
175 | 11.7k | h->n_targets = 0; |
176 | 11.7k | h->ignore_sam_err = h0->ignore_sam_err; |
177 | 11.7k | h->l_text = 0; |
178 | | |
179 | | // Then the pointery stuff |
180 | | |
181 | 11.7k | if (!h0->hrecs) { |
182 | 0 | h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t)); |
183 | 0 | if (!h->target_len) goto fail; |
184 | 0 | h->target_name = (char**)calloc(h0->n_targets, sizeof(char*)); |
185 | 0 | if (!h->target_name) goto fail; |
186 | | |
187 | 0 | int i; |
188 | 0 | for (i = 0; i < h0->n_targets; ++i) { |
189 | 0 | h->target_len[i] = h0->target_len[i]; |
190 | 0 | h->target_name[i] = strdup(h0->target_name[i]); |
191 | 0 | if (!h->target_name[i]) break; |
192 | 0 | } |
193 | 0 | h->n_targets = i; |
194 | 0 | if (i < h0->n_targets) goto fail; |
195 | | |
196 | 0 | if (h0->sdict) { |
197 | 0 | if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; |
198 | 0 | } |
199 | 0 | } |
200 | | |
201 | 11.7k | if (h0->hrecs) { |
202 | 11.7k | kstring_t tmp = { 0, 0, NULL }; |
203 | 11.7k | if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) { |
204 | 0 | free(ks_release(&tmp)); |
205 | 0 | goto fail; |
206 | 0 | } |
207 | | |
208 | 11.7k | h->l_text = tmp.l; |
209 | 11.7k | h->text = ks_release(&tmp); |
210 | | |
211 | 11.7k | if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0) |
212 | 0 | goto fail; |
213 | 11.7k | } else { |
214 | 0 | h->l_text = h0->text ? h0->l_text : 0; |
215 | 0 | h->text = malloc(h->l_text + 1); |
216 | 0 | if (!h->text) goto fail; |
217 | 0 | if (h0->text) |
218 | 0 | memcpy(h->text, h0->text, h->l_text); |
219 | 0 | h->text[h->l_text] = '\0'; |
220 | 0 | } |
221 | | |
222 | 11.7k | return h; |
223 | | |
224 | 0 | fail: |
225 | 0 | sam_hdr_destroy(h); |
226 | 0 | return NULL; |
227 | 11.7k | } |
228 | | |
/*
 * Read a BAM header from a BGZF stream.
 *
 * Reads, in order: the 4-byte magic "BAM\1", a 32-bit text length, the
 * header text, a 32-bit target count and, for each target, a 32-bit name
 * length, the name bytes and a 32-bit target length.  Multi-byte integers
 * read straight into host variables are byte-swapped when fp->is_be is set.
 *
 * Returns a newly allocated header on success.  On a bad magic number,
 * read error, truncated input, invalid field or allocation failure an error
 * is logged and NULL is returned.
 */
sam_hdr_t *bam_hdr_read(BGZF *fp)
{
    sam_hdr_t *h;
    uint8_t buf[4];
    int magic_len, has_EOF;
    int32_t i, name_len, num_names = 0;  // num_names: target names allocated so far
    size_t bufsize;
    ssize_t bytes;
    // check EOF; a missing marker or a failed check is reported but not fatal
    has_EOF = bgzf_check_EOF(fp);
    if (has_EOF < 0) {
        perror("[W::bam_hdr_read] bgzf_check_EOF");
    } else if (has_EOF == 0) {
        hts_log_warning("EOF marker is absent. The input is probably truncated");
    }
    // read "BAM1"
    magic_len = bgzf_read(fp, buf, 4);
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
        hts_log_error("Invalid BAM binary header");
        return 0;
    }
    h = sam_hdr_init();
    if (!h) goto nomem;

    // read plain text and the number of reference sequences
    bytes = bgzf_read(fp, buf, 4);
    if (bytes != 4) goto read_err;
    h->l_text = le_to_u32(buf);

    bufsize = h->l_text + 1;
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
#endif
    h->text = (char*)malloc(bufsize);
    if (!h->text) goto nomem;
    h->text[h->l_text] = 0; // make sure it is NUL terminated
    bytes = bgzf_read(fp, h->text, h->l_text);
    if (bytes != h->l_text) goto read_err;

    bytes = bgzf_read(fp, &h->n_targets, 4);
    if (bytes != 4) goto read_err;
    if (fp->is_be) ed_swap_4p(&h->n_targets);

    if (h->n_targets < 0) goto invalid;

    // read reference sequence names and lengths
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
        goto nomem;
#endif
    if (h->n_targets > 0) {
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
        if (!h->target_name) goto nomem;
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
        if (!h->target_len) goto nomem;
    }
    else {
        h->target_name = NULL;
        h->target_len = NULL;
    }

    for (i = 0; i != h->n_targets; ++i) {
        bytes = bgzf_read(fp, &name_len, 4);
        if (bytes != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&name_len);
        if (name_len <= 0) goto invalid;

        h->target_name[i] = (char*)malloc(name_len);
        if (!h->target_name[i]) goto nomem;
        num_names++;

        bytes = bgzf_read(fp, h->target_name[i], name_len);
        if (bytes != name_len) goto read_err;

        if (h->target_name[i][name_len - 1] != '\0') {
            /* Fix missing NUL-termination. Is this being too nice?
               We could alternatively bail out with an error. */
            char *new_name;
            // name_len + 1 below must not overflow int32_t
            if (name_len == INT32_MAX) goto invalid;
            new_name = realloc(h->target_name[i], name_len + 1);
            if (new_name == NULL) goto nomem;
            h->target_name[i] = new_name;
            h->target_name[i][name_len] = '\0';
        }

        bytes = bgzf_read(fp, &h->target_len[i], 4);
        if (bytes != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
    }
    return h;

 nomem:
    hts_log_error("Out of memory");
    goto clean;

 read_err:
    // A negative count means the read itself failed; a short count means
    // the stream ended early.
    if (bytes < 0) {
        hts_log_error("Error reading BGZF stream");
    } else {
        hts_log_error("Truncated BAM header");
    }
    goto clean;

 invalid:
    hts_log_error("Invalid BAM binary header");

 clean:
    if (h != NULL) {
        h->n_targets = num_names; // ensure we free only allocated target_names
        sam_hdr_destroy(h);
    }
    return NULL;
}
343 | | |
344 | | int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) |
345 | 2.63k | { |
346 | 2.63k | int32_t i, name_len, x; |
347 | 2.63k | kstring_t hdr_ks = { 0, 0, NULL }; |
348 | 2.63k | char *text; |
349 | 2.63k | uint32_t l_text; |
350 | | |
351 | 2.63k | if (!h) return -1; |
352 | | |
353 | 2.63k | if (h->hrecs) { |
354 | 2.63k | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; |
355 | 2.63k | if (hdr_ks.l > UINT32_MAX) { |
356 | 0 | hts_log_error("Header too long for BAM format"); |
357 | 0 | free(hdr_ks.s); |
358 | 0 | return -1; |
359 | 2.63k | } else if (hdr_ks.l > INT32_MAX) { |
360 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
361 | 0 | hts_log_warning("Output file may not be portable"); |
362 | 0 | } |
363 | 2.63k | text = hdr_ks.s; |
364 | 2.63k | l_text = hdr_ks.l; |
365 | 2.63k | } else { |
366 | 0 | if (h->l_text > UINT32_MAX) { |
367 | 0 | hts_log_error("Header too long for BAM format"); |
368 | 0 | return -1; |
369 | 0 | } else if (h->l_text > INT32_MAX) { |
370 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
371 | 0 | hts_log_warning("Output file may not be portable"); |
372 | 0 | } |
373 | 0 | text = h->text; |
374 | 0 | l_text = h->l_text; |
375 | 0 | } |
376 | | // write "BAM1" |
377 | 2.63k | if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; } |
378 | | // write plain text and the number of reference sequences |
379 | 2.63k | if (fp->is_be) { |
380 | 0 | x = ed_swap_4(l_text); |
381 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
382 | 0 | if (l_text) { |
383 | 0 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
384 | 0 | } |
385 | 0 | x = ed_swap_4(h->n_targets); |
386 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
387 | 2.63k | } else { |
388 | 2.63k | if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; } |
389 | 2.63k | if (l_text) { |
390 | 1.70k | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
391 | 1.70k | } |
392 | 2.63k | if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; } |
393 | 2.63k | } |
394 | 2.63k | free(hdr_ks.s); |
395 | | // write sequence names and lengths |
396 | 4.73k | for (i = 0; i != h->n_targets; ++i) { |
397 | 2.09k | char *p = h->target_name[i]; |
398 | 2.09k | name_len = strlen(p) + 1; |
399 | 2.09k | if (fp->is_be) { |
400 | 0 | x = ed_swap_4(name_len); |
401 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
402 | 2.09k | } else { |
403 | 2.09k | if (bgzf_write(fp, &name_len, 4) < 0) return -1; |
404 | 2.09k | } |
405 | 2.09k | if (bgzf_write(fp, p, name_len) < 0) return -1; |
406 | 2.09k | if (fp->is_be) { |
407 | 0 | x = ed_swap_4(h->target_len[i]); |
408 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
409 | 2.09k | } else { |
410 | 2.09k | if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1; |
411 | 2.09k | } |
412 | 2.09k | } |
413 | 2.63k | if (bgzf_flush(fp) < 0) return -1; |
414 | 2.63k | return 0; |
415 | 2.63k | } |
416 | | |
417 | | const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, |
418 | 0 | hts_pos_t *beg, hts_pos_t *end, int flags) { |
419 | 0 | return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); |
420 | 0 | } |
421 | | |
422 | | /************************* |
423 | | *** BAM alignment I/O *** |
424 | | *************************/ |
425 | | |
426 | | bam1_t *bam_init1(void) |
427 | 1.15M | { |
428 | 1.15M | return (bam1_t*)calloc(1, sizeof(bam1_t)); |
429 | 1.15M | } |
430 | | |
431 | | int sam_realloc_bam_data(bam1_t *b, size_t desired) |
432 | 1.18M | { |
433 | 1.18M | uint32_t new_m_data; |
434 | 1.18M | uint8_t *new_data; |
435 | 1.18M | new_m_data = desired; |
436 | 1.18M | kroundup32(new_m_data); // next power of 2 |
437 | 1.18M | new_m_data += 32; // reduces malloc arena migrations? |
438 | 1.18M | if (new_m_data < desired) { |
439 | 0 | errno = ENOMEM; // Not strictly true but we can't store the size |
440 | 0 | return -1; |
441 | 0 | } |
442 | 1.18M | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
443 | 1.18M | if (new_m_data > FUZZ_ALLOC_LIMIT) { |
444 | 4 | errno = ENOMEM; |
445 | 4 | return -1; |
446 | 4 | } |
447 | 1.18M | #endif |
448 | 1.18M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
449 | 1.18M | new_data = realloc(b->data, new_m_data); |
450 | 1.18M | } else { |
451 | 0 | if ((new_data = malloc(new_m_data)) != NULL) { |
452 | 0 | if (b->l_data > 0) |
453 | 0 | memcpy(new_data, b->data, |
454 | 0 | b->l_data < b->m_data ? b->l_data : b->m_data); |
455 | 0 | bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); |
456 | 0 | } |
457 | 0 | } |
458 | 1.18M | if (!new_data) return -1; |
459 | 1.18M | b->data = new_data; |
460 | 1.18M | b->m_data = new_m_data; |
461 | 1.18M | return 0; |
462 | 1.18M | } |
463 | | |
464 | | void bam_destroy1(bam1_t *b) |
465 | 17.0M | { |
466 | 17.0M | if (b == 0) return; |
467 | 1.15M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
468 | 1.15M | free(b->data); |
469 | 1.15M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) { |
470 | | // In case of reuse |
471 | 0 | b->data = NULL; |
472 | 0 | b->m_data = 0; |
473 | 0 | b->l_data = 0; |
474 | 0 | } |
475 | 1.15M | } |
476 | | |
477 | 1.15M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0) |
478 | 1.15M | free(b); |
479 | 1.15M | } |
480 | | |
481 | | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) |
482 | 5.87M | { |
483 | 5.87M | if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL; |
484 | 5.87M | memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data |
485 | 5.87M | memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest |
486 | 5.87M | bdst->l_data = bsrc->l_data; |
487 | 5.87M | bdst->id = bsrc->id; |
488 | 5.87M | return bdst; |
489 | 5.87M | } |
490 | | |
491 | | bam1_t *bam_dup1(const bam1_t *bsrc) |
492 | 1.14M | { |
493 | 1.14M | if (bsrc == NULL) return NULL; |
494 | 1.14M | bam1_t *bdst = bam_init1(); |
495 | 1.14M | if (bdst == NULL) return NULL; |
496 | 1.14M | if (bam_copy1(bdst, bsrc) == NULL) { |
497 | 0 | bam_destroy1(bdst); |
498 | 0 | return NULL; |
499 | 0 | } |
500 | 1.14M | return bdst; |
501 | 1.14M | } |
502 | | |
503 | | static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, |
504 | | hts_pos_t *rlen, hts_pos_t *qlen) |
505 | 367 | { |
506 | 367 | int k; |
507 | 367 | *rlen = *qlen = 0; |
508 | 15.2k | for (k = 0; k < n_cigar; ++k) { |
509 | 14.9k | int type = bam_cigar_type(bam_cigar_op(cigar[k])); |
510 | 14.9k | int len = bam_cigar_oplen(cigar[k]); |
511 | 14.9k | if (type & 1) *qlen += len; |
512 | 14.9k | if (type & 2) *rlen += len; |
513 | 14.9k | } |
514 | 367 | } |
515 | | |
/*
 * Deduct `length` from the remaining budget `*limit`.
 *
 * Returns 0 and updates *limit when the budget is sufficient; returns -1
 * and leaves *limit unchanged when subtracting would underflow.
 */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
525 | | |
/*
 * Fill in all the core fields and variable-length data of a BAM record.
 *
 * bam      record to populate; its data buffer is grown as required
 * l_qname  length of qname; 0 selects the default name "*"
 * qname    query name (l_qname bytes are copied)
 * flag, tid, pos, mapq, mtid, mpos, isize
 *          stored directly into the corresponding core fields
 * n_cigar  number of CIGAR operations in cigar
 * cigar    CIGAR operations, 4 bytes each
 * l_seq    number of bases in seq (and of entries in qual, when given)
 * seq      bases as characters, packed two per byte via seq_nt16_table
 * qual     per-base qualities, or NULL to fill with 0xff
 * l_aux    extra bytes to reserve in the buffer after the qualities; they
 *          are allocated but not counted in bam->l_data
 *
 * Returns the number of data bytes written (the new bam->l_data) on
 * success.  Returns -1 with errno set to EINVAL when the arguments fail
 * validation, and -1 when the data buffer cannot be reallocated.
 */
int bam_set1(bam1_t *bam,
             size_t l_qname, const char *qname,
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
             size_t n_cigar, const uint32_t *cigar,
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
             size_t l_seq, const char *seq, const char *qual,
             size_t l_aux)
{
    // use a default qname "*" if none is provided
    if (l_qname == 0) {
        l_qname = 1;
        qname = "*";
    }

    // note: the qname is stored nul terminated and padded as described in the
    // documentation for the bam1_t struct.
    // This yields between 1 and 4 NULs, making l_qname + qname_nuls a
    // multiple of 4.
    size_t qname_nuls = 4 - l_qname % 4;

    // the alignment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
    // can't use bam_endpos() directly as some fields not yet set up.
    hts_pos_t rlen = 0, qlen = 0;
    if (!(flag & BAM_FUNMAP)) {
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
    }
    if (rlen == 0) {
        rlen = 1;
    }

    // validate parameters
    if (l_qname > 254) {
        hts_log_error("Query name too long");
        errno = EINVAL;
        return -1;
    }
    if (HTS_POS_MAX - rlen <= pos) {
        hts_log_error("Read ends beyond highest supported position");
        errno = EINVAL;
        return -1;
    }
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
        hts_log_error("Mapped query must have a CIGAR");
        errno = EINVAL;
        return -1;
    }
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
        hts_log_error("CIGAR and query sequence are of different length");
        errno = EINVAL;
        return -1;
    }

    // The combined size of every component (including the reserved aux
    // space) must fit within INT32_MAX; each call deducts one component from
    // the remaining budget and reports -1 if it does not fit.
    size_t limit = INT32_MAX;
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
    u += subtract_check_underflow(n_cigar * 4, &limit);
    u += subtract_check_underflow((l_seq + 1) / 2, &limit);
    u += subtract_check_underflow(l_seq, &limit);
    u += subtract_check_underflow(l_aux, &limit);
    if (u != 0) {
        hts_log_error("Size overflow");
        errno = EINVAL;
        return -1;
    }

    // re-allocate the data buffer as needed.
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
        return -1;
    }

    bam->l_data = (int)data_len;
    bam->core.pos = pos;
    bam->core.tid = tid;
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
    bam->core.qual = mapq;
    // l_extranul excludes the single mandatory terminator
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
    bam->core.flag = flag;
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
    bam->core.n_cigar = (uint32_t)n_cigar;
    bam->core.l_qseq = (int32_t)l_seq;
    bam->core.mtid = mtid;
    bam->core.mpos = mpos;
    bam->core.isize = isize;

    // Query name followed by its NUL padding
    uint8_t *cp = bam->data;
    strncpy((char *)cp, qname, l_qname);
    int i;
    for (i = 0; i < qname_nuls; i++) {
        cp[l_qname + i] = '\0';
    }
    cp += l_qname + qname_nuls;

    // CIGAR operations
    if (n_cigar > 0) {
        memcpy(cp, cigar, n_cigar * 4);
    }
    cp += n_cigar * 4;

    // Sequence, two bases per byte with the first base in the high nibble.
    // NN is the number of bases consumed by each pass of the blocked loop.
#define NN 16
    const uint8_t *useq = (uint8_t *)seq;
    for (i = 0; i + NN < l_seq; i += NN) {
        int j;
        const uint8_t *u2 = useq+i;
        for (j = 0; j < NN/2; j++)
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
        cp += NN/2;
    }
    // Remaining complete pairs of bases
    for (; i + 1 < l_seq; i += 2) {
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
    }

    // A final unpaired base occupies the high nibble; the low nibble is zero
    for (; i < l_seq; i++) {
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
    }

    // Qualities: copied verbatim, or every byte set to 0xff when absent
    if (qual) {
        memcpy(cp, qual, l_seq);
    }
    else {
        memset(cp, '\xff', l_seq);
    }

    return (int)data_len;
}
647 | | |
648 | | hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) |
649 | 5.96M | { |
650 | 5.96M | int k; |
651 | 5.96M | hts_pos_t l; |
652 | 10.5M | for (k = l = 0; k < n_cigar; ++k) |
653 | 4.59M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) |
654 | 4.15M | l += bam_cigar_oplen(cigar[k]); |
655 | 5.96M | return l; |
656 | 5.96M | } |
657 | | |
658 | | hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) |
659 | 237k | { |
660 | 237k | int k; |
661 | 237k | hts_pos_t l; |
662 | 16.1M | for (k = l = 0; k < n_cigar; ++k) |
663 | 15.8M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) |
664 | 14.7M | l += bam_cigar_oplen(cigar[k]); |
665 | 237k | return l; |
666 | 237k | } |
667 | | |
668 | | hts_pos_t bam_endpos(const bam1_t *b) |
669 | 1.41k | { |
670 | 1.41k | hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
671 | 1.41k | if (rlen == 0) rlen = 1; |
672 | 1.41k | return b->core.pos + rlen; |
673 | 1.41k | } |
674 | | |
675 | | static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG |
676 | 343k | { |
677 | 343k | bam1_core_t *c = &b->core; |
678 | | |
679 | | // Bail out as fast as possible for the easy case |
680 | 343k | uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT); |
681 | 343k | if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b)) |
682 | 240k | return 0; |
683 | | |
684 | | // The above isn't fool proof - we may have old CIGAR tags that aren't used, |
685 | | // but this is much less likely so do as a secondary check. |
686 | 103k | if (c->tid < 0 || c->pos < 0) |
687 | 56.0k | return 0; |
688 | | |
689 | | // Do we have a CG tag? |
690 | 47.1k | uint8_t *CG = bam_aux_get(b, "CG"); |
691 | 47.1k | int saved_errno = errno; |
692 | 47.1k | if (!CG) { |
693 | 45.2k | if (errno != ENOENT) return -1; // Bad aux data |
694 | 45.2k | errno = saved_errno; // restore errno on expected no-CG-tag case |
695 | 45.2k | return 0; |
696 | 45.2k | } |
697 | | |
698 | | // Now we start with the serious work migrating CG to CIGAR |
699 | 1.88k | uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, |
700 | 1.88k | *cigar0, CG_len, fake_bytes; |
701 | 1.88k | cigar0 = bam_get_cigar(b); |
702 | 1.88k | fake_bytes = c->n_cigar * 4; |
703 | 1.88k | if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) |
704 | 468 | return 0; // not of type B,I |
705 | 1.41k | CG_len = le_to_u32(CG + 2); |
706 | | // don't move if the real CIGAR length is shorter than the fake cigar length |
707 | 1.41k | if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; |
708 | | |
709 | | // move from the CG tag to the right position |
710 | 1.41k | cigar_st = (uint8_t*)cigar0 - b->data; |
711 | 1.41k | c->n_cigar = CG_len; |
712 | 1.41k | n_cigar4 = c->n_cigar * 4; |
713 | 1.41k | CG_st = CG - b->data - 2; |
714 | 1.41k | CG_en = CG_st + 8 + n_cigar4; |
715 | 1.41k | if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1; |
716 | | // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place |
717 | 1.41k | b->l_data = b->l_data - fake_bytes + n_cigar4; |
718 | | // insert c->n_cigar-fake_bytes empty space to make room |
719 | 1.41k | memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); |
720 | | // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR |
721 | 1.41k | memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); |
722 | 1.41k | if (ori_len > CG_en) // move data after the CG tag |
723 | 290 | memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); |
724 | 1.41k | b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) |
725 | 1.41k | if (recal_bin) |
726 | 1.41k | b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5); |
727 | 1.41k | if (give_warning) |
728 | 1.41k | hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); |
729 | 1.41k | return 1; |
730 | 1.41k | } |
731 | | |
/*
 * Map an aux type code to a size.
 *
 * Returns 1, 2, 4 or 8 for the fixed-width type codes (A/c/C, s/S, i/I/f
 * and d respectively), the type code itself for 'Z', 'H' and 'B', and 0
 * for any other code.
 */
static inline int aux_type2size(uint8_t type)
{
    int size = 0;

    if (type == 'A' || type == 'c' || type == 'C')
        size = 1;
    else if (type == 's' || type == 'S')
        size = 2;
    else if (type == 'i' || type == 'I' || type == 'f')
        size = 4;
    else if (type == 'd')
        size = 8;
    else if (type == 'Z' || type == 'H' || type == 'B')
        size = type;

    return size;
}
749 | | |
750 | | static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) |
751 | 0 | { |
752 | 0 | uint32_t *cigar = (uint32_t*)(data + c->l_qname); |
753 | 0 | uint32_t i; |
754 | 0 | for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); |
755 | 0 | } |
756 | | |
757 | | // Fix bad records where qname is not terminated correctly. |
758 | 197 | static int fixup_missing_qname_nul(bam1_t *b) { |
759 | 197 | bam1_core_t *c = &b->core; |
760 | | |
761 | | // Note this is called before c->l_extranul is added to c->l_qname |
762 | 197 | if (c->l_extranul > 0) { |
763 | 188 | b->data[c->l_qname++] = '\0'; |
764 | 188 | c->l_extranul--; |
765 | 188 | } else { |
766 | 9 | if (b->l_data > INT_MAX - 4) return -1; |
767 | 9 | if (realloc_bam_data(b, b->l_data + 4) < 0) return -1; |
768 | 9 | b->l_data += 4; |
769 | 9 | b->data[c->l_qname++] = '\0'; |
770 | 9 | c->l_extranul = 3; |
771 | 9 | } |
772 | 197 | return 0; |
773 | 197 | } |
774 | | |
/*
 * Read one BAM alignment record from fp into b.
 *
 * Return values, as produced by the code below:
 *   4 + block_len  success (block_len is the length field read from the stream)
 *   -1             nothing could be read where the length field was expected
 *                  (normal end-of-file)
 *   -2             the 4-byte length field was only partially read
 *   -3             the 32-byte fixed-size core could not be read
 *   -4             any other failure: inconsistent lengths, short read,
 *                  realloc_bam_data() failure, bam_tag2cigar() failure, or a
 *                  CIGAR whose query length disagrees with l_qseq
 *
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
 * in multi-threaded handling.  This may be worth considering for htslib2.
 */
int bam_read1(BGZF *fp, bam1_t *b)
{
    bam1_core_t *c = &b->core;
    int32_t block_len, ret, i;
    uint32_t new_l_data;
    uint8_t tmp[32], *x;

    // Mark the record as empty until a complete one has been read.
    b->l_data = 0;

    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
        if (ret == 0) return -1; // normal end-of-file
        else return -2; // truncated
    }
    if (fp->is_be)
        ed_swap_4p(&block_len);
    if (block_len < 32) return -4;  // block_len includes core data
    if (fp->block_length - fp->block_offset > 32) {
        // Avoid bgzf_read and a temporary copy to a local buffer
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
        fp->block_offset += 32;
    } else {
        // Not enough bytes left in the current block; go through bgzf_read.
        x = tmp;
        if (bgzf_read(fp, x, 32) != 32) return -3;
    }

    // Decode the eight 32-bit little-endian words of the fixed-size core.
    c->tid        = le_to_u32(x);
    c->pos        = le_to_i32(x+4);
    uint32_t x2   = le_to_u32(x+8);   // packed: bin<<16 | qual<<8 | l_qname
    c->bin        = x2>>16;
    c->qual       = x2>>8&0xff;
    c->l_qname    = x2&0xff;
    // Number of NULs needed to pad the stored qname to a multiple of 4 bytes.
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
    uint32_t x3   = le_to_u32(x+12);  // packed: flag<<16 | n_cigar
    c->flag       = x3>>16;
    c->n_cigar    = x3&0xffff;
    c->l_qseq     = le_to_u32(x+16);
    c->mtid       = le_to_u32(x+20);
    c->mpos       = le_to_i32(x+24);
    c->isize      = le_to_i32(x+28);

    // In-memory size: on-disk variable-length data plus the qname padding.
    new_l_data = block_len - 32 + c->l_extranul;
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
    // Reject records whose declared field sizes
    // (n_cigar*4 + l_qname + l_extranul + (l_qseq+1)/2 + l_qseq)
    // exceed the data length; 64-bit arithmetic avoids overflow in the sum.
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
        return -4;
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
    b->l_data = new_l_data;

    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
        if (fixup_missing_qname_nul(b) < 0) return -4;
    }
    // Write the padding NULs after the qname, then count them in l_qname.
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
    c->l_qname += c->l_extranul;
    // Read everything that follows the (padded) qname in one go.
    if (b->l_data < c->l_qname ||
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
        return -4;
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    if (bam_tag2cigar(b, 0, 0) < 0)
        return -4;

    // TODO: consider making this conditional
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
        hts_pos_t rlen, qlen;
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
        // Unmapped reads and zero-length alignments use a span of 1.
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
        // Sanity check for broken CIGAR alignments
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
            hts_log_error("CIGAR and query sequence lengths differ for %s",
                    bam_get_qname(b));
            return -4;
        }
    }

    return 4 + block_len;
}
856 | | |
857 | | int bam_write1(BGZF *fp, const bam1_t *b) |
858 | 5.87M | { |
859 | 5.87M | const bam1_core_t *c = &b->core; |
860 | 5.87M | uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; |
861 | 5.87M | int i, ok; |
862 | 5.87M | if (c->l_qname - c->l_extranul > 255) { |
863 | 0 | hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b)); |
864 | 0 | errno = EOVERFLOW; |
865 | 0 | return -1; |
866 | 0 | } |
867 | 5.87M | if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR |
868 | 5.87M | if (c->pos > INT_MAX || |
869 | 5.87M | c->mpos > INT_MAX || |
870 | 5.87M | c->isize < INT_MIN || c->isize > INT_MAX) { |
871 | 67 | hts_log_error("Positional data is too large for BAM format"); |
872 | 67 | return -1; |
873 | 67 | } |
874 | 5.87M | x[0] = c->tid; |
875 | 5.87M | x[1] = c->pos; |
876 | 5.87M | x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); |
877 | 5.87M | if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; |
878 | 5.87M | else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); |
879 | 5.87M | x[4] = c->l_qseq; |
880 | 5.87M | x[5] = c->mtid; |
881 | 5.87M | x[6] = c->mpos; |
882 | 5.87M | x[7] = c->isize; |
883 | 5.87M | ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); |
884 | 5.87M | if (fp->is_be) { |
885 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
886 | 0 | y = block_len; |
887 | 0 | if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0); |
888 | 0 | swap_data(c, b->l_data, b->data, 1); |
889 | 5.87M | } else { |
890 | 5.87M | if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0); |
891 | 5.87M | } |
892 | 5.87M | if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0); |
893 | 5.87M | if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0); |
894 | 5.87M | if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally |
895 | 5.87M | if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); |
896 | 5.87M | } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag |
897 | 28 | uint8_t buf[8]; |
898 | 28 | uint32_t cigar_st, cigar_en, cigar[2]; |
899 | 28 | hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); |
900 | 28 | if (cigreflen >= (1<<28)) { |
901 | | // Length of reference covered is greater than the biggest |
902 | | // CIGAR operation currently allowed. |
903 | 11 | hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos |
904 | 11 | " cannot be written in BAM. Try writing SAM or CRAM instead.\n", |
905 | 11 | bam_get_qname(b), c->n_cigar, cigreflen); |
906 | 11 | return -1; |
907 | 11 | } |
908 | 17 | cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; |
909 | 17 | cigar_en = cigar_st + c->n_cigar * 4; |
910 | 17 | cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; |
911 | 17 | cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; |
912 | 17 | u32_to_le(cigar[0], buf); |
913 | 17 | u32_to_le(cigar[1], buf + 4); |
914 | 17 | if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N |
915 | 17 | if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR |
916 | 17 | if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I |
917 | 17 | u32_to_le(c->n_cigar, buf); |
918 | 17 | if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length |
919 | 17 | if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR |
920 | 17 | } |
921 | 5.87M | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
922 | 5.87M | return ok? 4 + block_len : -1; |
923 | 5.87M | } |
924 | | |
925 | | /* |
926 | | * Write a BAM file and append to the in-memory index simultaneously. |
927 | | */ |
928 | 5.87M | static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { |
929 | 5.87M | BGZF *bfp = fp->fp.bgzf; |
930 | | |
931 | 5.87M | if (!fp->idx) |
932 | 5.87M | return bam_write1(bfp, b); |
933 | | |
934 | 0 | uint32_t block_len = b->l_data - b->core.l_extranul + 32; |
935 | 0 | if (bgzf_flush_try(bfp, 4 + block_len) < 0) |
936 | 0 | return -1; |
937 | 0 | if (!bfp->mt) |
938 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); |
939 | |
|
940 | 0 | int ret = bam_write1(bfp, b); |
941 | 0 | if (ret < 0) |
942 | 0 | return -1; |
943 | | |
944 | 0 | if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) { |
945 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
946 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
947 | 0 | ret = -1; |
948 | 0 | } |
949 | |
|
950 | 0 | return ret; |
951 | 0 | } |
952 | | |
953 | | /* |
954 | | * Set the qname in a BAM record |
955 | | */ |
956 | | int bam_set_qname(bam1_t *rec, const char *qname) |
957 | 0 | { |
958 | 0 | if (!rec) return -1; |
959 | 0 | if (!qname || !*qname) return -1; |
960 | | |
961 | 0 | size_t old_len = rec->core.l_qname; |
962 | 0 | size_t new_len = strlen(qname) + 1; |
963 | 0 | if (new_len < 1 || new_len > 255) return -1; |
964 | | |
965 | 0 | int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0; |
966 | |
|
967 | 0 | size_t new_data_len = rec->l_data - old_len + new_len + extranul; |
968 | 0 | if (realloc_bam_data(rec, new_data_len) < 0) return -1; |
969 | | |
970 | | // Make room |
971 | 0 | if (new_len + extranul != rec->core.l_qname) |
972 | 0 | memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname); |
973 | | // Copy in new name and pad if needed |
974 | 0 | memcpy(rec->data, qname, new_len); |
975 | 0 | int n; |
976 | 0 | for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0'; |
977 | |
|
978 | 0 | rec->l_data = new_data_len; |
979 | 0 | rec->core.l_qname = new_len + extranul; |
980 | 0 | rec->core.l_extranul = extranul; |
981 | |
|
982 | 0 | return 0; |
983 | 0 | } |
984 | | |
985 | | /******************** |
986 | | *** BAM indexing *** |
987 | | ********************/ |
988 | | |
989 | | static hts_idx_t *sam_index(htsFile *fp, int min_shift) |
990 | 0 | { |
991 | 0 | int n_lvls, i, fmt, ret; |
992 | 0 | bam1_t *b; |
993 | 0 | hts_idx_t *idx; |
994 | 0 | sam_hdr_t *h; |
995 | 0 | h = sam_hdr_read(fp); |
996 | 0 | if (h == NULL) return NULL; |
997 | 0 | if (min_shift > 0) { |
998 | 0 | hts_pos_t max_len = 0; |
999 | 0 | for (i = 0; i < h->n_targets; ++i) { |
1000 | 0 | hts_pos_t len = sam_hdr_tid2len(h, i); |
1001 | 0 | if (max_len < len) max_len = len; |
1002 | 0 | } |
1003 | 0 | n_lvls = 0; |
1004 | 0 | hts_adjust_csi_settings(max_len, &min_shift, &n_lvls); |
1005 | 0 | fmt = HTS_FMT_CSI; |
1006 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1007 | 0 | idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1008 | 0 | b = bam_init1(); |
1009 | 0 | while ((ret = sam_read1(fp, h, b)) >= 0) { |
1010 | 0 | ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)); |
1011 | 0 | if (ret < 0) { // unsorted or doesn't fit |
1012 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
1013 | 0 | goto err; |
1014 | 0 | } |
1015 | 0 | } |
1016 | 0 | if (ret < -1) goto err; // corrupted BAM file |
1017 | | |
1018 | 0 | hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); |
1019 | 0 | sam_hdr_destroy(h); |
1020 | 0 | bam_destroy1(b); |
1021 | 0 | return idx; |
1022 | | |
1023 | 0 | err: |
1024 | 0 | bam_destroy1(b); |
1025 | 0 | hts_idx_destroy(idx); |
1026 | 0 | return NULL; |
1027 | 0 | } |
1028 | | |
1029 | | int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) |
1030 | 0 | { |
1031 | 0 | hts_idx_t *idx; |
1032 | 0 | htsFile *fp; |
1033 | 0 | int ret = 0; |
1034 | |
|
1035 | 0 | if ((fp = hts_open(fn, "r")) == 0) return -2; |
1036 | 0 | if (nthreads) |
1037 | 0 | hts_set_threads(fp, nthreads); |
1038 | |
|
1039 | 0 | switch (fp->format.format) { |
1040 | 0 | case cram: |
1041 | |
|
1042 | 0 | ret = cram_index_build(fp->fp.cram, fn, fnidx); |
1043 | 0 | break; |
1044 | | |
1045 | 0 | case bam: |
1046 | 0 | case sam: |
1047 | 0 | if (fp->format.compression != bgzf) { |
1048 | 0 | hts_log_error("%s file \"%s\" not BGZF compressed", |
1049 | 0 | fp->format.format == bam ? "BAM" : "SAM", fn); |
1050 | 0 | ret = -1; |
1051 | 0 | break; |
1052 | 0 | } |
1053 | 0 | idx = sam_index(fp, min_shift); |
1054 | 0 | if (idx) { |
1055 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); |
1056 | 0 | if (ret < 0) ret = -4; |
1057 | 0 | hts_idx_destroy(idx); |
1058 | 0 | } |
1059 | 0 | else ret = -1; |
1060 | 0 | break; |
1061 | | |
1062 | 0 | default: |
1063 | 0 | ret = -3; |
1064 | 0 | break; |
1065 | 0 | } |
1066 | 0 | hts_close(fp); |
1067 | |
|
1068 | 0 | return ret; |
1069 | 0 | } |
1070 | | |
/* Index fn into fnidx using sam_index_build3() with no extra threads. */
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int no_threads = 0;
    return sam_index_build3(fn, fnidx, min_shift, no_threads);
}
1075 | | |
/* Index fn via sam_index_build3(), passing NULL as the index name and no extra threads. */
int sam_index_build(const char *fn, int min_shift)
{
    const char *default_fnidx = NULL;
    const int no_threads = 0;
    return sam_index_build3(fn, default_fnidx, min_shift, no_threads);
}
1080 | | |
// Real bam_index_build() symbol, kept so that binaries linked against
// earlier HTSlib releases continue to resolve it.  The #undef removes any
// macro of the same name so that a function can be defined here.
#undef bam_index_build
int bam_index_build(const char *fn, int min_shift)
{
    const char *default_fnidx = NULL;
    return sam_index_build2(fn, default_fnidx, min_shift);
}
1087 | | |
1088 | | // Initialise fp->idx for the current format type. |
1089 | | // This must be called after the header has been written but no other data. |
1090 | 0 | int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) { |
1091 | 0 | fp->fnidx = fnidx; |
1092 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1093 | 0 | (fp->format.format == sam && fp->format.compression == bgzf)) { |
1094 | 0 | int n_lvls, fmt = HTS_FMT_CSI; |
1095 | 0 | if (min_shift > 0) { |
1096 | 0 | int64_t max_len = 0; |
1097 | 0 | int i; |
1098 | 0 | for (i = 0; i < h->n_targets; ++i) |
1099 | 0 | if (max_len < h->target_len[i]) max_len = h->target_len[i]; |
1100 | 0 | n_lvls = 0; |
1101 | 0 | hts_adjust_csi_settings(max_len, &min_shift, &n_lvls); |
1102 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1103 | |
|
1104 | 0 | fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1105 | 0 | return fp->idx ? 0 : -1; |
1106 | 0 | } |
1107 | | |
1108 | 0 | if (fp->format.format == cram) { |
1109 | 0 | fp->fp.cram->idxfp = bgzf_open(fnidx, "wg"); |
1110 | 0 | return fp->fp.cram->idxfp ? 0 : -1; |
1111 | 0 | } |
1112 | | |
1113 | 0 | return -1; |
1114 | 0 | } |
1115 | | |
1116 | | // Finishes an index. Call after the last record has been written. |
1117 | | // Returns 0 on success, <0 on failure. |
1118 | 0 | int sam_idx_save(htsFile *fp) { |
1119 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1120 | 0 | fp->format.format == vcf || fp->format.format == sam) { |
1121 | 0 | int ret; |
1122 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
1123 | 0 | errno = -ret; |
1124 | 0 | return -1; |
1125 | 0 | } |
1126 | 0 | if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) |
1127 | 0 | return -1; |
1128 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); |
1129 | |
|
1130 | 0 | if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0) |
1131 | 0 | return -1; |
1132 | | |
1133 | 0 | return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx)); |
1134 | |
|
1135 | 0 | } else if (fp->format.format == cram) { |
1136 | | // flushed and closed by cram_close |
1137 | 0 | } |
1138 | | |
1139 | 0 | return 0; |
1140 | 0 | } |
1141 | | |
1142 | | static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1143 | 0 | { |
1144 | 0 | htsFile *fp = (htsFile *)fpv; |
1145 | 0 | bam1_t *b = bv; |
1146 | 0 | fp->line.l = 0; |
1147 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1148 | 0 | if (ret >= 0) { |
1149 | 0 | *tid = b->core.tid; |
1150 | 0 | *beg = b->core.pos; |
1151 | 0 | *end = bam_endpos(b); |
1152 | 0 | } |
1153 | 0 | return ret; |
1154 | 0 | } |
1155 | | |
1156 | | // This is used only with read_rest=1 iterators, so need not set tid/beg/end. |
1157 | | static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1158 | 0 | { |
1159 | 0 | htsFile *fp = (htsFile *)fpv; |
1160 | 0 | bam1_t *b = bv; |
1161 | 0 | fp->line.l = 0; |
1162 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1163 | 0 | return ret; |
1164 | 0 | } |
1165 | | |
1166 | | // Internal (for now) func used by bam_sym_lookup. This is copied from |
1167 | | // samtools/bam.c. |
1168 | | static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) |
1169 | 0 | { |
1170 | 0 | const char *rg; |
1171 | 0 | kstring_t lib = { 0, 0, NULL }; |
1172 | 0 | rg = (char *)bam_aux_get(b, "RG"); |
1173 | |
|
1174 | 0 | if (!rg) |
1175 | 0 | return NULL; |
1176 | 0 | else |
1177 | 0 | rg++; |
1178 | | |
1179 | 0 | if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) |
1180 | 0 | return NULL; |
1181 | | |
1182 | 0 | static char LB_text[1024]; |
1183 | 0 | int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; |
1184 | |
|
1185 | 0 | memcpy(LB_text, lib.s, len); |
1186 | 0 | LB_text[len] = 0; |
1187 | |
|
1188 | 0 | free(lib.s); |
1189 | |
|
1190 | 0 | return LB_text; |
1191 | 0 | } |
1192 | | |
1193 | | |
// Bam record pointer and SAM header combined.
// Passed as the opaque data argument to bam_sym_lookup().  Instances are
// built with a positional initialiser ({h, b}), so keep the field order.
typedef struct {
    const sam_hdr_t *h;  // header, used to resolve reference and library names
    const bam1_t *b;     // record being evaluated
} hb_pair;
1199 | | |
1200 | | // Looks up variable names in str and replaces them with their value. |
1201 | | // Also supports aux tags. |
1202 | | // |
1203 | | // Note the expression parser deliberately overallocates str size so it |
1204 | | // is safe to use memcmp over strcmp. |
1205 | | static int bam_sym_lookup(void *data, char *str, char **end, |
1206 | 0 | hts_expr_val_t *res) { |
1207 | 0 | hb_pair *hb = (hb_pair *)data; |
1208 | 0 | const bam1_t *b = hb->b; |
1209 | |
|
1210 | 0 | res->is_str = 0; |
1211 | 0 | switch(*str) { |
1212 | 0 | case 'c': |
1213 | 0 | if (memcmp(str, "cigar", 5) == 0) { |
1214 | 0 | *end = str+5; |
1215 | 0 | res->is_str = 1; |
1216 | 0 | ks_clear(&res->s); |
1217 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1218 | 0 | int i, n = b->core.n_cigar, r = 0; |
1219 | 0 | if (n) { |
1220 | 0 | for (i = 0; i < n; i++) { |
1221 | 0 | r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; |
1222 | 0 | r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; |
1223 | 0 | } |
1224 | 0 | r |= kputs("", &res->s) < 0; |
1225 | 0 | } else { |
1226 | 0 | r |= kputs("*", &res->s) < 0; |
1227 | 0 | } |
1228 | 0 | return r ? -1 : 0; |
1229 | 0 | } |
1230 | 0 | break; |
1231 | | |
1232 | 0 | case 'e': |
1233 | 0 | if (memcmp(str, "endpos", 6) == 0) { |
1234 | 0 | *end = str+6; |
1235 | 0 | res->d = bam_endpos(b); |
1236 | 0 | return 0; |
1237 | 0 | } |
1238 | 0 | break; |
1239 | | |
1240 | 0 | case 'f': |
1241 | 0 | if (memcmp(str, "flag", 4) == 0) { |
1242 | 0 | str = *end = str+4; |
1243 | 0 | if (*str != '.') { |
1244 | 0 | res->d = b->core.flag; |
1245 | 0 | return 0; |
1246 | 0 | } else { |
1247 | 0 | str++; |
1248 | 0 | if (!memcmp(str, "paired", 6)) { |
1249 | 0 | *end = str+6; |
1250 | 0 | res->d = b->core.flag & BAM_FPAIRED; |
1251 | 0 | return 0; |
1252 | 0 | } else if (!memcmp(str, "proper_pair", 11)) { |
1253 | 0 | *end = str+11; |
1254 | 0 | res->d = b->core.flag & BAM_FPROPER_PAIR; |
1255 | 0 | return 0; |
1256 | 0 | } else if (!memcmp(str, "unmap", 5)) { |
1257 | 0 | *end = str+5; |
1258 | 0 | res->d = b->core.flag & BAM_FUNMAP; |
1259 | 0 | return 0; |
1260 | 0 | } else if (!memcmp(str, "munmap", 6)) { |
1261 | 0 | *end = str+6; |
1262 | 0 | res->d = b->core.flag & BAM_FMUNMAP; |
1263 | 0 | return 0; |
1264 | 0 | } else if (!memcmp(str, "reverse", 7)) { |
1265 | 0 | *end = str+7; |
1266 | 0 | res->d = b->core.flag & BAM_FREVERSE; |
1267 | 0 | return 0; |
1268 | 0 | } else if (!memcmp(str, "mreverse", 8)) { |
1269 | 0 | *end = str+8; |
1270 | 0 | res->d = b->core.flag & BAM_FMREVERSE; |
1271 | 0 | return 0; |
1272 | 0 | } else if (!memcmp(str, "read1", 5)) { |
1273 | 0 | *end = str+5; |
1274 | 0 | res->d = b->core.flag & BAM_FREAD1; |
1275 | 0 | return 0; |
1276 | 0 | } else if (!memcmp(str, "read2", 5)) { |
1277 | 0 | *end = str+5; |
1278 | 0 | res->d = b->core.flag & BAM_FREAD2; |
1279 | 0 | return 0; |
1280 | 0 | } else if (!memcmp(str, "secondary", 9)) { |
1281 | 0 | *end = str+9; |
1282 | 0 | res->d = b->core.flag & BAM_FSECONDARY; |
1283 | 0 | return 0; |
1284 | 0 | } else if (!memcmp(str, "qcfail", 6)) { |
1285 | 0 | *end = str+6; |
1286 | 0 | res->d = b->core.flag & BAM_FQCFAIL; |
1287 | 0 | return 0; |
1288 | 0 | } else if (!memcmp(str, "dup", 3)) { |
1289 | 0 | *end = str+3; |
1290 | 0 | res->d = b->core.flag & BAM_FDUP; |
1291 | 0 | return 0; |
1292 | 0 | } else if (!memcmp(str, "supplementary", 13)) { |
1293 | 0 | *end = str+13; |
1294 | 0 | res->d = b->core.flag & BAM_FSUPPLEMENTARY; |
1295 | 0 | return 0; |
1296 | 0 | } else { |
1297 | 0 | hts_log_error("Unrecognised flag string"); |
1298 | 0 | return -1; |
1299 | 0 | } |
1300 | 0 | } |
1301 | 0 | } |
1302 | 0 | break; |
1303 | | |
1304 | 0 | case 'h': |
1305 | 0 | if (memcmp(str, "hclen", 5) == 0) { |
1306 | 0 | int hclen = 0; |
1307 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1308 | 0 | uint32_t ncigar = b->core.n_cigar; |
1309 | | |
1310 | | // left |
1311 | 0 | if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) |
1312 | 0 | hclen = bam_cigar_oplen(cigar[0]); |
1313 | | |
1314 | | // right |
1315 | 0 | if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP) |
1316 | 0 | hclen += bam_cigar_oplen(cigar[ncigar-1]); |
1317 | |
|
1318 | 0 | *end = str+5; |
1319 | 0 | res->d = hclen; |
1320 | 0 | return 0; |
1321 | 0 | } |
1322 | 0 | break; |
1323 | | |
1324 | 0 | case 'l': |
1325 | 0 | if (memcmp(str, "library", 7) == 0) { |
1326 | 0 | *end = str+7; |
1327 | 0 | res->is_str = 1; |
1328 | 0 | const char *lib = bam_get_library(hb->h, b); |
1329 | 0 | kputs(lib ? lib : "", ks_clear(&res->s)); |
1330 | 0 | return 0; |
1331 | 0 | } |
1332 | 0 | break; |
1333 | | |
1334 | 0 | case 'm': |
1335 | 0 | if (memcmp(str, "mapq", 4) == 0) { |
1336 | 0 | *end = str+4; |
1337 | 0 | res->d = b->core.qual; |
1338 | 0 | return 0; |
1339 | 0 | } else if (memcmp(str, "mpos", 4) == 0) { |
1340 | 0 | *end = str+4; |
1341 | 0 | res->d = b->core.mpos+1; |
1342 | 0 | return 0; |
1343 | 0 | } else if (memcmp(str, "mrname", 6) == 0) { |
1344 | 0 | *end = str+6; |
1345 | 0 | res->is_str = 1; |
1346 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1347 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1348 | 0 | return 0; |
1349 | 0 | } else if (memcmp(str, "mrefid", 6) == 0) { |
1350 | 0 | *end = str+6; |
1351 | 0 | res->d = b->core.mtid; |
1352 | 0 | return 0; |
1353 | 0 | } |
1354 | 0 | break; |
1355 | | |
1356 | 0 | case 'n': |
1357 | 0 | if (memcmp(str, "ncigar", 6) == 0) { |
1358 | 0 | *end = str+6; |
1359 | 0 | res->d = b->core.n_cigar; |
1360 | 0 | return 0; |
1361 | 0 | } |
1362 | 0 | break; |
1363 | | |
1364 | 0 | case 'p': |
1365 | 0 | if (memcmp(str, "pos", 3) == 0) { |
1366 | 0 | *end = str+3; |
1367 | 0 | res->d = b->core.pos+1; |
1368 | 0 | return 0; |
1369 | 0 | } else if (memcmp(str, "pnext", 5) == 0) { |
1370 | 0 | *end = str+5; |
1371 | 0 | res->d = b->core.mpos+1; |
1372 | 0 | return 0; |
1373 | 0 | } |
1374 | 0 | break; |
1375 | | |
1376 | 0 | case 'q': |
1377 | 0 | if (memcmp(str, "qlen", 4) == 0) { |
1378 | 0 | *end = str+4; |
1379 | 0 | res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); |
1380 | 0 | return 0; |
1381 | 0 | } else if (memcmp(str, "qname", 5) == 0) { |
1382 | 0 | *end = str+5; |
1383 | 0 | res->is_str = 1; |
1384 | 0 | kputs(bam_get_qname(b), ks_clear(&res->s)); |
1385 | 0 | return 0; |
1386 | 0 | } else if (memcmp(str, "qual", 4) == 0) { |
1387 | 0 | *end = str+4; |
1388 | 0 | ks_clear(&res->s); |
1389 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1390 | 0 | return -1; |
1391 | 0 | memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); |
1392 | 0 | res->s.l = b->core.l_qseq; |
1393 | 0 | res->is_str = 1; |
1394 | 0 | return 0; |
1395 | 0 | } |
1396 | 0 | break; |
1397 | | |
1398 | 0 | case 'r': |
1399 | 0 | if (memcmp(str, "rlen", 4) == 0) { |
1400 | 0 | *end = str+4; |
1401 | 0 | res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
1402 | 0 | return 0; |
1403 | 0 | } else if (memcmp(str, "rname", 5) == 0) { |
1404 | 0 | *end = str+5; |
1405 | 0 | res->is_str = 1; |
1406 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); |
1407 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1408 | 0 | return 0; |
1409 | 0 | } else if (memcmp(str, "rnext", 5) == 0) { |
1410 | 0 | *end = str+5; |
1411 | 0 | res->is_str = 1; |
1412 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1413 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1414 | 0 | return 0; |
1415 | 0 | } else if (memcmp(str, "refid", 5) == 0) { |
1416 | 0 | *end = str+5; |
1417 | 0 | res->d = b->core.tid; |
1418 | 0 | return 0; |
1419 | 0 | } |
1420 | 0 | break; |
1421 | | |
1422 | 0 | case 's': |
1423 | 0 | if (memcmp(str, "seq", 3) == 0) { |
1424 | 0 | *end = str+3; |
1425 | 0 | ks_clear(&res->s); |
1426 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1427 | 0 | return -1; |
1428 | 0 | nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); |
1429 | 0 | res->s.s[b->core.l_qseq] = 0; |
1430 | 0 | res->s.l = b->core.l_qseq; |
1431 | 0 | res->is_str = 1; |
1432 | 0 | return 0; |
1433 | 0 | } else if (memcmp(str, "sclen", 5) == 0) { |
1434 | 0 | int sclen = 0; |
1435 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1436 | 0 | int ncigar = b->core.n_cigar; |
1437 | 0 | int left = 0; |
1438 | | |
1439 | | // left |
1440 | 0 | if (ncigar > 0 |
1441 | 0 | && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) |
1442 | 0 | left = 0, sclen += bam_cigar_oplen(cigar[0]); |
1443 | 0 | else if (ncigar > 1 |
1444 | 0 | && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP |
1445 | 0 | && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) |
1446 | 0 | left = 1, sclen += bam_cigar_oplen(cigar[1]); |
1447 | | |
1448 | | // right |
1449 | 0 | if (ncigar-1 > left |
1450 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) |
1451 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-1]); |
1452 | 0 | else if (ncigar-2 > left |
1453 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP |
1454 | 0 | && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) |
1455 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-2]); |
1456 | |
|
1457 | 0 | *end = str+5; |
1458 | 0 | res->d = sclen; |
1459 | 0 | return 0; |
1460 | 0 | } |
1461 | 0 | break; |
1462 | | |
1463 | 0 | case 't': |
1464 | 0 | if (memcmp(str, "tlen", 4) == 0) { |
1465 | 0 | *end = str+4; |
1466 | 0 | res->d = b->core.isize; |
1467 | 0 | return 0; |
1468 | 0 | } |
1469 | 0 | break; |
1470 | | |
1471 | 0 | case '[': |
1472 | 0 | if (*str == '[' && str[1] && str[2] && str[3] == ']') { |
1473 | | /* aux tags */ |
1474 | 0 | *end = str+4; |
1475 | |
|
1476 | 0 | uint8_t *aux = bam_aux_get(b, str+1); |
1477 | 0 | if (aux) { |
1478 | | // we define the truth of a tag to be its presence, even if 0. |
1479 | 0 | res->is_true = 1; |
1480 | 0 | switch (*aux) { |
1481 | 0 | case 'Z': |
1482 | 0 | case 'H': |
1483 | 0 | res->is_str = 1; |
1484 | 0 | kputs((char *)aux+1, ks_clear(&res->s)); |
1485 | 0 | break; |
1486 | | |
1487 | 0 | case 'A': |
1488 | 0 | res->is_str = 1; |
1489 | 0 | kputsn((char *)aux+1, 1, ks_clear(&res->s)); |
1490 | 0 | break; |
1491 | | |
1492 | 0 | case 'i': case 'I': |
1493 | 0 | case 's': case 'S': |
1494 | 0 | case 'c': case 'C': |
1495 | 0 | res->is_str = 0; |
1496 | 0 | res->d = bam_aux2i(aux); |
1497 | 0 | break; |
1498 | | |
1499 | 0 | case 'f': |
1500 | 0 | case 'd': |
1501 | 0 | res->is_str = 0; |
1502 | 0 | res->d = bam_aux2f(aux); |
1503 | 0 | break; |
1504 | | |
1505 | 0 | default: |
1506 | 0 | hts_log_error("Aux type '%c not yet supported by filters", |
1507 | 0 | *aux); |
1508 | 0 | return -1; |
1509 | 0 | } |
1510 | 0 | return 0; |
1511 | |
|
1512 | 0 | } else { |
1513 | | // hence absent tags are always false (and strings) |
1514 | 0 | res->is_str = 1; |
1515 | 0 | res->s.l = 0; |
1516 | 0 | res->d = 0; |
1517 | 0 | res->is_true = 0; |
1518 | 0 | return 0; |
1519 | 0 | } |
1520 | 0 | } |
1521 | 0 | break; |
1522 | 0 | } |
1523 | | |
1524 | | // All successful matches in switch should return 0. |
1525 | | // So if we didn't match, it's a parse error. |
1526 | 0 | return -1; |
1527 | 0 | } |
1528 | | |
1529 | | // Returns 1 when accepted by the filter, 0 if not, -1 on error. |
1530 | | int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) |
1531 | 0 | { |
1532 | 0 | hb_pair hb = {h, b}; |
1533 | 0 | hts_expr_val_t res = HTS_EXPR_VAL_INIT; |
1534 | 0 | if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { |
1535 | 0 | hts_log_error("Couldn't process filter expression"); |
1536 | 0 | hts_expr_val_free(&res); |
1537 | 0 | return -1; |
1538 | 0 | } |
1539 | | |
1540 | 0 | int t = res.is_true; |
1541 | 0 | hts_expr_val_free(&res); |
1542 | |
|
1543 | 0 | return t; |
1544 | 0 | } |
1545 | | |
1546 | | static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1547 | 0 | { |
1548 | 0 | htsFile *fp = fpv; |
1549 | 0 | bam1_t *b = bv; |
1550 | 0 | int pass_filter, ret; |
1551 | |
|
1552 | 0 | do { |
1553 | 0 | ret = cram_get_bam_seq(fp->fp.cram, &b); |
1554 | 0 | if (ret < 0) |
1555 | 0 | return cram_eof(fp->fp.cram) ? -1 : -2; |
1556 | | |
1557 | 0 | if (bam_tag2cigar(b, 1, 1) < 0) |
1558 | 0 | return -2; |
1559 | | |
1560 | 0 | *tid = b->core.tid; |
1561 | 0 | *beg = b->core.pos; |
1562 | 0 | *end = bam_endpos(b); |
1563 | |
|
1564 | 0 | if (fp->filter) { |
1565 | 0 | pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); |
1566 | 0 | if (pass_filter < 0) |
1567 | 0 | return -2; |
1568 | 0 | } else { |
1569 | 0 | pass_filter = 1; |
1570 | 0 | } |
1571 | 0 | } while (pass_filter == 0); |
1572 | | |
1573 | 0 | return ret; |
1574 | 0 | } |
1575 | | |
1576 | | static int cram_pseek(void *fp, int64_t offset, int whence) |
1577 | 0 | { |
1578 | 0 | cram_fd *fd = (cram_fd *)fp; |
1579 | |
|
1580 | 0 | if ((0 != cram_seek(fd, offset, SEEK_SET)) |
1581 | 0 | && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) |
1582 | 0 | return -1; |
1583 | | |
1584 | 0 | fd->curr_position = offset; |
1585 | |
|
1586 | 0 | if (fd->ctr) { |
1587 | 0 | cram_free_container(fd->ctr); |
1588 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
1589 | 0 | cram_free_container(fd->ctr_mt); |
1590 | |
|
1591 | 0 | fd->ctr = NULL; |
1592 | 0 | fd->ctr_mt = NULL; |
1593 | 0 | fd->ooc = 0; |
1594 | 0 | } |
1595 | |
|
1596 | 0 | return 0; |
1597 | 0 | } |
1598 | | |
1599 | | /* |
1600 | | * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only |
1601 | | * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered |
1602 | | * container previously fetched. It was designed like this to integrate with the functionality |
1603 | | * of the iterator stepping logic. |
1604 | | */ |
1605 | | |
1606 | | static int64_t cram_ptell(void *fp) |
1607 | 0 | { |
1608 | 0 | cram_fd *fd = (cram_fd *)fp; |
1609 | 0 | cram_container *c; |
1610 | 0 | cram_slice *s; |
1611 | 0 | int64_t ret = -1L; |
1612 | |
|
1613 | 0 | if (fd) { |
1614 | 0 | if ((c = fd->ctr) != NULL) { |
1615 | 0 | if ((s = c->slice) != NULL && s->max_rec) { |
1616 | 0 | if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1)) |
1617 | 0 | fd->curr_position += c->offset + c->length; |
1618 | 0 | } |
1619 | 0 | } |
1620 | 0 | ret = fd->curr_position; |
1621 | 0 | } |
1622 | |
|
1623 | 0 | return ret; |
1624 | 0 | } |
1625 | | |
1626 | | static int bam_pseek(void *fp, int64_t offset, int whence) |
1627 | 0 | { |
1628 | 0 | BGZF *fd = (BGZF *)fp; |
1629 | |
|
1630 | 0 | return bgzf_seek(fd, offset, whence); |
1631 | 0 | } |
1632 | | |
1633 | | static int64_t bam_ptell(void *fp) |
1634 | 0 | { |
1635 | 0 | BGZF *fd = (BGZF *)fp; |
1636 | 0 | if (!fd) |
1637 | 0 | return -1L; |
1638 | | |
1639 | 0 | return bgzf_tell(fd); |
1640 | 0 | } |
1641 | | |
1642 | | |
1643 | | |
1644 | | static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1645 | 0 | { |
1646 | 0 | switch (fp->format.format) { |
1647 | 0 | case bam: |
1648 | 0 | case sam: |
1649 | 0 | return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags); |
1650 | | |
1651 | 0 | case cram: { |
1652 | 0 | if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL; |
1653 | | |
1654 | | // Cons up a fake "index" just pointing at the associated cram_fd: |
1655 | 0 | hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); |
1656 | 0 | if (idx == NULL) return NULL; |
1657 | 0 | idx->fmt = HTS_FMT_CRAI; |
1658 | 0 | idx->cram = fp->fp.cram; |
1659 | 0 | return (hts_idx_t *) idx; |
1660 | 0 | } |
1661 | | |
1662 | 0 | default: |
1663 | 0 | return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t |
1664 | 0 | } |
1665 | 0 | } |
1666 | | |
1667 | | hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1668 | 0 | { |
1669 | 0 | return index_load(fp, fn, fnidx, flags); |
1670 | 0 | } |
1671 | | |
1672 | 0 | hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) { |
1673 | 0 | return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); |
1674 | 0 | } |
1675 | | |
1676 | | hts_idx_t *sam_index_load(htsFile *fp, const char *fn) |
1677 | 0 | { |
1678 | 0 | return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); |
1679 | 0 | } |
1680 | | |
1681 | | static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) |
1682 | 0 | { |
1683 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1684 | 0 | hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); |
1685 | 0 | if (iter == NULL) return NULL; |
1686 | | |
1687 | | // Cons up a dummy iterator for which hts_itr_next() will simply invoke |
1688 | | // the readrec function: |
1689 | 0 | iter->is_cram = 1; |
1690 | 0 | iter->read_rest = 1; |
1691 | 0 | iter->off = NULL; |
1692 | 0 | iter->bins.a = NULL; |
1693 | 0 | iter->readrec = readrec; |
1694 | |
|
1695 | 0 | if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) { |
1696 | 0 | cram_range r = { tid, beg+1, end }; |
1697 | 0 | int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r); |
1698 | |
|
1699 | 0 | iter->curr_off = 0; |
1700 | | // The following fields are not required by hts_itr_next(), but are |
1701 | | // filled in in case user code wants to look at them. |
1702 | 0 | iter->tid = tid; |
1703 | 0 | iter->beg = beg; |
1704 | 0 | iter->end = end; |
1705 | |
|
1706 | 0 | switch (ret) { |
1707 | 0 | case 0: |
1708 | 0 | break; |
1709 | | |
1710 | 0 | case -2: |
1711 | | // No data vs this ref, so mark iterator as completed. |
1712 | | // Same as HTS_IDX_NONE. |
1713 | 0 | iter->finished = 1; |
1714 | 0 | break; |
1715 | | |
1716 | 0 | default: |
1717 | 0 | free(iter); |
1718 | 0 | return NULL; |
1719 | 0 | } |
1720 | 0 | } |
1721 | 0 | else switch (tid) { |
1722 | 0 | case HTS_IDX_REST: |
1723 | 0 | iter->curr_off = 0; |
1724 | 0 | break; |
1725 | 0 | case HTS_IDX_NONE: |
1726 | 0 | iter->curr_off = 0; |
1727 | 0 | iter->finished = 1; |
1728 | 0 | break; |
1729 | 0 | default: |
1730 | 0 | hts_log_error("Query with tid=%d not implemented for CRAM files", tid); |
1731 | 0 | abort(); |
1732 | 0 | break; |
1733 | 0 | } |
1734 | | |
1735 | 0 | return iter; |
1736 | 0 | } |
1737 | | |
1738 | | hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) |
1739 | 0 | { |
1740 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1741 | 0 | if (idx == NULL) |
1742 | 0 | return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest); |
1743 | 0 | else if (cidx->fmt == HTS_FMT_CRAI) |
1744 | 0 | return cram_itr_query(idx, tid, beg, end, sam_readrec); |
1745 | 0 | else |
1746 | 0 | return hts_itr_query(idx, tid, beg, end, sam_readrec); |
1747 | 0 | } |
1748 | | |
1749 | | static int cram_name2id(void *fdv, const char *ref) |
1750 | 0 | { |
1751 | 0 | cram_fd *fd = (cram_fd *) fdv; |
1752 | 0 | return sam_hdr_name2tid(fd->header, ref); |
1753 | 0 | } |
1754 | | |
1755 | | hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) |
1756 | 0 | { |
1757 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1758 | 0 | return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, |
1759 | 0 | cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query, |
1760 | 0 | sam_readrec); |
1761 | 0 | } |
1762 | | |
1763 | | hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) |
1764 | 0 | { |
1765 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1766 | 0 | hts_reglist_t *r_list = NULL; |
1767 | 0 | int r_count = 0; |
1768 | |
|
1769 | 0 | if (!cidx || !hdr) |
1770 | 0 | return NULL; |
1771 | | |
1772 | 0 | hts_itr_t *itr = NULL; |
1773 | 0 | if (cidx->fmt == HTS_FMT_CRAI) { |
1774 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id); |
1775 | 0 | if (!r_list) |
1776 | 0 | return NULL; |
1777 | 0 | itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram, |
1778 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1779 | 0 | } else { |
1780 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id)); |
1781 | 0 | if (!r_list) |
1782 | 0 | return NULL; |
1783 | 0 | itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr, |
1784 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1785 | 0 | } |
1786 | | |
1787 | 0 | if (!itr) |
1788 | 0 | hts_reglist_free(r_list, r_count); |
1789 | |
|
1790 | 0 | return itr; |
1791 | 0 | } |
1792 | | |
1793 | | hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) |
1794 | 0 | { |
1795 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1796 | |
|
1797 | 0 | if(!cidx || !hdr || !reglist) |
1798 | 0 | return NULL; |
1799 | | |
1800 | 0 | if (cidx->fmt == HTS_FMT_CRAI) |
1801 | 0 | return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, |
1802 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1803 | 0 | else |
1804 | 0 | return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, |
1805 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1806 | 0 | } |
1807 | | |
1808 | | /********************** |
1809 | | *** SAM header I/O *** |
1810 | | **********************/ |
1811 | | |
1812 | | #include "htslib/kseq.h" |
1813 | | #include "htslib/kstring.h" |
1814 | | |
1815 | | sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) |
1816 | 0 | { |
1817 | 0 | sam_hdr_t *bh = sam_hdr_init(); |
1818 | 0 | if (!bh) return NULL; |
1819 | | |
1820 | 0 | if (sam_hdr_add_lines(bh, text, l_text) != 0) { |
1821 | 0 | sam_hdr_destroy(bh); |
1822 | 0 | return NULL; |
1823 | 0 | } |
1824 | | |
1825 | 0 | return bh; |
1826 | 0 | } |
1827 | | |
1828 | | // Minimal sanitisation of a header to ensure. |
1829 | | // - null terminated string. |
1830 | | // - all lines start with @ (also implies no blank lines). |
1831 | | // |
1832 | | // Much more could be done, but currently is not, including: |
1833 | | // - checking header types are known (HD, SQ, etc). |
1834 | | // - syntax (eg checking tab separated fields). |
1835 | | // - validating n_targets matches @SQ records. |
1836 | | // - validating target lengths against @SQ records. |
1837 | 6.06k | static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { |
1838 | 6.06k | if (!h) |
1839 | 51 | return NULL; |
1840 | | |
1841 | | // Special case for empty headers. |
1842 | 6.01k | if (h->l_text == 0) |
1843 | 939 | return h; |
1844 | | |
1845 | 5.07k | size_t i; |
1846 | 5.07k | unsigned int lnum = 0; |
1847 | 5.07k | char *cp = h->text, last = '\n'; |
1848 | 31.3M | for (i = 0; i < h->l_text; i++) { |
1849 | | // NB: l_text excludes terminating nul. This finds early ones. |
1850 | 31.3M | if (cp[i] == 0) |
1851 | 2.27k | break; |
1852 | | |
1853 | | // Error on \n[^@], including duplicate newlines |
1854 | 31.3M | if (last == '\n') { |
1855 | 152k | lnum++; |
1856 | 152k | if (cp[i] != '@') { |
1857 | 0 | hts_log_error("Malformed SAM header at line %u", lnum); |
1858 | 0 | sam_hdr_destroy(h); |
1859 | 0 | return NULL; |
1860 | 0 | } |
1861 | 152k | } |
1862 | | |
1863 | 31.3M | last = cp[i]; |
1864 | 31.3M | } |
1865 | | |
1866 | 5.07k | if (i < h->l_text) { // Early nul found. Complain if not just padding. |
1867 | 2.27k | size_t j = i; |
1868 | 14.7k | while (j < h->l_text && cp[j] == '\0') j++; |
1869 | 2.27k | if (j < h->l_text) |
1870 | 2.27k | hts_log_warning("Unexpected NUL character in header. Possibly truncated"); |
1871 | 2.27k | } |
1872 | | |
1873 | | // Add trailing newline and/or trailing nul if required. |
1874 | 5.07k | if (last != '\n') { |
1875 | 2.26k | hts_log_warning("Missing trailing newline on SAM header. Possibly truncated"); |
1876 | | |
1877 | 2.26k | if (h->l_text < 2 || i >= h->l_text - 2) { |
1878 | 189 | if (h->l_text >= SIZE_MAX - 2) { |
1879 | 0 | hts_log_error("No room for extra newline"); |
1880 | 0 | sam_hdr_destroy(h); |
1881 | 0 | return NULL; |
1882 | 0 | } |
1883 | | |
1884 | 189 | cp = realloc(h->text, (size_t) h->l_text+2); |
1885 | 189 | if (!cp) { |
1886 | 0 | sam_hdr_destroy(h); |
1887 | 0 | return NULL; |
1888 | 0 | } |
1889 | 189 | h->text = cp; |
1890 | 189 | } |
1891 | 2.26k | cp[i++] = '\n'; |
1892 | | |
1893 | | // l_text may be larger already due to multiple nul padding |
1894 | 2.26k | if (h->l_text < i) |
1895 | 0 | h->l_text = i; |
1896 | 2.26k | cp[h->l_text] = '\0'; |
1897 | 2.26k | } |
1898 | | |
1899 | 5.07k | return h; |
1900 | 5.07k | } |
1901 | | |
1902 | 5.01k | static sam_hdr_t *sam_hdr_create(htsFile* fp) { |
1903 | 5.01k | sam_hdr_t* h = sam_hdr_init(); |
1904 | 5.01k | if (!h) |
1905 | 0 | return NULL; |
1906 | | |
1907 | 5.01k | if (sam_hdr_build_from_sam_file(h, fp) != 0) { |
1908 | 405 | sam_hdr_destroy(h); |
1909 | 405 | return NULL; |
1910 | 405 | } |
1911 | | |
1912 | 4.60k | if (fp->bam_header) |
1913 | 0 | sam_hdr_destroy(fp->bam_header); |
1914 | 4.60k | fp->bam_header = sam_hdr_sanitise(h); |
1915 | 4.60k | fp->bam_header->ref_count = 1; |
1916 | | |
1917 | 4.60k | return fp->bam_header; |
1918 | 5.01k | } |
1919 | | |
1920 | | sam_hdr_t *sam_hdr_read(htsFile *fp) |
1921 | 8.37k | { |
1922 | 8.37k | sam_hdr_t *h = NULL; |
1923 | 8.37k | if (!fp) { |
1924 | 0 | errno = EINVAL; |
1925 | 0 | return NULL; |
1926 | 0 | } |
1927 | | |
1928 | 8.37k | switch (fp->format.format) { |
1929 | 153 | case bam: |
1930 | 153 | h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); |
1931 | 153 | break; |
1932 | | |
1933 | 1.30k | case cram: |
1934 | 1.30k | h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header)); |
1935 | 1.30k | break; |
1936 | | |
1937 | 5.01k | case sam: |
1938 | 5.01k | h = sam_hdr_create(fp); |
1939 | 5.01k | break; |
1940 | | |
1941 | 24 | case fastq_format: |
1942 | 1.89k | case fasta_format: |
1943 | 1.89k | return sam_hdr_init(); |
1944 | | |
1945 | 0 | case empty_format: |
1946 | 0 | errno = EPIPE; |
1947 | 0 | return NULL; |
1948 | | |
1949 | 0 | default: |
1950 | 0 | errno = EFTYPE; |
1951 | 0 | return NULL; |
1952 | 8.37k | } |
1953 | | //only sam,bam and cram reaches here |
1954 | 6.47k | if (h && !fp->bam_header) { //set except for sam which already has it |
1955 | | //for cram, it is the o/p header as for rest and not the internal header |
1956 | 1.41k | fp->bam_header = h; |
1957 | 1.41k | sam_hdr_incr_ref(fp->bam_header); |
1958 | 1.41k | } |
1959 | 6.47k | return h; |
1960 | 8.37k | } |
1961 | | |
1962 | | int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) |
1963 | 7.91k | { |
1964 | 7.91k | if (!fp || !h) { |
1965 | 0 | errno = EINVAL; |
1966 | 0 | return -1; |
1967 | 0 | } |
1968 | | |
1969 | 7.91k | switch (fp->format.format) { |
1970 | 2.63k | case binary_format: |
1971 | 2.63k | fp->format.category = sequence_data; |
1972 | 2.63k | fp->format.format = bam; |
1973 | | /* fall-through */ |
1974 | 2.63k | case bam: |
1975 | 2.63k | if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1; |
1976 | 2.63k | break; |
1977 | | |
1978 | 2.63k | case cram: { |
1979 | 2.63k | cram_fd *fd = fp->fp.cram; |
1980 | 2.63k | if (cram_set_header2(fd, h) < 0) return -1; |
1981 | 2.50k | if (fp->fn_aux) |
1982 | 0 | cram_load_reference(fd, fp->fn_aux); |
1983 | 2.50k | if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; |
1984 | 2.50k | } |
1985 | 2.50k | break; |
1986 | | |
1987 | 2.63k | case text_format: |
1988 | 2.63k | fp->format.category = sequence_data; |
1989 | 2.63k | fp->format.format = sam; |
1990 | | /* fall-through */ |
1991 | 2.63k | case sam: { |
1992 | 2.63k | if (!h->hrecs && !h->text) |
1993 | 0 | return 0; |
1994 | 2.63k | char *text; |
1995 | 2.63k | kstring_t hdr_ks = { 0, 0, NULL }; |
1996 | 2.63k | size_t l_text; |
1997 | 2.63k | ssize_t bytes; |
1998 | 2.63k | int r = 0, no_sq = 0; |
1999 | | |
2000 | 2.63k | if (h->hrecs) { |
2001 | 2.63k | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) |
2002 | 0 | return -1; |
2003 | 2.63k | text = hdr_ks.s; |
2004 | 2.63k | l_text = hdr_ks.l; |
2005 | 2.63k | } else { |
2006 | 0 | const char *p = NULL; |
2007 | 0 | do { |
2008 | 0 | const char *q = p == NULL ? h->text : p + 4; |
2009 | 0 | p = strstr(q, "@SQ\t"); |
2010 | 0 | } while (!(p == NULL || p == h->text || *(p - 1) == '\n')); |
2011 | 0 | no_sq = p == NULL; |
2012 | 0 | text = h->text; |
2013 | 0 | l_text = h->l_text; |
2014 | 0 | } |
2015 | | |
2016 | 2.63k | if (fp->is_bgzf) { |
2017 | 0 | bytes = bgzf_write(fp->fp.bgzf, text, l_text); |
2018 | 2.63k | } else { |
2019 | 2.63k | bytes = hwrite(fp->fp.hfile, text, l_text); |
2020 | 2.63k | } |
2021 | 2.63k | free(hdr_ks.s); |
2022 | 2.63k | if (bytes != l_text) |
2023 | 0 | return -1; |
2024 | | |
2025 | 2.63k | if (no_sq) { |
2026 | 0 | int i; |
2027 | 0 | for (i = 0; i < h->n_targets; ++i) { |
2028 | 0 | fp->line.l = 0; |
2029 | 0 | r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0; |
2030 | 0 | r |= kputs(h->target_name[i], &fp->line) < 0; |
2031 | 0 | r |= kputsn("\tLN:", 4, &fp->line) < 0; |
2032 | 0 | r |= kputw(h->target_len[i], &fp->line) < 0; |
2033 | 0 | r |= kputc('\n', &fp->line) < 0; |
2034 | 0 | if (r != 0) |
2035 | 0 | return -1; |
2036 | | |
2037 | 0 | if (fp->is_bgzf) { |
2038 | 0 | bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); |
2039 | 0 | } else { |
2040 | 0 | bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); |
2041 | 0 | } |
2042 | 0 | if (bytes != fp->line.l) |
2043 | 0 | return -1; |
2044 | 0 | } |
2045 | 0 | } |
2046 | 2.63k | if (fp->is_bgzf) { |
2047 | 0 | if (bgzf_flush(fp->fp.bgzf) != 0) return -1; |
2048 | 2.63k | } else { |
2049 | 2.63k | if (hflush(fp->fp.hfile) != 0) return -1; |
2050 | 2.63k | } |
2051 | 2.63k | } |
2052 | 2.63k | break; |
2053 | | |
2054 | 2.63k | case fastq_format: |
2055 | 0 | case fasta_format: |
2056 | | // Nothing to output; FASTQ has no file headers. |
2057 | 0 | return 0; |
2058 | 0 | break; |
2059 | | |
2060 | 0 | default: |
2061 | 0 | errno = EBADF; |
2062 | 0 | return -1; |
2063 | 7.91k | } |
2064 | | //only sam,bam and cram reaches here |
2065 | 7.78k | if (h) { //the new header |
2066 | 7.78k | sam_hdr_t *tmp = fp->bam_header; |
2067 | 7.78k | fp->bam_header = sam_hdr_dup(h); |
2068 | 7.78k | sam_hdr_destroy(tmp); |
2069 | 7.78k | if (!fp->bam_header && h) |
2070 | 0 | return -1; //failed to duplicate |
2071 | 7.78k | } |
2072 | 7.78k | return 0; |
2073 | 7.78k | } |
2074 | | |
2075 | | static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2076 | 0 | { |
2077 | 0 | char *p, *q, *beg = NULL, *end = NULL, *newtext; |
2078 | 0 | size_t new_l_text; |
2079 | 0 | if (!h || !key) |
2080 | 0 | return -1; |
2081 | | |
2082 | 0 | if (h->l_text > 3) { |
2083 | 0 | if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists |
2084 | 0 | if ((p = strchr(h->text, '\n')) == 0) return -1; |
2085 | 0 | *p = '\0'; // for strstr call |
2086 | |
|
2087 | 0 | char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' }; |
2088 | |
|
2089 | 0 | if ((q = strstr(h->text, tmp)) != 0) { // key exists |
2090 | 0 | *p = '\n'; // change back |
2091 | | |
2092 | | // mark the key:val |
2093 | 0 | beg = q; |
2094 | 0 | for (q += 4; *q != '\n' && *q != '\t'; ++q); |
2095 | 0 | end = q; |
2096 | |
|
2097 | 0 | if (val && (strncmp(beg + 4, val, end - beg - 4) == 0) |
2098 | 0 | && strlen(val) == end - beg - 4) |
2099 | 0 | return 0; // val is the same, no need to change |
2100 | |
|
2101 | 0 | } else { |
2102 | 0 | beg = end = p; |
2103 | 0 | *p = '\n'; |
2104 | 0 | } |
2105 | 0 | } |
2106 | 0 | } |
2107 | 0 | if (beg == NULL) { // no @HD |
2108 | 0 | new_l_text = h->l_text; |
2109 | 0 | if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9) |
2110 | 0 | return -1; |
2111 | 0 | new_l_text += strlen(SAM_FORMAT_VERSION) + 8; |
2112 | 0 | if (val) { |
2113 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2114 | 0 | return -1; |
2115 | 0 | new_l_text += strlen(val) + 4; |
2116 | 0 | } |
2117 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2118 | 0 | if (!newtext) return -1; |
2119 | | |
2120 | 0 | if (val) |
2121 | 0 | snprintf(newtext, new_l_text + 1, |
2122 | 0 | "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); |
2123 | 0 | else |
2124 | 0 | snprintf(newtext, new_l_text + 1, |
2125 | 0 | "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); |
2126 | 0 | } else { // has @HD but different or no key |
2127 | 0 | new_l_text = (beg - h->text) + (h->text + h->l_text - end); |
2128 | 0 | if (val) { |
2129 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2130 | 0 | return -1; |
2131 | 0 | new_l_text += strlen(val) + 4; |
2132 | 0 | } |
2133 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2134 | 0 | if (!newtext) return -1; |
2135 | | |
2136 | 0 | if (val) { |
2137 | 0 | snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s", |
2138 | 0 | (int) (beg - h->text), h->text, key, val, end); |
2139 | 0 | } else { //delete key |
2140 | 0 | snprintf(newtext, new_l_text + 1, "%.*s%s", |
2141 | 0 | (int) (beg - h->text), h->text, end); |
2142 | 0 | } |
2143 | 0 | } |
2144 | 0 | free(h->text); |
2145 | 0 | h->text = newtext; |
2146 | 0 | h->l_text = new_l_text; |
2147 | 0 | return 0; |
2148 | 0 | } |
2149 | | |
2150 | | |
2151 | | int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2152 | 0 | { |
2153 | 0 | if (!h || !key) |
2154 | 0 | return -1; |
2155 | | |
2156 | 0 | if (!h->hrecs) |
2157 | 0 | return old_sam_hdr_change_HD(h, key, val); |
2158 | | |
2159 | 0 | if (val) { |
2160 | 0 | if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0) |
2161 | 0 | return -1; |
2162 | 0 | } else { |
2163 | 0 | if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0) |
2164 | 0 | return -1; |
2165 | 0 | } |
2166 | 0 | return sam_hdr_rebuild(h); |
2167 | 0 | } |
2168 | | |
2169 | | /* releases existing header and sets new one; increments ref count if not |
2170 | | duplicating */ |
2171 | | int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate) |
2172 | 0 | { |
2173 | 0 | if (!fp) |
2174 | 0 | return -1; |
2175 | | |
2176 | 0 | if (duplicate) { |
2177 | 0 | sam_hdr_t *tmp = fp->bam_header; |
2178 | 0 | fp->bam_header = sam_hdr_dup(h); |
2179 | 0 | sam_hdr_destroy(tmp); |
2180 | 0 | if (!fp->bam_header && h) |
2181 | 0 | return -1; //duplicate failed |
2182 | 0 | } else { |
2183 | 0 | if (fp->bam_header != h) { //if not the same |
2184 | 0 | sam_hdr_destroy(fp->bam_header); |
2185 | 0 | fp->bam_header = h; |
2186 | 0 | sam_hdr_incr_ref(fp->bam_header); |
2187 | 0 | } |
2188 | 0 | } |
2189 | | |
2190 | 0 | return 0; |
2191 | 0 | } |
2192 | | |
2193 | | //return the bam_header, user has to use sam_hdr_incr_ref where ever required |
2194 | | sam_hdr_t* sam_hdr_get(samFile* fp) |
2195 | 0 | { |
2196 | 0 | if (!fp) |
2197 | 0 | return NULL; |
2198 | 0 | return fp->bam_header; |
2199 | 0 | } |
2200 | | |
2201 | | /********************** |
2202 | | *** SAM record I/O *** |
2203 | | **********************/ |
2204 | | |
2205 | | // The speed of this code can vary considerably depending on minor code |
2206 | | // changes elsewhere as some of the tight loops are particularly prone to |
2207 | | // speed changes when the instruction blocks are split over a 32-byte |
2208 | | // boundary. To protect against this, we explicitly specify an alignment |
2209 | | // for this function. If this is insufficient, we may also wish to |
2210 | | // consider alignment of blocks within this function via |
2211 | | // __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents. |
2212 | | // However it's not very portable. |
2213 | | // Instead we break into separate functions so we can explicitly specify |
2214 | | // use __attribute__((aligned(32))) instead and force consistent loop |
2215 | | // alignment. |
2216 | 357k | static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) { |
2217 | | // Avoid overflow on 32-bit platforms, but it breaks BAM anyway |
2218 | 357k | if (*n > INT32_MAX*0.666) { |
2219 | 0 | errno = ENOMEM; |
2220 | 0 | return -1; |
2221 | 0 | } |
2222 | | |
2223 | 357k | size_t bytes = (size_t)size * (size_t)(*n>>1); |
2224 | 357k | if (possibly_expand_bam_data(b, bytes) < 0) { |
2225 | 0 | hts_log_error("Out of memory"); |
2226 | 0 | return -1; |
2227 | 0 | } |
2228 | | |
2229 | 357k | (*n)+=*n>>1; |
2230 | 357k | return 0; |
2231 | 357k | } |
2232 | | |
2233 | | |
// Advance q until it points at a ',' or at a character that is not greater
// than '\t' (so a tab or a NUL also stops the scan).
//
// This ensures that q always ends up at the next comma after
// reading a number even if it's followed by junk.  It
// prevents the possibility of trying to read more than n items.
// Note that q is evaluated several times, so it must be a plain lvalue.
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2238 | | |
2239 | | HTS_ALIGN32 |
2240 | | static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused, |
2241 | 33.3k | uint32_t *nalloc, int *overflow) { |
2242 | 2.31M | while (*q == ',') { |
2243 | 2.28M | if ((*nused)++ >= (*nalloc)) { |
2244 | 567 | if (grow_B_array(b, nalloc, 1) < 0) |
2245 | 0 | return NULL; |
2246 | 567 | } |
2247 | 2.28M | *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow); |
2248 | 2.28M | b->l_data++; |
2249 | 2.28M | } |
2250 | 33.3k | return q; |
2251 | 33.3k | } |
2252 | | |
2253 | | HTS_ALIGN32 |
2254 | | static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused, |
2255 | 26.7k | uint32_t *nalloc, int *overflow) { |
2256 | 1.30M | while (*q == ',') { |
2257 | 1.27M | if ((*nused)++ >= (*nalloc)) { |
2258 | 2.57k | if (grow_B_array(b, nalloc, 1) < 0) |
2259 | 0 | return NULL; |
2260 | 2.57k | } |
2261 | 1.27M | if (q[1] != '-') { |
2262 | 1.24M | *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow); |
2263 | 1.24M | b->l_data++; |
2264 | 1.24M | } else { |
2265 | 26.6k | *overflow = 1; |
2266 | 26.6k | q++; |
2267 | 26.6k | skip_to_comma_(q); |
2268 | 26.6k | } |
2269 | 1.27M | } |
2270 | 26.7k | return q; |
2271 | 26.7k | } |
2272 | | |
2273 | | HTS_ALIGN32 |
2274 | | static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused, |
2275 | 11.1k | uint32_t *nalloc, int *overflow) { |
2276 | 4.26M | while (*q == ',') { |
2277 | 4.25M | if ((*nused)++ >= (*nalloc)) { |
2278 | 6.16k | if (grow_B_array(b, nalloc, 2) < 0) |
2279 | 0 | return NULL; |
2280 | 6.16k | } |
2281 | 4.25M | i16_to_le(hts_str2int(q + 1, &q, 16, overflow), |
2282 | 4.25M | b->data + b->l_data); |
2283 | 4.25M | b->l_data += 2; |
2284 | 4.25M | } |
2285 | 11.1k | return q; |
2286 | 11.1k | } |
2287 | | |
2288 | | HTS_ALIGN32 |
2289 | | static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused, |
2290 | 6.21k | uint32_t *nalloc, int *overflow) { |
2291 | 6.49M | while (*q == ',') { |
2292 | 6.49M | if ((*nused)++ >= (*nalloc)) { |
2293 | 14.4k | if (grow_B_array(b, nalloc, 2) < 0) |
2294 | 0 | return NULL; |
2295 | 14.4k | } |
2296 | 6.49M | if (q[1] != '-') { |
2297 | 6.39M | u16_to_le(hts_str2uint(q + 1, &q, 16, overflow), |
2298 | 6.39M | b->data + b->l_data); |
2299 | 6.39M | b->l_data += 2; |
2300 | 6.39M | } else { |
2301 | 98.7k | *overflow = 1; |
2302 | 98.7k | q++; |
2303 | 98.7k | skip_to_comma_(q); |
2304 | 98.7k | } |
2305 | 6.49M | } |
2306 | 6.21k | return q; |
2307 | 6.21k | } |
2308 | | |
2309 | | HTS_ALIGN32 |
2310 | | static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused, |
2311 | 33.3k | uint32_t *nalloc, int *overflow) { |
2312 | 16.0M | while (*q == ',') { |
2313 | 16.0M | if ((*nused)++ >= (*nalloc)) { |
2314 | 164 | if (grow_B_array(b, nalloc, 4) < 0) |
2315 | 0 | return NULL; |
2316 | 164 | } |
2317 | 16.0M | i32_to_le(hts_str2int(q + 1, &q, 32, overflow), |
2318 | 16.0M | b->data + b->l_data); |
2319 | 16.0M | b->l_data += 4; |
2320 | 16.0M | } |
2321 | 33.3k | return q; |
2322 | 33.3k | } |
2323 | | |
2324 | | HTS_ALIGN32 |
2325 | | static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused, |
2326 | 97.8k | uint32_t *nalloc, int *overflow) { |
2327 | 4.22M | while (*q == ',') { |
2328 | 4.13M | if ((*nused)++ >= (*nalloc)) { |
2329 | 279k | if (grow_B_array(b, nalloc, 4) < 0) |
2330 | 0 | return NULL; |
2331 | 279k | } |
2332 | 4.13M | if (q[1] != '-') { |
2333 | 3.97M | u32_to_le(hts_str2uint(q + 1, &q, 32, overflow), |
2334 | 3.97M | b->data + b->l_data); |
2335 | 3.97M | b->l_data += 4; |
2336 | 3.97M | } else { |
2337 | 159k | *overflow = 1; |
2338 | 159k | q++; |
2339 | 159k | skip_to_comma_(q); |
2340 | 159k | } |
2341 | 4.13M | } |
2342 | 97.8k | return q; |
2343 | 97.8k | } |
2344 | | |
2345 | | HTS_ALIGN32 |
2346 | | static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused, |
2347 | 16.4k | uint32_t *nalloc, int *overflow) { |
2348 | 432k | while (*q == ',') { |
2349 | 416k | if ((*nused)++ >= (*nalloc)) { |
2350 | 53.7k | if (grow_B_array(b, nalloc, 4) < 0) |
2351 | 0 | return NULL; |
2352 | 53.7k | } |
2353 | 416k | float_to_le(strtod(q + 1, &q), b->data + b->l_data); |
2354 | 416k | b->l_data += 4; |
2355 | 416k | } |
2356 | 16.4k | return q; |
2357 | 16.4k | } |
2358 | | |
// Parse the values of a SAM "B" (array) aux field and append them to b.
//
//   type    element type character (one of cCsSiIf)
//   nalloc  element capacity to start with; 0 means "not known yet"
//   in      start of the value list; each item is introduced by a ','
//   end     on success, set to where parsing stopped (a tab or NUL)
//   b       record whose data buffer receives 'B', type, a 32-bit element
//           count and then the packed values
//   ctr     recursion counter shared between passes
//
// If any value did not fit the requested type, everything appended so far is
// discarded, the range of the values is measured, and the function calls
// itself again with the smallest integer type that fits.
//
// Returns 0 on success, -1 on error.
HTS_ALIGN32
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
                              char **end, bam1_t *b,
                              int *ctr) {
    // Protect against infinite recursion when dealing with invalid input.
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
    // but it overflowed due to "-" and so we repeat ad-infinitum.
    //
    // Loop detection is the safest solution incase there are other
    // strange corner cases with malformed inputs.
    if (++(*ctr) > 2) {
        hts_log_error("Malformed data in B:%c array", type);
        return -1;
    }

    // Remember where this field starts so a retry can rewind b->l_data.
    int orig_l = b->l_data;
    char *q = in;
    int32_t size;
    size_t bytes;
    int overflow = 0;

    size = aux_type2size(type);
    if (size <= 0 || size > 4) {
        hts_log_error("Unrecognized type B:%c", type);
        return -1;
    }

    // Ensure space for type + values.
    // The first pass through here we don't know the number of entries and
    // nalloc == 0.  We start with a small working set and then parse the
    // data, growing as needed.
    //
    // If we have a second pass through we do know the number of entries
    // and nalloc is already known.  We have no need to expand the bam data.
    if (!nalloc)
        nalloc=7;

    // Ensure allocated memory is big enough (for current nalloc estimate)
    // The division checks that nalloc * size did not wrap around.
    bytes = (size_t) nalloc * (size_t) size;
    if (bytes / size != nalloc
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
        hts_log_error("Out of memory");
        return -1;
    }

    uint32_t nused = 0;

    b->data[b->l_data++] = 'B';
    b->data[b->l_data++] = type;
    // 32-bit B-array length is inserted later once we know it.
    int b_len_idx = b->l_data;
    b->l_data += sizeof(uint32_t);

    // Dispatch to the type-specific parser; each returns NULL only when the
    // array could not be grown.
    if (type == 'c') {
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    } else if (type == 'C') {
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    } else if (type == 's') {
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    } else if (type == 'S') {
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    } else if (type == 'i') {
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    } else if (type == 'I') {
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    } else if (type == 'f') {
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
            return -1;
    }
    if (*q != '\t' && *q != '\0') {
        // Unknown B array type or junk in the numbers
        hts_log_error("Malformed B:%c", type);
        return -1;
    }
    // Fill in the element count reserved at b_len_idx above.
    i32_to_le(nused, b->data + b_len_idx);

    if (!overflow) {
        *end = q;
        return 0;
    } else {
        int64_t max = 0, min = 0, val;
        // Given type was incorrect.  Try to rescue the situation.
        char *r = q;
        q = in;
        overflow = 0;
        // Discard everything appended for this field during this pass.
        b->l_data = orig_l;
        // Find out what range of values is present
        while (q < r) {
            val = hts_str2int(q + 1, &q, 64, &overflow);
            if (max < val) max = val;
            if (min > val) min = val;
            skip_to_comma_(q);
        }
        // Retry with appropriate type
        if (!overflow) {
            if (min < 0) {
                if (min >= INT8_MIN && max <= INT8_MAX) {
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
                }
            } else {
                // NOTE(review): this test is '<' while the 16- and 32-bit
                // tests below are '<=', so max == UINT8_MAX selects 'S'
                // rather than 'C' -- confirm whether that is intended.
                if (max < UINT8_MAX) {
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
                } else if (max <= UINT16_MAX) {
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
                } else if (max <= UINT32_MAX) {
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
                }
            }
        }
        // If here then at least one of the values is too big to store
        hts_log_error("Numeric value in B array out of allowed range");
        return -1;
    }
#undef skip_to_comma_
}
2484 | | |
2485 | | HTS_ALIGN32 |
2486 | | static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b) |
2487 | 154k | { |
2488 | 154k | int ctr = 0; |
2489 | 154k | uint32_t nalloc = 0; |
2490 | 154k | return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr); |
2491 | 154k | } |
2492 | | |
// Parse the FLAG column of a SAM record starting at v.
//
// - A leading '1'..'9' is parsed by hts_str2uint() as a 16-bit value.
// - A lone "0" followed by a tab yields 0.
// - Any other leading '0' is parsed by strtoul() with base 0 (hex or
//   octal); values above 65535 set *overflow and are clamped to 65535.
// - Anything else consumes nothing and yields 0.
//
// *rv is set to the first character after the parsed text.
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char first = *v;

    if (first >= '1' && first <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    if (first != '0') {
        // TODO implement symbolic flag letters
        *rv = v;
        return 0;
    }

    // handle single-digit "0" directly; otherwise it's hex or octal
    if (v[1] == '\t') {
        *rv = v+1;
        return 0;
    }

    unsigned long parsed = strtoul(v, rv, 0);
    if (parsed > 65535) {
        *overflow = 1;
        return 65535;
    }
    return parsed;
}
2512 | | |
2513 | | // Parse tag line and append to bam object b. |
2514 | | // Shared by both SAM and FASTQ parsers. |
2515 | | // |
2516 | | // The difference between the two is how lenient we are to recognising |
2517 | | // non-compliant strings. The FASTQ parser glosses over arbitrary |
2518 | | // non-SAM looking strings. |
//
// start, end:    the text range [start, end) holding the aux fields.
// b:             record whose data buffer the encoded fields are appended to.
// lenient:       when non-zero, a malformed field is skipped (through any
//                following whitespace) and b->l_data is rewound to where
//                that field began; a trailing fragment shorter than 5
//                bytes silently ends parsing.  When zero, such problems
//                are logged and parsing fails.
// tag_whitelist: if non-NULL, a field whose two-character tag (looked up
//                as q[0]*256 + q[1]) is not in the set is skipped up to
//                the next tab.
//
// Returns 0 on success and -2 on failure.
2519 | | static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, |
2520 | 344k | khash_t(tag) *tag_whitelist) { |
2521 | 344k | int overflow = 0; |
2522 | 344k | int checkpoint; |
2523 | 344k | char logbuf[40]; |
2524 | 344k | char *q = start, *p = end; |
2525 | | |
// _parse_err(cond, ...): if cond holds, either skip the current field and
// re-enter the loop body at "loop" (lenient), or log and fail (strict).
2526 | 344k | #define _parse_err(cond, ...) \ |
2527 | 8.56M | do { \ |
2528 | 18.6M | if (cond) { \ |
2529 | 666 | if (lenient) { \ |
2530 | 0 | while (q < p && !isspace_c(*q)) \ |
2531 | 0 | q++; \ |
2532 | 0 | while (q < p && isspace_c(*q)) \ |
2533 | 0 | q++; \ |
2534 | 0 | b->l_data = checkpoint; \ |
2535 | 0 | goto loop; \ |
2536 | 666 | } else { \ |
2537 | 666 | hts_log_error(__VA_ARGS__); \ |
2538 | 666 | goto err_ret; \ |
2539 | 666 | } \ |
2540 | 666 | } \ |
2541 | 8.56M | } while (0) |
2542 | | |
// Note that the "loop" label sits on the loop body, so the goto in
// _parse_err enters the body without re-testing q < p.
2543 | 8.06M | while (q < p) loop: { |
2544 | 8.06M | char type; |
// Remember where this field's output starts so it can be discarded
2545 | 8.06M | checkpoint = b->l_data; |
2546 | 8.06M | if (p - q < 5) { |
2547 | 27 | if (lenient) { |
2548 | 0 | break; |
2549 | 27 | } else { |
2550 | 27 | hts_log_error("Incomplete aux field"); |
2551 | 27 | goto err_ret; |
2552 | 27 | } |
2553 | 27 | } |
2554 | 4.03M | _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id"); |
2555 | | |
// Quick (not exact) check for the TAG:TYPE: layout: the bitwise OR of
// bytes 2 and 4 must equal ':'.
2556 | 4.03M | if (lenient && (q[2] | q[4]) != ':') { |
2557 | 0 | while (q < p && !isspace_c(*q)) |
2558 | 0 | q++; |
2559 | 0 | while (q < p && isspace_c(*q)) |
2560 | 0 | q++; |
2561 | 0 | continue; |
2562 | 0 | } |
2563 | | |
2564 | 4.03M | if (tag_whitelist) { |
2565 | 0 | int tt = q[0]*256 + q[1]; |
2566 | 0 | if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) { |
2567 | 0 | while (q < p && *q != '\t') |
2568 | 0 | q++; |
2569 | 0 | continue; |
2570 | 0 | } |
2571 | 0 | } |
2572 | | |
2573 | | // Copy over id |
2574 | 4.03M | if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; |
2575 | 4.03M | memcpy(b->data + b->l_data, q, 2); b->l_data += 2; |
2576 | 4.03M | q += 3; type = *q++; ++q; // q points to value |
2577 | 4.03M | if (type != 'Z' && type != 'H') // the only zero length acceptable fields |
2578 | 3.21M | _parse_err(*q <= '\t', "incomplete aux field"); |
2579 | | |
2580 | | // Ensure enough space for a double + type allocated. |
2581 | 4.03M | if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; |
2582 | | |
// A single character is always stored with type 'A', whichever of
// A/a/c/C was given in the text.
2583 | 4.03M | if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { |
2584 | 1.16M | b->data[b->l_data++] = 'A'; |
2585 | 1.16M | b->data[b->l_data++] = *q++; |
2586 | 2.87M | } else if (type == 'i' || type == 'I') { |
// Integers are stored in the narrowest type that holds the value:
// c/s/i when the text starts with '-', otherwise C/S/I.
2587 | 1.81M | if (*q == '-') { |
2588 | 1.49M | int32_t x = hts_str2int(q, &q, 32, &overflow); |
2589 | 1.49M | if (x >= INT8_MIN) { |
2590 | 796k | b->data[b->l_data++] = 'c'; |
2591 | 796k | b->data[b->l_data++] = x; |
2592 | 796k | } else if (x >= INT16_MIN) { |
2593 | 199k | b->data[b->l_data++] = 's'; |
2594 | 199k | i16_to_le(x, b->data + b->l_data); |
2595 | 199k | b->l_data += 2; |
2596 | 503k | } else { |
2597 | 503k | b->data[b->l_data++] = 'i'; |
2598 | 503k | i32_to_le(x, b->data + b->l_data); |
2599 | 503k | b->l_data += 4; |
2600 | 503k | } |
2601 | 1.49M | } else { |
2602 | 317k | uint32_t x = hts_str2uint(q, &q, 32, &overflow); |
2603 | 317k | if (x <= UINT8_MAX) { |
2604 | 185k | b->data[b->l_data++] = 'C'; |
2605 | 185k | b->data[b->l_data++] = x; |
2606 | 185k | } else if (x <= UINT16_MAX) { |
2607 | 104k | b->data[b->l_data++] = 'S'; |
2608 | 104k | u16_to_le(x, b->data + b->l_data); |
2609 | 104k | b->l_data += 2; |
2610 | 104k | } else { |
2611 | 27.0k | b->data[b->l_data++] = 'I'; |
2612 | 27.0k | u32_to_le(x, b->data + b->l_data); |
2613 | 27.0k | b->l_data += 4; |
2614 | 27.0k | } |
2615 | 317k | } |
2616 | 1.81M | } else if (type == 'f') { |
2617 | 41.9k | b->data[b->l_data++] = 'f'; |
2618 | 41.9k | float_to_le(strtod(q, &q), b->data + b->l_data); |
2619 | 41.9k | b->l_data += sizeof(float); |
2620 | 1.01M | } else if (type == 'd') { |
2621 | 42.9k | b->data[b->l_data++] = 'd'; |
2622 | 42.9k | double_to_le(strtod(q, &q), b->data + b->l_data); |
2623 | 42.9k | b->l_data += sizeof(double); |
2624 | 969k | } else if (type == 'Z' || type == 'H') { |
// Strings run to the next tab (or the terminating NUL) and are stored
// with a trailing NUL.  This local "end" shadows the function parameter.
2625 | 814k | char *end = strchr(q, '\t'); |
2626 | 814k | if (!end) end = q + strlen(q); |
2627 | 814k | _parse_err(type == 'H' && ((end-q)&1) != 0, |
2628 | 814k | "hex field does not have an even number of digits"); |
2629 | 814k | b->data[b->l_data++] = type; |
2630 | 814k | if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; |
2631 | 814k | memcpy(b->data + b->l_data, q, end - q); |
2632 | 814k | b->l_data += end - q; |
2633 | 814k | b->data[b->l_data++] = '\0'; |
2634 | 814k | q = end; |
2635 | 814k | } else if (type == 'B') { |
2636 | 154k | type = *q++; // q points to the first ',' following the typing byte |
2637 | 154k | _parse_err(*q && *q != ',' && *q != '\t', |
2638 | 154k | "B aux field type not followed by ','"); |
2639 | | |
2640 | 154k | if (sam_parse_B_vals(type, q, &q, b) < 0) |
2641 | 239 | goto err_ret; |
2642 | 154k | } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); |
2643 | | |
2644 | 21.4M | while (*q > '\t') { q++; } // Skip any junk to next tab |
2645 | 4.03M | q++; |
2646 | 4.03M | } |
2647 | | |
// Integer overflow seen anywhere above is only reported here, and only
// in strict mode.
2648 | 343k | _parse_err(!lenient && overflow != 0, "numeric value out of allowed range"); |
2649 | 343k | #undef _parse_err |
2650 | | |
2651 | 343k | return 0; |
2652 | | |
2653 | 932 | err_ret: |
2654 | 932 | return -2; |
2655 | 343k | } |
2656 | | |
// Parse one line of SAM text into a BAM record.
//
// s: the line to parse.  The text is modified in place: the tab ending
//    each token read with _read_token is overwritten with NUL.
// h: header used to translate reference names into target ids.
// b: destination record.  Its l_data is reset to 0 and the first 32
//    bytes of its core are zeroed before parsing starts.
//
// Returns 0 on success and -2 on any parse or allocation failure.
2657 | | int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) |
2658 | 345k | { |
// _read_token(_p): yields the current token start, then NUL-terminates
// the token at its tab and advances _p past it; fails if there is no tab.
2659 | 1.42M | #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) |
2660 | | |
2661 | 345k | #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff |
2662 | | |
// COPY_MINUS_N(to,from,n,l,failed): copy l bytes while subtracting n from
// each; failed is set when any resulting byte has its top bit set.
2663 | | // Macro that operates on 64-bits at a time. |
2664 | 345k | #define COPY_MINUS_N(to,from,n,l,failed) \ |
2665 | 345k | do { \ |
2666 | 274k | uint64_u *from8 = (uint64_u *)(from); \ |
2667 | 274k | uint64_u *to8 = (uint64_u *)(to); \ |
2668 | 274k | uint64_t uflow = 0; \ |
2669 | 274k | size_t l8 = (l)>>3, i; \ |
2670 | 274k | for (i = 0; i < l8; i++) { \ |
2671 | 0 | to8[i] = from8[i] - (n)*0x0101010101010101UL; \ |
2672 | 0 | uflow |= to8[i]; \ |
2673 | 0 | } \ |
2674 | 277k | for (i<<=3; i < (l); ++i) { \ |
2675 | 2.86k | to[i] = from[i] - (n); \ |
2676 | 2.86k | uflow |= to[i]; \ |
2677 | 2.86k | } \ |
2678 | 274k | failed = (uflow & 0x8080808080808080UL) > 0; \ |
2679 | 274k | } while (0) |
2680 | | |
2681 | | #else |
2682 | | |
2683 | | // Basic version which operates a byte at a time |
2684 | | #define COPY_MINUS_N(to,from,n,l,failed) do { \ |
2685 | | uint8_t uflow = 0; \ |
2686 | | for (i = 0; i < (l); ++i) { \ |
2687 | | (to)[i] = (from)[i] - (n); \ |
2688 | | uflow |= (uint8_t) (to)[i]; \ |
2689 | | } \ |
2690 | | failed = (uflow & 0x80) > 0; \ |
2691 | | } while (0) |
2692 | | |
2693 | | #endif |
2694 | | |
// _get_mem: reserve l bytes at the end of b->data, returning their address
// through x.  _parse_err logs and fails; _parse_warn only logs.
2695 | 604k | #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l) |
2696 | 4.38M | #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0) |
2697 | 1.22M | #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0) |
2698 | | |
2699 | 345k | uint8_t *t; |
2700 | | |
2701 | 345k | char *p = s->s, *q; |
2702 | 345k | int i, overflow = 0; |
2703 | 345k | char logbuf[40]; |
2704 | 345k | hts_pos_t cigreflen; |
2705 | 345k | bam1_core_t *c = &b->core; |
2706 | | |
2707 | 345k | b->l_data = 0; |
2708 | 345k | memset(c, 0, 32); |
2709 | | |
2710 | | // qname |
2711 | 345k | q = _read_token(p); |
2712 | | |
// p now points just past the name's terminating NUL, so p - q is the
// name length plus one.
2713 | 345k | _parse_warn(p - q <= 1, "empty query name"); |
2714 | 345k | _parse_err(p - q > 255, "query name too long"); |
2715 | | // resize large enough for name + extranul |
2716 | 345k | if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret; |
2717 | 345k | memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q; |
2718 | | |
// Pad with extra NULs so that l_data becomes a multiple of 4
2719 | 345k | c->l_extranul = (4 - (b->l_data & 3)) & 3; |
2720 | 345k | memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul); |
2721 | 345k | b->l_data += c->l_extranul; |
2722 | | |
2723 | 345k | c->l_qname = p - q + c->l_extranul; |
2724 | | |
2725 | | // flag |
2726 | 345k | c->flag = parse_sam_flag(p, &p, &overflow); |
2727 | 345k | if (*p++ != '\t') goto err_ret; // malformated flag |
2728 | | |
2729 | | // chr |
2730 | 345k | q = _read_token(p); |
2731 | 345k | if (strcmp(q, "*")) { |
2732 | 309k | _parse_err(h->n_targets == 0, "no SQ lines present in the header"); |
2733 | 309k | c->tid = bam_name2id(h, q); |
2734 | 309k | _parse_err(c->tid < -1, "failed to parse header"); |
2735 | 309k | _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2736 | 309k | } else c->tid = -1; |
2737 | | |
// The stored position is the parsed number minus one
2738 | | // pos |
2739 | 345k | c->pos = hts_str2uint(p, &p, 62, &overflow) - 1; |
2740 | 345k | if (*p++ != '\t') goto err_ret; |
2741 | 345k | if (c->pos < 0 && c->tid >= 0) { |
2742 | 42.3k | _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); |
2743 | 42.3k | c->tid = -1; |
2744 | 42.3k | } |
2745 | 345k | if (c->tid < 0) c->flag |= BAM_FUNMAP; |
2746 | | |
2747 | | // mapq |
2748 | 345k | c->qual = hts_str2uint(p, &p, 8, &overflow); |
2749 | 345k | if (*p++ != '\t') goto err_ret; |
2750 | | // cigar |
2751 | 344k | if (*p != '*') { |
2752 | 302k | uint32_t *cigar = NULL; |
2753 | 302k | int old_l_data = b->l_data; |
2754 | 302k | int n_cigar = bam_parse_cigar(p, &p, b); |
2755 | 302k | if (n_cigar < 1 || *p++ != '\t') goto err_ret; |
2756 | 301k | cigar = (uint32_t *)(b->data + old_l_data); |
2757 | | |
2758 | | // can't use bam_endpos() directly as some fields not yet set up |
2759 | 301k | cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; |
2760 | 301k | if (cigreflen == 0) cigreflen = 1; |
2761 | 301k | } else { |
2762 | 42.9k | _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); |
2763 | 42.9k | c->flag |= BAM_FUNMAP; |
2764 | 42.9k | q = _read_token(p); |
2765 | 42.9k | cigreflen = 1; |
2766 | 42.9k | } |
2767 | 344k | _parse_err(HTS_POS_MAX - cigreflen <= c->pos, |
2768 | 344k | "read ends beyond highest supported position"); |
2769 | 344k | c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5); |
2770 | | // mate chr |
2771 | 344k | q = _read_token(p); |
2772 | 344k | if (strcmp(q, "=") == 0) { |
2773 | 2 | c->mtid = c->tid; |
2774 | 344k | } else if (strcmp(q, "*") == 0) { |
2775 | 0 | c->mtid = -1; |
2776 | 344k | } else { |
2777 | 344k | c->mtid = bam_name2id(h, q); |
2778 | 344k | _parse_err(c->mtid < -1, "failed to parse header"); |
2779 | 344k | _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2780 | 344k | } |
2781 | | // mpos |
2782 | 344k | c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1; |
2783 | 344k | if (*p++ != '\t') goto err_ret; |
2784 | 344k | if (c->mpos < 0 && c->mtid >= 0) { |
2785 | 138k | _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); |
2786 | 138k | c->mtid = -1; |
2787 | 138k | } |
2788 | | // tlen |
2789 | 344k | c->isize = hts_str2int(p, &p, 63, &overflow); |
2790 | 344k | if (*p++ != '\t') goto err_ret; |
// overflow has accumulated across flag, pos, mapq, mpos and tlen
2791 | 344k | _parse_err(overflow, "number outside allowed range"); |
2792 | | // seq |
2793 | 344k | q = _read_token(p); |
2794 | 344k | if (strcmp(q, "*")) { |
2795 | 260k | _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long"); |
2796 | 260k | c->l_qseq = p - q - 1; |
2797 | 260k | hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname)); |
2798 | 260k | _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length"); |
2799 | 260k | i = (c->l_qseq + 1) >> 1; |
2800 | 260k | _get_mem(uint8_t, &t, b, i); |
2801 | | |
// From here an unsigned i shadows the function-level int i.  Bases are
// packed two per byte, the first of each pair in the high nibble.
2802 | 260k | unsigned int lqs2 = c->l_qseq&~1, i; |
2803 | 277k | for (i = 0; i < lqs2; i+=2) |
2804 | 17.7k | t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]]; |
2805 | 330k | for (; i < c->l_qseq; ++i) |
2806 | 70.6k | t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2); |
2807 | 260k | } else c->l_qseq = 0; |
2808 | | // qual |
2809 | 688k | _get_mem(uint8_t, &t, b, c->l_qseq); |
// A lone '*' means no qualities: every byte is filled with 0xff
2810 | 688k | if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) { |
2811 | 69.6k | memset(t, 0xff, c->l_qseq); |
2812 | 69.6k | p += 2; |
2813 | 274k | } else { |
2814 | 274k | int failed = 0; |
2815 | 274k | _parse_err(s->l - (p - s->s) < c->l_qseq |
2816 | 274k | || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'), |
2817 | 274k | "SEQ and QUAL are of different length"); |
// Stored qualities are the text characters minus 33
2818 | 274k | COPY_MINUS_N(t, p, 33, c->l_qseq, failed); |
2819 | 274k | _parse_err(failed, "invalid QUAL character"); |
2820 | 274k | p += c->l_qseq + 1; |
2821 | 274k | } |
2822 | | |
// Strict parsing (lenient = 0) with no tag whitelist
2823 | | // aux |
2824 | 344k | if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0) |
2825 | 932 | goto err_ret; |
2826 | | |
2827 | 343k | if (bam_tag2cigar(b, 1, 1) < 0) |
2828 | 0 | return -2; |
2829 | 343k | return 0; |
2830 | | |
2831 | 0 | #undef _parse_warn |
2832 | 0 | #undef _parse_err |
2833 | 0 | #undef _get_mem |
2834 | 0 | #undef _read_token |
2835 | 2.58k | err_ret: |
2836 | 2.58k | return -2; |
2837 | 343k | } |
2838 | | |
// Count the CIGAR operations in the text at q, scanning up to the first
// tab or NUL.  Every non-digit character counts as one operation.
//
// Returns the count, or 0 (after logging an error) when there are no
// operations or when the count reaches 2147483647.
static uint32_t read_ncigar(const char *q) {
    uint32_t n_ops = 0;
    const char *s = q;

    while (*s != '\0' && *s != '\t') {
        if (!isdigit_c(*s))
            n_ops++;
        s++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }

    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
2854 | | |
2855 | | /*! @function |
2856 | | @abstract Parse a CIGAR string into preallocated a uint32_t array |
2857 | | @param in [in] pointer to the source string |
2858 | | @param a_cigar [out] address of the destination uint32_t buffer |
2859 | | @return number of processed input characters; 0 on error |
2860 | | */ |
2861 | 301k | static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { |
2862 | 301k | int i, overflow = 0; |
2863 | 301k | const char *p = in; |
2864 | 931k | for (i = 0; i < n_cigar; i++) { |
2865 | 630k | uint32_t len; |
2866 | 630k | int op; |
2867 | 630k | char *q; |
2868 | 630k | len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT; |
2869 | 630k | if (q == p) { |
2870 | 123 | hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p); |
2871 | 123 | return 0; |
2872 | 123 | } |
2873 | 630k | if (overflow) { |
2874 | 24 | hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p); |
2875 | 24 | return 0; |
2876 | 24 | } |
2877 | 630k | p = q; |
2878 | 630k | op = bam_cigar_table[(unsigned char)*p++]; |
2879 | 630k | if (op < 0) { |
2880 | 207 | hts_log_error("Unrecognized CIGAR operator"); |
2881 | 207 | return 0; |
2882 | 207 | } |
2883 | 629k | a_cigar[i] = len; |
2884 | 629k | a_cigar[i] |= op; |
2885 | 629k | } |
2886 | | |
2887 | 301k | return p-in; |
2888 | 301k | } |
2889 | | |
// Parse a CIGAR string into a caller-owned uint32_t array, growing the
// array when it is too small.
//
// in:      CIGAR text; a leading '*' means there is no CIGAR.
// end:     if non-NULL, set to the first character after the parsed text
//          (it is left at `in` when nothing was consumed).
// a_cigar: address of the array pointer; replaced if the array is grown.
// a_mem:   address of the array capacity in elements; updated on growth.
//
// Returns the number of operations parsed (0 for '*', or when no operation
// was found), or -1 on NULL arguments, allocation failure or a malformed
// CIGAR.
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (in == NULL || a_cigar == NULL || a_mem == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }
    if (end) *end = (char *)in;

    if (in[0] == '*') {
        if (end) *end += 1;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0) return 0;

    if (*a_mem < n_ops) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (grown == NULL) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int consumed = parse_cigar(in, *a_cigar, n_ops);
    if (consumed == 0) return -1;
    if (end) *end = (char *)in + consumed;

    return n_ops;
}
2922 | | |
2923 | 302k | ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { |
2924 | 302k | size_t n_cigar = 0; |
2925 | 302k | int diff; |
2926 | | |
2927 | 302k | if (!in || !b) { |
2928 | 0 | hts_log_error("NULL pointer arguments"); |
2929 | 0 | return -1; |
2930 | 0 | } |
2931 | 302k | if (end) *end = (char *)in; |
2932 | | |
2933 | 302k | n_cigar = (*in == '*') ? 0 : read_ncigar(in); |
2934 | 302k | if (!n_cigar && b->core.n_cigar == 0) { |
2935 | 104 | if (end) *end = (char *)in+1; |
2936 | 104 | return 0; |
2937 | 104 | } |
2938 | | |
2939 | 301k | ssize_t cig_diff = n_cigar - b->core.n_cigar; |
2940 | 301k | if (cig_diff > 0 && |
2941 | 301k | possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) { |
2942 | 0 | hts_log_error("Memory allocation error"); |
2943 | 0 | return -1; |
2944 | 0 | } |
2945 | | |
2946 | 301k | uint32_t *cig = bam_get_cigar(b); |
2947 | 301k | if ((uint8_t *)cig != b->data + b->l_data) { |
2948 | | // Modifying an BAM existing BAM record |
2949 | 0 | uint8_t *seq = bam_get_seq(b); |
2950 | 0 | memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq); |
2951 | 0 | } |
2952 | | |
2953 | 301k | if (n_cigar) { |
2954 | 301k | if (!(diff = parse_cigar(in, cig, n_cigar))) |
2955 | 354 | return -1; |
2956 | 301k | } else { |
2957 | 0 | diff = 1; // handle "*" |
2958 | 0 | } |
2959 | | |
2960 | 301k | b->l_data += cig_diff * sizeof(uint32_t); |
2961 | 301k | b->core.n_cigar = n_cigar; |
2962 | 301k | if (end) *end = (char *)in + diff; |
2963 | | |
2964 | 301k | return n_cigar; |
2965 | 301k | } |
2966 | | |
2967 | | /* |
2968 | | * ----------------------------------------------------------------------------- |
2969 | | * SAM threading |
2970 | | */ |
2971 | | // Size of SAM text block (reading) |
// Used as the initial capacity of each sp_lines text buffer.
2972 | 0 | #define SAM_NBYTES 240000 |
2973 | | |
2974 | | // Number of BAM records (writing, up to NB_mem in size) |
// NOTE(review): "NB_mem" is not defined in the code nearby - confirm
// which limit is meant.
2975 | 0 | #define SAM_NBAM 1000 |
2976 | | |
// Forward declaration; the struct is defined below.
2977 | | struct SAM_state; |
2978 | | |
2979 | | // Output job - a block of BAM records |
// Also the result type returned by sam_parse_worker when reading.
2980 | | typedef struct sp_bams { |
// Link used while the block sits on the SAM_state "bams" list
2981 | | struct sp_bams *next; |
// Copied from the serial of the sp_lines block it was parsed from
2982 | | int serial; |
2983 | | |
2984 | | bam1_t *bams; |
2985 | | int nbams, abams; // used and alloc for bams[] array |
2986 | | size_t bam_mem; // very approximate total size |
2987 | | |
2988 | | struct SAM_state *fd; |
2989 | | } sp_bams; |
2990 | | |
2991 | | // Input job - a block of SAM text |
2992 | | typedef struct sp_lines { |
// Link used while the block sits on the SAM_state "lines" list; must be
// NULL while the block is owned by a thread-pool job.
2993 | | struct sp_lines *next; |
// Sequence number taken from SAM_state.serial when the block is dispatched
2994 | | int serial; |
2995 | | |
// Text buffer; data_size bytes are in use.  alloc is the nominal
// capacity - the dispatcher allocates 8 bytes more than this.
2996 | | char *data; |
2997 | | int data_size; |
2998 | | int alloc; |
2999 | | |
3000 | | struct SAM_state *fd; |
// Freed together with the block by cleanup_sp_lines
3001 | | sp_bams *bams; |
3002 | | } sp_lines; |
3003 | | |
// Commands / states exchanged through SAM_state.command (guarded by
// command_m).
3004 | | enum sam_cmd { |
// No command pending; the value of a freshly calloc'ed SAM_state
3005 | | SAM_NONE = 0, |
// Set by sam_state_destroy to ask the dispatcher thread to stop
3006 | | SAM_CLOSE, |
// Polled for by sam_state_destroy when reading.
// NOTE(review): presumably set by the dispatcher once it has finished -
// the code that sets it is not nearby; confirm.
3007 | | SAM_CLOSE_DONE, |
// NOTE(review): no use of this value is visible nearby - confirm meaning.
3008 | | SAM_AT_EOF, |
3009 | | }; |
3010 | | |
// Per-file state for threaded SAM reading and writing; attached to
// htsFile.state by sam_state_create and released by sam_state_destroy.
3011 | | typedef struct SAM_state { |
// Header passed to sam_parse1; released with bam_hdr_destroy on destroy
3012 | | sam_hdr_t *h; |
3013 | | |
// Thread pool; destroyed with the state only when own_pool is set and
// the file is uncompressed
3014 | | hts_tpool *p; |
3015 | | int own_pool; |
// Guards the "lines" and "bams" lists below
3016 | | pthread_mutex_t lines_m; |
3017 | | hts_tpool_process *q; |
// Dispatcher thread; only joined when dispatcher_set is non-zero
3018 | | pthread_t dispatcher; |
3019 | | int dispatcher_set; |
3020 | | |
// Lists of sp_lines / sp_bams blocks kept for reuse
3021 | | sp_lines *lines; |
3022 | | sp_bams *bams; |
3023 | | |
// Block currently being filled when writing; sam_state_destroy
// dispatches it if it still holds records
3024 | | sp_bams *curr_bam; |
3025 | | int curr_idx; |
// Next serial number to give to a dispatched sp_lines block
3026 | | int serial; |
3027 | | |
3028 | | // Be warned: moving these mutexes around in this struct can reduce |
3029 | | // threading performance by up to 70%! |
// command_m also guards errcode (see sam_state_err)
3030 | | pthread_mutex_t command_m; |
3031 | | pthread_cond_t command_c; |
3032 | | enum sam_cmd command; |
3033 | | |
3034 | | // One of the E* errno codes |
// Only the first error reported is kept
3035 | | int errcode; |
3036 | | |
// Back-pointer to the owning file handle
3037 | | htsFile *fp; |
3038 | | } SAM_state; |
3039 | | |
3040 | | // Returns a SAM_state struct from a generic hFILE. |
3041 | | // |
3042 | | // Returns NULL on failure. |
3043 | 0 | static SAM_state *sam_state_create(htsFile *fp) { |
3044 | | // Ideally sam_open wouldn't be a #define to hts_open but instead would |
3045 | | // be a redirect call with an additional 'S' mode. This in turn would |
3046 | | // correctly set the designed format to sam instead of a generic |
3047 | | // text_format. |
3048 | 0 | if (fp->format.format != sam && fp->format.format != text_format) |
3049 | 0 | return NULL; |
3050 | | |
3051 | 0 | SAM_state *fd = calloc(1, sizeof(*fd)); |
3052 | 0 | if (!fd) |
3053 | 0 | return NULL; |
3054 | | |
3055 | 0 | fp->state = fd; |
3056 | 0 | fd->fp = fp; |
3057 | |
|
3058 | 0 | return fd; |
3059 | 0 | } |
3060 | | |
// Forward declarations; sam_format_worker is the job function that
// sam_state_destroy dispatches for the final block when writing.
3061 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); |
3062 | | static void *sam_format_worker(void *arg); |
3063 | | |
3064 | 0 | static void sam_state_err(SAM_state *fd, int errcode) { |
3065 | 0 | pthread_mutex_lock(&fd->command_m); |
3066 | 0 | if (!fd->errcode) |
3067 | 0 | fd->errcode = errcode; |
3068 | 0 | pthread_mutex_unlock(&fd->command_m); |
3069 | 0 | } |
3070 | | |
3071 | 0 | static void sam_free_sp_bams(sp_bams *b) { |
3072 | 0 | if (!b) |
3073 | 0 | return; |
3074 | | |
3075 | 0 | if (b->bams) { |
3076 | 0 | int i; |
3077 | 0 | for (i = 0; i < b->abams; i++) { |
3078 | 0 | if (b->bams[i].data) |
3079 | 0 | free(b->bams[i].data); |
3080 | 0 | } |
3081 | 0 | free(b->bams); |
3082 | 0 | } |
3083 | 0 | free(b); |
3084 | 0 | } |
3085 | | |
3086 | | // Destroys the state produce by sam_state_create. |
3087 | 9.31k | int sam_state_destroy(htsFile *fp) { |
3088 | 9.31k | int ret = 0; |
3089 | | |
3090 | 9.31k | if (!fp->state) |
3091 | 9.31k | return 0; |
3092 | | |
3093 | 0 | SAM_state *fd = fp->state; |
3094 | 0 | if (fd->p) { |
3095 | 0 | if (fd->h) { |
3096 | | // Notify sam_dispatcher we're closing |
3097 | 0 | pthread_mutex_lock(&fd->command_m); |
3098 | 0 | if (fd->command != SAM_CLOSE_DONE) |
3099 | 0 | fd->command = SAM_CLOSE; |
3100 | 0 | pthread_cond_signal(&fd->command_c); |
3101 | 0 | ret = -fd->errcode; |
3102 | 0 | if (fd->q) |
3103 | 0 | hts_tpool_wake_dispatch(fd->q); // unstick the reader |
3104 | |
|
3105 | 0 | if (!fp->is_write && fd->q && fd->dispatcher_set) { |
3106 | 0 | for (;;) { |
3107 | | // Avoid deadlocks with dispatcher |
3108 | 0 | if (fd->command == SAM_CLOSE_DONE) |
3109 | 0 | break; |
3110 | 0 | hts_tpool_wake_dispatch(fd->q); |
3111 | 0 | pthread_mutex_unlock(&fd->command_m); |
3112 | 0 | hts_usleep(10000); |
3113 | 0 | pthread_mutex_lock(&fd->command_m); |
3114 | 0 | } |
3115 | 0 | } |
3116 | 0 | pthread_mutex_unlock(&fd->command_m); |
3117 | |
|
3118 | 0 | if (fp->is_write) { |
3119 | | // Dispatch the last partial block. |
3120 | 0 | sp_bams *gb = fd->curr_bam; |
3121 | 0 | if (!ret && gb && gb->nbams > 0 && fd->q) |
3122 | 0 | ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb); |
3123 | | |
3124 | | // Flush and drain output |
3125 | 0 | if (fd->q) |
3126 | 0 | hts_tpool_process_flush(fd->q); |
3127 | 0 | pthread_mutex_lock(&fd->command_m); |
3128 | 0 | if (!ret) ret = -fd->errcode; |
3129 | 0 | pthread_mutex_unlock(&fd->command_m); |
3130 | |
|
3131 | 0 | while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) { |
3132 | 0 | hts_usleep(10000); |
3133 | 0 | pthread_mutex_lock(&fd->command_m); |
3134 | 0 | ret = -fd->errcode; |
3135 | | // not empty but shutdown implies error |
3136 | 0 | if (hts_tpool_process_is_shutdown(fd->q) && !ret) |
3137 | 0 | ret = EIO; |
3138 | 0 | pthread_mutex_unlock(&fd->command_m); |
3139 | 0 | } |
3140 | 0 | if (fd->q) |
3141 | 0 | hts_tpool_process_shutdown(fd->q); |
3142 | 0 | } |
3143 | | |
3144 | | // Wait for it to acknowledge |
3145 | 0 | if (fd->dispatcher_set) |
3146 | 0 | pthread_join(fd->dispatcher, NULL); |
3147 | 0 | if (!ret) ret = -fd->errcode; |
3148 | 0 | } |
3149 | | |
3150 | | // Tidy up memory |
3151 | 0 | if (fd->q) |
3152 | 0 | hts_tpool_process_destroy(fd->q); |
3153 | |
|
3154 | 0 | if (fd->own_pool && fp->format.compression == no_compression) { |
3155 | 0 | hts_tpool_destroy(fd->p); |
3156 | 0 | fd->p = NULL; |
3157 | 0 | } |
3158 | 0 | pthread_mutex_destroy(&fd->lines_m); |
3159 | 0 | pthread_mutex_destroy(&fd->command_m); |
3160 | 0 | pthread_cond_destroy(&fd->command_c); |
3161 | |
|
3162 | 0 | sp_lines *l = fd->lines; |
3163 | 0 | while (l) { |
3164 | 0 | sp_lines *n = l->next; |
3165 | 0 | free(l->data); |
3166 | 0 | free(l); |
3167 | 0 | l = n; |
3168 | 0 | } |
3169 | |
|
3170 | 0 | sp_bams *b = fd->bams; |
3171 | 0 | while (b) { |
3172 | 0 | if (fd->curr_bam == b) |
3173 | 0 | fd->curr_bam = NULL; |
3174 | 0 | sp_bams *n = b->next; |
3175 | 0 | sam_free_sp_bams(b); |
3176 | 0 | b = n; |
3177 | 0 | } |
3178 | |
|
3179 | 0 | if (fd->curr_bam) |
3180 | 0 | sam_free_sp_bams(fd->curr_bam); |
3181 | | |
3182 | | // Decrement counter by one, maybe destroying too. |
3183 | | // This is to permit the caller using bam_hdr_destroy |
3184 | | // before sam_close without triggering decode errors |
3185 | | // in the background threads. |
3186 | 0 | bam_hdr_destroy(fd->h); |
3187 | 0 | } |
3188 | |
|
3189 | 0 | free(fp->state); |
3190 | 0 | fp->state = NULL; |
3191 | 0 | return ret; |
3192 | 9.31k | } |
3193 | | |
3194 | | // Cleanup function - job for sam_parse_worker; result for sam_format_worker |
3195 | 0 | static void cleanup_sp_lines(void *arg) { |
3196 | 0 | sp_lines *gl = (sp_lines *)arg; |
3197 | 0 | if (!gl) return; |
3198 | | |
3199 | | // Should always be true for lines passed to / from thread workers. |
3200 | 0 | assert(gl->next == NULL); |
3201 | |
|
3202 | 0 | free(gl->data); |
3203 | 0 | sam_free_sp_bams(gl->bams); |
3204 | 0 | free(gl); |
3205 | 0 | } |
3206 | | |
3207 | | // Run from one of the worker threads. |
3208 | | // Convert a passed in array of lines to array of BAMs, returning |
3209 | | // the result back to the thread queue. |
3210 | 0 | static void *sam_parse_worker(void *arg) { |
3211 | 0 | sp_lines *gl = (sp_lines *)arg; |
3212 | 0 | sp_bams *gb = NULL; |
3213 | 0 | char *lines = gl->data; |
3214 | 0 | int i; |
3215 | 0 | bam1_t *b; |
3216 | 0 | SAM_state *fd = gl->fd; |
3217 | | |
3218 | | // Use a block of BAM structs we had earlier if available. |
3219 | 0 | pthread_mutex_lock(&fd->lines_m); |
3220 | 0 | if (fd->bams) { |
3221 | 0 | gb = fd->bams; |
3222 | 0 | fd->bams = gb->next; |
3223 | 0 | } |
3224 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3225 | |
|
3226 | 0 | if (gb == NULL) { |
3227 | 0 | gb = calloc(1, sizeof(*gb)); |
3228 | 0 | if (!gb) { |
3229 | 0 | return NULL; |
3230 | 0 | } |
3231 | 0 | gb->abams = 100; |
3232 | 0 | gb->bams = b = calloc(gb->abams, sizeof(*b)); |
3233 | 0 | if (!gb->bams) { |
3234 | 0 | sam_state_err(fd, ENOMEM); |
3235 | 0 | goto err; |
3236 | 0 | } |
3237 | 0 | gb->nbams = 0; |
3238 | 0 | gb->bam_mem = 0; |
3239 | 0 | } |
3240 | 0 | gb->serial = gl->serial; |
3241 | 0 | gb->next = NULL; |
3242 | |
|
3243 | 0 | b = (bam1_t *)gb->bams; |
3244 | 0 | if (!b) { |
3245 | 0 | sam_state_err(fd, ENOMEM); |
3246 | 0 | goto err; |
3247 | 0 | } |
3248 | | |
3249 | 0 | i = 0; |
3250 | 0 | char *cp = lines, *cp_end = lines + gl->data_size; |
3251 | 0 | while (cp < cp_end) { |
3252 | 0 | if (i >= gb->abams) { |
3253 | 0 | int old_abams = gb->abams; |
3254 | 0 | gb->abams *= 2; |
3255 | 0 | b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t)); |
3256 | 0 | if (!b) { |
3257 | 0 | gb->abams /= 2; |
3258 | 0 | sam_state_err(fd, ENOMEM); |
3259 | 0 | goto err; |
3260 | 0 | } |
3261 | 0 | memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b)); |
3262 | 0 | gb->bams = b; |
3263 | 0 | } |
3264 | | |
3265 | | // Ideally we'd get sam_parse1 to return the number of |
3266 | | // bytes decoded and to be able to stop on newline as |
3267 | | // well as \0. |
3268 | | // |
3269 | | // We can then avoid the additional strchr loop. |
3270 | | // It's around 6% of our CPU cost, albeit threadable. |
3271 | | // |
3272 | | // However this is an API change so for now we copy. |
3273 | | |
3274 | 0 | char *nl = strchr(cp, '\n'); |
3275 | 0 | char *line_end; |
3276 | 0 | if (nl) { |
3277 | 0 | line_end = nl; |
3278 | 0 | if (line_end > cp && *(line_end - 1) == '\r') |
3279 | 0 | line_end--; |
3280 | 0 | nl++; |
3281 | 0 | } else { |
3282 | 0 | nl = line_end = cp_end; |
3283 | 0 | } |
3284 | 0 | *line_end = '\0'; |
3285 | 0 | kstring_t ks = { line_end - cp, gl->alloc, cp }; |
3286 | 0 | if (sam_parse1(&ks, fd->h, &b[i]) < 0) { |
3287 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3288 | 0 | cleanup_sp_lines(gl); |
3289 | 0 | goto err; |
3290 | 0 | } |
3291 | | |
3292 | 0 | cp = nl; |
3293 | 0 | i++; |
3294 | 0 | } |
3295 | 0 | gb->nbams = i; |
3296 | |
|
3297 | 0 | pthread_mutex_lock(&fd->lines_m); |
3298 | 0 | gl->next = fd->lines; |
3299 | 0 | fd->lines = gl; |
3300 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3301 | 0 | return gb; |
3302 | | |
3303 | 0 | err: |
3304 | 0 | sam_free_sp_bams(gb); |
3305 | 0 | return NULL; |
3306 | 0 | } |
3307 | | |
// Job function that ignores its argument and produces a NULL result.
static void *sam_parse_eof(void *arg) {
    (void)arg;
    return NULL;
}
3311 | | |
3312 | | // Cleanup function - result for sam_parse_worker; job for sam_format_worker |
3313 | 0 | static void cleanup_sp_bams(void *arg) { |
3314 | 0 | sam_free_sp_bams((sp_bams *) arg); |
3315 | 0 | } |
3316 | | |
3317 | | // Runs in its own thread. |
3318 | | // Reads a block of text (SAM) and sends a new job to the thread queue to |
3319 | | // translate this to BAM. |
3320 | 0 | static void *sam_dispatcher_read(void *vp) { |
3321 | 0 | htsFile *fp = vp; |
3322 | 0 | kstring_t line = {0}; |
3323 | 0 | int line_frag = 0; |
3324 | 0 | SAM_state *fd = fp->state; |
3325 | 0 | sp_lines *l = NULL; |
3326 | | |
3327 | | // Pre-allocate buffer for left-over bits of line (exact size doesn't |
3328 | | // matter as it will grow if necessary). |
3329 | 0 | if (ks_resize(&line, 1000) < 0) |
3330 | 0 | goto err; |
3331 | | |
3332 | 0 | for (;;) { |
3333 | | // Check for command |
3334 | 0 | pthread_mutex_lock(&fd->command_m); |
3335 | 0 | switch (fd->command) { |
3336 | | |
3337 | 0 | case SAM_CLOSE: |
3338 | 0 | pthread_cond_signal(&fd->command_c); |
3339 | 0 | pthread_mutex_unlock(&fd->command_m); |
3340 | 0 | hts_tpool_process_shutdown(fd->q); |
3341 | 0 | goto tidyup; |
3342 | | |
3343 | 0 | default: |
3344 | 0 | break; |
3345 | 0 | } |
3346 | 0 | pthread_mutex_unlock(&fd->command_m); |
3347 | |
|
3348 | 0 | pthread_mutex_lock(&fd->lines_m); |
3349 | 0 | if (fd->lines) { |
3350 | | // reuse existing line buffer |
3351 | 0 | l = fd->lines; |
3352 | 0 | fd->lines = l->next; |
3353 | 0 | } |
3354 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3355 | |
|
3356 | 0 | if (l == NULL) { |
3357 | | // none to reuse, to create a new one |
3358 | 0 | l = calloc(1, sizeof(*l)); |
3359 | 0 | if (!l) |
3360 | 0 | goto err; |
3361 | 0 | l->alloc = SAM_NBYTES; |
3362 | 0 | l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 |
3363 | 0 | if (!l->data) { |
3364 | 0 | free(l); |
3365 | 0 | l = NULL; |
3366 | 0 | goto err; |
3367 | 0 | } |
3368 | 0 | l->fd = fd; |
3369 | 0 | } |
3370 | 0 | l->next = NULL; |
3371 | |
|
3372 | 0 | if (l->alloc < line_frag+SAM_NBYTES/2) { |
3373 | 0 | char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); |
3374 | 0 | if (!rp) |
3375 | 0 | goto err; |
3376 | 0 | l->alloc = line_frag+SAM_NBYTES/2; |
3377 | 0 | l->data = rp; |
3378 | 0 | } |
3379 | 0 | memcpy(l->data, line.s, line_frag); |
3380 | |
|
3381 | 0 | l->data_size = line_frag; |
3382 | 0 | ssize_t nbytes; |
3383 | 0 | longer_line: |
3384 | 0 | if (fp->is_bgzf) |
3385 | 0 | nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag); |
3386 | 0 | else |
3387 | 0 | nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag); |
3388 | 0 | if (nbytes < 0) { |
3389 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3390 | 0 | goto err; |
3391 | 0 | } else if (nbytes == 0) |
3392 | 0 | break; // EOF |
3393 | 0 | l->data_size += nbytes; |
3394 | | |
3395 | | // trim to last \n. Maybe \r\n, but that's still fine |
3396 | 0 | if (nbytes == l->alloc - line_frag) { |
3397 | 0 | char *cp_end = l->data + l->data_size; |
3398 | 0 | char *cp = cp_end-1; |
3399 | |
|
3400 | 0 | while (cp > (char *)l->data && *cp != '\n') |
3401 | 0 | cp--; |
3402 | | |
3403 | | // entire buffer is part of a single line |
3404 | 0 | if (cp == l->data) { |
3405 | 0 | line_frag = l->data_size; |
3406 | 0 | char *rp = realloc(l->data, l->alloc * 2 + 8); |
3407 | 0 | if (!rp) |
3408 | 0 | goto err; |
3409 | 0 | l->alloc *= 2; |
3410 | 0 | l->data = rp; |
3411 | 0 | assert(l->alloc >= l->data_size); |
3412 | 0 | assert(l->alloc >= line_frag); |
3413 | 0 | assert(l->alloc >= l->alloc - line_frag); |
3414 | 0 | goto longer_line; |
3415 | 0 | } |
3416 | 0 | cp++; |
3417 | | |
3418 | | // line holds the remainder of our line. |
3419 | 0 | if (ks_resize(&line, cp_end - cp) < 0) |
3420 | 0 | goto err; |
3421 | 0 | memcpy(line.s, cp, cp_end - cp); |
3422 | 0 | line_frag = cp_end - cp; |
3423 | 0 | l->data_size = l->alloc - line_frag; |
3424 | 0 | } else { |
3425 | | // out of buffer |
3426 | 0 | line_frag = 0; |
3427 | 0 | } |
3428 | | |
3429 | 0 | l->serial = fd->serial++; |
3430 | | //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial); |
3431 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l, |
3432 | 0 | cleanup_sp_lines, cleanup_sp_bams, 0) < 0) |
3433 | 0 | goto err; |
3434 | 0 | pthread_mutex_lock(&fd->command_m); |
3435 | 0 | if (fd->command == SAM_CLOSE) { |
3436 | 0 | pthread_mutex_unlock(&fd->command_m); |
3437 | 0 | l = NULL; |
3438 | 0 | goto tidyup; |
3439 | 0 | } |
3440 | 0 | l = NULL; // Now "owned" by sam_parse_worker() |
3441 | 0 | pthread_mutex_unlock(&fd->command_m); |
3442 | 0 | } |
3443 | | |
3444 | | // Submit a NULL sp_bams entry to act as an EOF marker |
3445 | 0 | if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0) |
3446 | 0 | goto err; |
3447 | | |
3448 | | // At EOF, wait for close request. |
3449 | | // (In future if we add support for seek, this is where we need to catch it.) |
3450 | 0 | for (;;) { |
3451 | 0 | pthread_mutex_lock(&fd->command_m); |
3452 | 0 | if (fd->command == SAM_NONE) |
3453 | 0 | pthread_cond_wait(&fd->command_c, &fd->command_m); |
3454 | 0 | switch (fd->command) { |
3455 | 0 | case SAM_CLOSE: |
3456 | 0 | pthread_cond_signal(&fd->command_c); |
3457 | 0 | pthread_mutex_unlock(&fd->command_m); |
3458 | 0 | hts_tpool_process_shutdown(fd->q); |
3459 | 0 | goto tidyup; |
3460 | | |
3461 | 0 | default: |
3462 | 0 | pthread_mutex_unlock(&fd->command_m); |
3463 | 0 | break; |
3464 | 0 | } |
3465 | 0 | } |
3466 | | |
3467 | 0 | tidyup: |
3468 | 0 | pthread_mutex_lock(&fd->command_m); |
3469 | 0 | fd->command = SAM_CLOSE_DONE; |
3470 | 0 | pthread_cond_signal(&fd->command_c); |
3471 | 0 | pthread_mutex_unlock(&fd->command_m); |
3472 | |
|
3473 | 0 | if (l) { |
3474 | 0 | pthread_mutex_lock(&fd->lines_m); |
3475 | 0 | l->next = fd->lines; |
3476 | 0 | fd->lines = l; |
3477 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3478 | 0 | } |
3479 | 0 | free(line.s); |
3480 | |
|
3481 | 0 | return NULL; |
3482 | | |
3483 | 0 | err: |
3484 | 0 | sam_state_err(fd, errno ? errno : ENOMEM); |
3485 | 0 | hts_tpool_process_shutdown(fd->q); |
3486 | 0 | goto tidyup; |
3487 | 0 | } |
3488 | | |
// Runs in its own thread.
// Takes encoded blocks of SAM off the thread results queue and writes them
// to our output stream.
//
// vp is the htsFile being written; its state (fp->state) is the SAM_state
// whose result queue fd->q is drained here.
//
// Returns NULL once the result queue has been shut down and every block taken
// from it has been written, or (void *)-1 on failure.  The outcome is also
// recorded through sam_state_err() (0 on success).
//
// NOTE(review): the paths that jump to err release neither the result wrapper
// r nor the sp_lines it carries -- confirm whether they are reclaimed
// elsewhere.
static void *sam_dispatcher_write(void *vp) {
    htsFile *fp = vp;
    SAM_state *fd = fp->state;
    hts_tpool_result *r;

    // Iterates until result queue is shutdown, where it returns NULL.
    while ((r = hts_tpool_next_result_wait(fd->q))) {
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
        if (!gl) {
            // A result with no payload; presumably the worker failed
            // (sam_format_worker returns NULL on error).
            sam_state_err(fd, ENOMEM);
            goto err;
        }

        if (fp->idx) {
            // Building an index: write the text one line at a time so that
            // each record's bam1_t can be pushed to the index straight after
            // its line has been written.
            sp_bams *gb = gl->bams;
            int i = 0, count = 0;
            while (i < gl->data_size) {
                // [j, i) becomes the next line, including its '\n' if present
                int j = i;
                while (i < gl->data_size && gl->data[i] != '\n')
                    i++;
                if (i < gl->data_size)
                    i++;

                if (fp->is_bgzf) {
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
                        goto err;
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
                        goto err;
                } else {
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
                        goto err;
                }

                // The count'th line corresponds to the count'th bam record
                bam1_t *b = &gb->bams[count++];
                if (fp->format.compression == bgzf) {
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
                                      b->core.tid, b->core.pos, bam_endpos(b),
                                      bgzf_tell(fp->fp.bgzf),
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                      bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                } else {
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                      bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                }
            }

            // Every record must have had exactly one line of text
            assert(count == gb->nbams);

            // Add bam array to free-list
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gl->bams;
            gl->bams = NULL;
            pthread_mutex_unlock(&fd->lines_m);
        } else {
            if (fp->is_bgzf) {
                // We keep track of how much in the current block we have
                // remaining => R.  We look for the last newline in input
                // [i] to [i+R], backwards => position N.
                //
                // If we find a newline, we write out bytes i to N.
                // We know we cannot fit the next record in this bgzf block,
                // so we flush what we have and copy input N to i+R into
                // the start of a new block, and recompute a new R for that.
                //
                // If we don't find a newline (i==N) then we cannot extend
                // the current block at all, so flush whatever is in it now
                // if it ends on a newline.
                // We still copy i(==N) to i+R to the next block and
                // continue as before with a new R.
                //
                // The only exception on the flush is when we run out of
                // data in the input.  In that case we skip it as we don't
                // yet know if the next record will fit.
                //
                // Both conditions share the same code here:
                // - Look for newline (pos N)
                // - Write i to N (which may be 0)
                // - Flush if block ends on newline and not end of input
                // - write N to i+R

                int i = 0;
                BGZF *fb = fp->fp.bgzf;
                while (i < gl->data_size) {
                    // remaining space in block
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
                    int eod = 0; // set when R was clipped to the end of input
                    if (R > gl->data_size-i)
                        R = gl->data_size-i, eod = 1;

                    // Find last newline in input data
                    int N = i + R;
                    while (--N > i) {
                        if (gl->data[N] == '\n')
                            break;
                    }

                    if (N != i) {
                        // Found a newline
                        N++;
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
                            goto err;
                    }

                    // Flush bgzf block
                    int b_off = fb->block_offset;
                    if (!eod && b_off &&
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
                            goto err;

                    // Copy from N onwards into next block
                    if (i+R > N)
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
                            != i+R - N)
                            goto err;

                    i = i+R;
                }
            } else {
                // Uncompressed and not indexing: write the block in one go
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
                    goto err;
            }
        }

        // Frees the result wrapper only (free_data == 0); gl is recycled below
        hts_tpool_delete_result(r, 0);

        // Also updated by main thread
        pthread_mutex_lock(&fd->lines_m);
        gl->next = fd->lines;
        fd->lines = gl;
        pthread_mutex_unlock(&fd->lines_m);
    }

    sam_state_err(fd, 0); // success
    hts_tpool_process_shutdown(fd->q);
    return NULL;

 err:
    sam_state_err(fd, errno ? errno : EIO);
    return (void *)-1;
}
3643 | | |
3644 | | // Run from one of the worker threads. |
3645 | | // Convert a passed in array of BAMs (sp_bams) and converts to a block |
3646 | | // of text SAM records (sp_lines). |
3647 | 0 | static void *sam_format_worker(void *arg) { |
3648 | 0 | sp_bams *gb = (sp_bams *)arg; |
3649 | 0 | sp_lines *gl = NULL; |
3650 | 0 | int i; |
3651 | 0 | SAM_state *fd = gb->fd; |
3652 | 0 | htsFile *fp = fd->fp; |
3653 | | |
3654 | | // Use a block of SAM strings we had earlier if available. |
3655 | 0 | pthread_mutex_lock(&fd->lines_m); |
3656 | 0 | if (fd->lines) { |
3657 | 0 | gl = fd->lines; |
3658 | 0 | fd->lines = gl->next; |
3659 | 0 | } |
3660 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3661 | |
|
3662 | 0 | if (gl == NULL) { |
3663 | 0 | gl = calloc(1, sizeof(*gl)); |
3664 | 0 | if (!gl) { |
3665 | 0 | sam_state_err(fd, ENOMEM); |
3666 | 0 | return NULL; |
3667 | 0 | } |
3668 | 0 | gl->alloc = gl->data_size = 0; |
3669 | 0 | gl->data = NULL; |
3670 | 0 | } |
3671 | 0 | gl->serial = gb->serial; |
3672 | 0 | gl->next = NULL; |
3673 | |
|
3674 | 0 | kstring_t ks = {0, gl->alloc, gl->data}; |
3675 | |
|
3676 | 0 | for (i = 0; i < gb->nbams; i++) { |
3677 | 0 | if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) { |
3678 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3679 | 0 | goto err; |
3680 | 0 | } |
3681 | 0 | kputc('\n', &ks); |
3682 | 0 | } |
3683 | | |
3684 | 0 | pthread_mutex_lock(&fd->lines_m); |
3685 | 0 | gl->data_size = ks.l; |
3686 | 0 | gl->alloc = ks.m; |
3687 | 0 | gl->data = ks.s; |
3688 | |
|
3689 | 0 | if (fp->idx) { |
3690 | | // Keep hold of the bam array a little longer as |
3691 | | // sam_dispatcher_write needs to use them for building the index. |
3692 | 0 | gl->bams = gb; |
3693 | 0 | } else { |
3694 | | // Add bam array to free-list |
3695 | 0 | gb->next = fd->bams; |
3696 | 0 | fd->bams = gb; |
3697 | 0 | } |
3698 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3699 | |
|
3700 | 0 | return gl; |
3701 | | |
3702 | 0 | err: |
3703 | | // Possible race between this and fd->curr_bam. |
3704 | | // Easier to not free and leave it on the input list so it |
3705 | | // gets freed there instead? |
3706 | | // sam_free_sp_bams(gb); |
3707 | 0 | if (gl) { |
3708 | 0 | free(gl->data); |
3709 | 0 | free(gl); |
3710 | 0 | } |
3711 | 0 | return NULL; |
3712 | 0 | } |
3713 | | |
3714 | 0 | int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { |
3715 | 0 | if (fp->state) |
3716 | 0 | return 0; |
3717 | | |
3718 | 0 | if (!(fp->state = sam_state_create(fp))) |
3719 | 0 | return -1; |
3720 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3721 | |
|
3722 | 0 | pthread_mutex_init(&fd->lines_m, NULL); |
3723 | 0 | pthread_mutex_init(&fd->command_m, NULL); |
3724 | 0 | pthread_cond_init(&fd->command_c, NULL); |
3725 | 0 | fd->p = p->pool; |
3726 | 0 | int qsize = p->qsize; |
3727 | 0 | if (!qsize) |
3728 | 0 | qsize = 2*hts_tpool_size(fd->p); |
3729 | 0 | fd->q = hts_tpool_process_init(fd->p, qsize, 0); |
3730 | 0 | if (!fd->q) { |
3731 | 0 | sam_state_destroy(fp); |
3732 | 0 | return -1; |
3733 | 0 | } |
3734 | | |
3735 | 0 | if (fp->format.compression == bgzf) |
3736 | 0 | return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); |
3737 | | |
3738 | 0 | return 0; |
3739 | 0 | } |
3740 | | |
3741 | 0 | int sam_set_threads(htsFile *fp, int nthreads) { |
3742 | 0 | if (nthreads <= 0) |
3743 | 0 | return 0; |
3744 | | |
3745 | 0 | htsThreadPool p; |
3746 | 0 | p.pool = hts_tpool_init(nthreads); |
3747 | 0 | p.qsize = nthreads*2; |
3748 | |
|
3749 | 0 | int ret = sam_set_thread_pool(fp, &p); |
3750 | 0 | if (ret < 0) |
3751 | 0 | return ret; |
3752 | | |
3753 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3754 | 0 | fd->own_pool = 1; |
3755 | |
|
3756 | 0 | return 0; |
3757 | 0 | } |
3758 | | |
// Maximum number of aux tag IDs that can be listed for UMIs
#define UMI_TAGS 5

// Per-file state for FASTA / FASTQ handling, stored in htsFile.state.
// Created by fastq_state_init(), configured by fastq_state_set() and
// released by fastq_state_destroy().
typedef struct {
    kstring_t name;         // name line of the current record
    kstring_t comment;      // NB: pointer into name, do not free
    kstring_t seq;          // sequence, joined from one or more lines
    kstring_t qual;         // qualities; fastq_parse1 stores them with '!' subtracted
    int casava;             // set by FASTQ_OPT_CASAVA: look for CASAVA fields in the comment
    int aux;                // set by FASTQ_OPT_AUX: look for SAM style aux tags in the comment
    int rnum;               // set by FASTQ_OPT_RNUM
    char BC[3];             // aux tag ID for barcode
    char UMI[UMI_TAGS][3];  // aux tag list for UMIs.  UMI[0][0] == 0 disables UMI handling
    khash_t(tag) *tags;     // which aux tags to use (if empty, use all).
    char nprefix;           // first character of a name line: '@' (fastq) or '>' (fasta)
    int sra_names;          // set by FASTQ_OPT_NAME2: take the read name from after the first whitespace
    regex_t regex;          // compiled pattern; its first bracketed group locates the UMI in a read name
} fastq_state;
3775 | | |
3776 | | // Initialise fastq state. |
3777 | | // Name char of '@' or '>' distinguishes fastq vs fasta variant |
3778 | 1.89k | static fastq_state *fastq_state_init(int name_char) { |
3779 | 1.89k | fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); |
3780 | 1.89k | if (!x) |
3781 | 0 | return NULL; |
3782 | 1.89k | strcpy(x->BC, "BC"); |
3783 | 1.89k | x->nprefix = name_char; |
3784 | | // Default Illumina naming convention |
3785 | 1.89k | char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)"; |
3786 | 1.89k | if (regcomp(&x->regex, re, REG_EXTENDED) != 0) { |
3787 | 0 | free(x); |
3788 | 0 | return NULL; |
3789 | 0 | } |
3790 | | |
3791 | 1.89k | return x; |
3792 | 1.89k | } |
3793 | | |
3794 | 2.53k | void fastq_state_destroy(htsFile *fp) { |
3795 | 2.53k | if (fp->state) { |
3796 | 1.89k | fastq_state *x = (fastq_state *)fp->state; |
3797 | 1.89k | if (x->tags) |
3798 | 0 | kh_destroy(tag, x->tags); |
3799 | 1.89k | ks_free(&x->name); |
3800 | 1.89k | ks_free(&x->seq); |
3801 | 1.89k | ks_free(&x->qual); |
3802 | 1.89k | regfree(&x->regex); |
3803 | 1.89k | free(fp->state); |
3804 | 1.89k | } |
3805 | 2.53k | } |
3806 | | |
3807 | 0 | int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { |
3808 | 0 | va_list args; |
3809 | |
|
3810 | 0 | if (!fp) |
3811 | 0 | return -1; |
3812 | 0 | if (!fp->state) |
3813 | 0 | if (!(fp->state = fastq_state_init(fp->format.format == fastq_format |
3814 | 0 | ? '@' : '>'))) |
3815 | 0 | return -1; |
3816 | | |
3817 | 0 | fastq_state *x = (fastq_state *)fp->state; |
3818 | |
|
3819 | 0 | switch (opt) { |
3820 | 0 | case FASTQ_OPT_CASAVA: |
3821 | 0 | x->casava = 1; |
3822 | 0 | break; |
3823 | | |
3824 | 0 | case FASTQ_OPT_NAME2: |
3825 | 0 | x->sra_names = 1; |
3826 | 0 | break; |
3827 | | |
3828 | 0 | case FASTQ_OPT_AUX: { |
3829 | 0 | va_start(args, opt); |
3830 | 0 | x->aux = 1; |
3831 | 0 | char *tag = va_arg(args, char *); |
3832 | 0 | va_end(args); |
3833 | 0 | if (tag && strcmp(tag, "1") != 0) { |
3834 | 0 | if (!x->tags) |
3835 | 0 | if (!(x->tags = kh_init(tag))) |
3836 | 0 | return -1; |
3837 | | |
3838 | 0 | size_t i, tlen = strlen(tag); |
3839 | 0 | for (i = 0; i+3 <= tlen+1; i += 3) { |
3840 | 0 | if (tag[i+0] == ',' || tag[i+1] == ',' || |
3841 | 0 | !(tag[i+2] == ',' || tag[i+2] == '\0')) { |
3842 | 0 | hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); |
3843 | 0 | break; |
3844 | 0 | } |
3845 | 0 | int ret, tcode = tag[i+0]*256 + tag[i+1]; |
3846 | 0 | kh_put(tag, x->tags, tcode, &ret); |
3847 | 0 | if (ret < 0) |
3848 | 0 | return -1; |
3849 | 0 | } |
3850 | 0 | } |
3851 | 0 | break; |
3852 | 0 | } |
3853 | | |
3854 | 0 | case FASTQ_OPT_BARCODE: { |
3855 | 0 | va_start(args, opt); |
3856 | 0 | char *bc = va_arg(args, char *); |
3857 | 0 | va_end(args); |
3858 | 0 | strncpy(x->BC, bc, 2); |
3859 | 0 | x->BC[2] = 0; |
3860 | 0 | break; |
3861 | 0 | } |
3862 | | |
3863 | 0 | case FASTQ_OPT_UMI: { |
3864 | | // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0 |
3865 | 0 | va_start(args, opt); |
3866 | 0 | char *bc = va_arg(args, char *), *bc_orig = bc; |
3867 | 0 | va_end(args); |
3868 | 0 | if (!bc || strcmp(bc, "1") == 0) |
3869 | 0 | bc = "RX"; |
3870 | 0 | int ntags = 0, err = 0; |
3871 | 0 | for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) { |
3872 | 0 | if (!isalpha(bc[0]) || !isalnum_c(bc[1])) { |
3873 | 0 | err = 1; |
3874 | 0 | break; |
3875 | 0 | } |
3876 | | |
3877 | 0 | strncpy(x->UMI[ntags], bc, 3); |
3878 | 0 | bc += 2; |
3879 | 0 | if (*bc && *bc != ',') { |
3880 | 0 | err = 1; |
3881 | 0 | break; |
3882 | 0 | } |
3883 | 0 | bc+=(*bc==','); |
3884 | 0 | x->UMI[ntags][2] = 0; |
3885 | 0 | } |
3886 | 0 | for (; ntags < UMI_TAGS; ntags++) |
3887 | 0 | x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0; |
3888 | | |
3889 | |
|
3890 | 0 | if (err) |
3891 | 0 | hts_log_warning("Bad UMI tag list '%s'", bc_orig); |
3892 | |
|
3893 | 0 | break; |
3894 | 0 | } |
3895 | | |
3896 | 0 | case FASTQ_OPT_UMI_REGEX: { |
3897 | 0 | va_start(args, opt); |
3898 | 0 | char *re = va_arg(args, char *); |
3899 | 0 | va_end(args); |
3900 | |
|
3901 | 0 | regfree(&x->regex); |
3902 | 0 | if (regcomp(&x->regex, re, REG_EXTENDED) != 0) { |
3903 | 0 | hts_log_error("Regular expression '%s' is not supported", re); |
3904 | 0 | return -1; |
3905 | 0 | } |
3906 | 0 | break; |
3907 | 0 | } |
3908 | | |
3909 | 0 | case FASTQ_OPT_RNUM: |
3910 | 0 | x->rnum = 1; |
3911 | 0 | break; |
3912 | | |
3913 | 0 | default: |
3914 | 0 | break; |
3915 | 0 | } |
3916 | 0 | return 0; |
3917 | 0 | } |
3918 | | |
// Reads the next FASTA or FASTQ record from fp and converts it into b.
//
// fp->state must already hold a fastq_state.  The name line is split into
// name and comment, sequence (and for FASTQ, quality) lines are gathered,
// and the result is stored with bam_set1() as an unmapped read.  Depending
// on the options in the state, a UMI and/or barcode aux tag and any SAM
// style aux tags found in the comment are added too.
//
// Returns >= 0 on success (the value returned by bam_set1),
//         -1 on EOF,
//         < -1 on error
static int fastq_parse1(htsFile *fp, bam1_t *b) {
    fastq_state *x = (fastq_state *)fp->state;
    size_t i, l;
    int ret = 0;

    if (fp->format.format == fasta_format && fp->line.s) {
        // For FASTA we've already read the >name line; steal it
        // Not the most efficient, but we don't optimise for fasta reading.
        if (fp->line.l == 0)
            return -1; // EOF

        free(x->name.s);
        x->name = fp->line;
        fp->line.l = fp->line.m = 0;
        fp->line.s = NULL;
    } else {
        // Read a FASTQ format entry.
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
        if (ret == -1)
            return -1;  // EOF
        else if (ret < -1)
            return ret; // ERR
    }

    // Name
    if (*x->name.s != x->nprefix)
        return -2;

    // Reverse the SRA strangeness of putting the run_name.number before
    // the read name.
    i = 0;
    char *name = x->name.s+1;
    if (x->sra_names) {
        char *cp = strpbrk(x->name.s, " \t");
        if (cp) {
            while (*cp == ' ' || *cp == '\t')
                cp++;
            // Overwrite the last whitespace character with '@' and restart
            // the name from just after it.
            *--cp = '@';
            i = cp - x->name.s;
            name = cp+1;
        }
    }

    // Terminate the name at the first whitespace; l keeps the full line length
    l = x->name.l;
    char *s = x->name.s;
    while (i < l && !isspace_c(s[i]))
        i++;
    if (i < l) {
        s[i] = 0;
        x->name.l = i++;
    }

    // Comment; a kstring struct, but pointer into name line.  (Do not free)
    while (i < l && isspace_c(s[i]))
        i++;
    x->comment.s = s+i;
    x->comment.l = l - i;

    // Seq
    // Lines are appended until the '+' line (FASTQ) or the next '>' line
    // (FASTA).  Running out of input here is an error for FASTQ but simply
    // ends the record for FASTA.
    x->seq.l = 0;
    for (;;) {
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
            if (fp->format.format == fastq_format || ret < -1)
                return -2;
        if (ret == -1 ||
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
            break;
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
            return -2;
    }

    // Qual
    if (fp->format.format == fastq_format) {
        // Keep reading lines until as many quality values as bases have
        // been seen; a line overshooting that count is an error.
        size_t remainder = x->seq.l;
        x->qual.l = 0;
        do {
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
                return -2;
            if (fp->line.l > remainder)
                return -2;
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
                return -2;
            remainder -= fp->line.l;
        } while (remainder > 0);

        // Decr qual
        for (i = 0; i < x->qual.l; i++)
            x->qual.s[i] -= '!';
    }

    // A trailing "/<digit>" on the name marks the read as paired; it sets
    // the READ1 / READ2 flags and is then removed from the name.
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
    if (x->name.l > 2 &&
        x->name.s[x->name.l-2] == '/' &&
        isdigit_c(x->name.s[x->name.l-1])) {
        switch(x->name.s[x->name.l-1]) {
        case '1': flag |= BAM_FREAD1 | pflag; break;
        case '2': flag |= BAM_FREAD2 | pflag; break;
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }
        x->name.s[x->name.l-=2] = 0;
    }

    // Strip Illumina formatted UMI off read-name
    char UMI_seq[256]; // maximum length in spec
    size_t UMI_len = 0;
    if (x->UMI[0][0]) {
        regmatch_t match[3];
        // The regex is run over the whole name line, prefix character
        // included, so match offsets are relative to x->name.s.
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
            && match[0].rm_so >= 0     // whole regex
            && match[1].rm_so >= 0) {  // bracketted UMI component
            UMI_len = match[1].rm_eo - match[1].rm_so;
            if (UMI_len > 255) {
                // It is the UMI match that is too long for UMI_seq here,
                // whatever the wording of the message.
                hts_log_error("SAM read name is too long");
                return -2;
            }

            // The SAMTags spec recommends (but not requires) separating
            // barcodes with hyphen ('-').
            size_t i;
            for (i = 0; i < UMI_len; i++)
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
                    ? x->name.s[i+match[1].rm_so]
                    : '-';

            // Move any trailing #num earlier in the name
            if (UMI_len) {
                // From here UMI_len includes the terminating NUL
                UMI_seq[UMI_len++] = 0;

                x->name.l = match[1].rm_so;
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
                    x->name.l--; // remove colon too
                char *cp = x->name.s + match[1].rm_eo;
                while (*cp)
                    x->name.s[x->name.l++] = *cp++;
                x->name.s[x->name.l] = 0;
            }
        }
    }

    // Convert to BAM
    ret = bam_set1(b,
                   x->name.s + x->name.l - name, name,
                   flag,
                   -1, -1, 0, // ref '*', pos, mapq,
                   0, NULL,     // no cigar,
                   -1, -1, 0,    // mate
                   x->seq.l, x->seq.s, x->qual.s,
                   0);
    if (ret < 0) return -2;

    // Add UMI tag if removed from read-name above
    if (UMI_len) {
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
            ret = -2;
    }

    // Identify Illumina CASAVA strings.
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
    char *barcode = NULL;
    int barcode_len = 0;
    kstring_t *kc = &x->comment;
    char *endptr;
    if (x->casava &&
        // \d:[YN]:\d+:[ACGTN]+
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
        && *endptr == ':') {

        // read num
        switch(kc->s[0]) {
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }

        if (kc->s[2] == 'Y')
            b->core.flag |= BAM_FQCFAIL;

        // Barcode, maybe numeric in which case we skip it
        if (!isdigit_c(endptr[1])) {
            barcode = endptr+1;
            for (i = barcode - kc->s; i < kc->l; i++)
                if (isspace_c(kc->s[i]))
                    break;

            // NUL terminate the barcode in place; barcode_len counts the NUL
            kc->s[i] = 0;
            barcode_len = i+1-(barcode - kc->s);
        }
    }

    if (ret >= 0 && barcode_len)
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
            ret = -2;

    if (!x->aux)
        return ret;

    // Identify any SAM style aux tags in comments too.
    // NOTE(review): barcode_len is the barcode's length rather than its end
    // offset within the comment -- confirm this is the intended start point.
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
        ret = -2;

    return ret;
}
4122 | | |
4123 | | // Internal component of sam_read1 below |
4124 | 431 | static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
4125 | 431 | int ret = bam_read1(fp->fp.bgzf, b); |
4126 | 431 | if (h && ret >= 0) { |
4127 | 367 | if (b->core.tid >= h->n_targets || b->core.tid < -1 || |
4128 | 355 | b->core.mtid >= h->n_targets || b->core.mtid < -1) { |
4129 | 15 | errno = ERANGE; |
4130 | 15 | return -3; |
4131 | 15 | } |
4132 | 367 | } |
4133 | 416 | return ret; |
4134 | 431 | } |
4135 | | |
4136 | | // Internal component of sam_read1 below |
4137 | 1.17k | static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { |
4138 | 1.17k | int ret = cram_get_bam_seq(fp->fp.cram, b); |
4139 | 1.17k | if (ret < 0) |
4140 | 1.17k | return cram_eof(fp->fp.cram) ? -1 : -2; |
4141 | | |
4142 | 0 | if (bam_tag2cigar(*b, 1, 1) < 0) |
4143 | 0 | return -2; |
4144 | | |
4145 | 0 | return ret; |
4146 | 0 | } |
4147 | | |
// Internal component of sam_read1 below.
// Reads the next SAM record into b.
//
// Three sources are tried in turn:
//  - a line left in fp->line by header parsing;
//  - when fp->state is set (multi-threaded decoding), the arrays of parsed
//    records handed back through the thread pool result queue;
//  - otherwise a line read and parsed directly from the file.
//
// Returns >= 0 on success, -1 on EOF, < -1 on error.
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
    int ret;

    // Consume 1st line after header parsing as it wasn't using peek
    if (fp->line.l != 0) {
        ret = sam_parse1(&fp->line, h, b);
        fp->line.l = 0;
        return ret;
    }

    if (fp->state) {
        SAM_state *fd = (SAM_state *)fp->state;

        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
            // We don't support multi-threaded SAM parsing with seeks yet.
            // Tear down the threaded state, redo the seek and carry on with
            // the single-threaded path below.  fd must not be used after
            // sam_state_destroy(); a failure there is a negated errno value.
            int ret;
            if ((ret = sam_state_destroy(fp)) < 0) {
                errno = -ret;
                return -2;
            }
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
                return -2;
            fp->fp.bgzf->seeked = 0;
            goto err_recover;
        }

        if (!fd->h) {
            // First read: adopt the header and start the reader thread.
            // NOTE(review): fd->h stays set (with its ref_count raised) if
            // either step below fails, so a later call skips this branch
            // without a dispatcher running -- confirm that is acceptable.
            fd->h = h;
            fd->h->ref_count++;
            // Ensure hrecs is initialised now as we don't want multiple
            // threads trying to do this simultaneously.
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
                return -2;

            // We can only do this once we've got a header
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
                               fp) != 0)
                return -2;
            fd->dispatcher_set = 1;
        }

        if (fd->h != h) {
            hts_log_error("SAM multi-threaded decoding does not support changing header");
            return -2;
        }

        // curr_bam / curr_idx track the array of records currently being
        // handed out and the position within it.
        sp_bams *gb = fd->curr_bam;
        if (!gb) {
            if (fd->errcode) {
                // In case reader failed
                errno = fd->errcode;
                return -2;
            }

            pthread_mutex_lock(&fd->command_m);
            int cmd = fd->command;
            pthread_mutex_unlock(&fd->command_m);
            if (cmd == SAM_AT_EOF)
                return -1;

            // Block until the next array of parsed records is available
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
            if (!r)
                return -2;
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
            hts_tpool_delete_result(r, 0);
        }
        if (!gb) {
            // A result with no data is the EOF marker queued by the reader
            // thread.  Remember it so later calls return without waiting.
            pthread_mutex_lock(&fd->command_m);
            fd->command = SAM_AT_EOF;
            pthread_mutex_unlock(&fd->command_m);
            return fd->errcode ? -2 : -1;
        }
        bam1_t *b_array = (bam1_t *)gb->bams;
        if (fd->curr_idx < gb->nbams)
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
                return -2;
        if (fd->curr_idx == gb->nbams) {
            // Array used up: return it to the free-list
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gb;
            pthread_mutex_unlock(&fd->lines_m);

            fd->curr_bam = NULL;
            fd->curr_idx = 0;
            // Consider prefetching next record?  I.e.
            // } else {
            //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
        }

        ret = 0;

    } else {
    err_recover:
        // Single-threaded path; also the target of the seek fallback above
        // and of the retry below.
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
        if (ret < 0) return ret;

        ret = sam_parse1(&fp->line, h, b);
        fp->line.l = 0;
        if (ret < 0) {
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
            // With ignore_sam_err set, skip the bad line and try the next
            if (h && h->ignore_sam_err) goto err_recover;
        }
    }

    return ret;
}
4255 | | |
4256 | | // Returns 0 on success, |
4257 | | // -1 on EOF, |
4258 | | // <-1 on error |
4259 | | int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) |
4260 | 17.6M | { |
4261 | 17.6M | int ret, pass_filter; |
4262 | | |
4263 | 17.6M | do { |
4264 | 17.6M | switch (fp->format.format) { |
4265 | 431 | case bam: |
4266 | 431 | ret = sam_read1_bam(fp, h, b); |
4267 | 431 | break; |
4268 | | |
4269 | 1.17k | case cram: |
4270 | 1.17k | ret = sam_read1_cram(fp, h, &b); |
4271 | 1.17k | break; |
4272 | | |
4273 | 347k | case sam: |
4274 | 347k | ret = sam_read1_sam(fp, h, b); |
4275 | 347k | break; |
4276 | | |
4277 | 17.2M | case fasta_format: |
4278 | 17.2M | case fastq_format: { |
4279 | 17.2M | fastq_state *x = (fastq_state *)fp->state; |
4280 | 17.2M | if (!x) { |
4281 | 1.89k | if (!(fp->state = fastq_state_init(fp->format.format |
4282 | 1.89k | == fastq_format ? '@' : '>'))) |
4283 | 0 | return -2; |
4284 | 1.89k | } |
4285 | | |
4286 | 17.2M | return fastq_parse1(fp, b); |
4287 | 17.2M | } |
4288 | | |
4289 | 0 | case empty_format: |
4290 | 0 | errno = EPIPE; |
4291 | 0 | return -3; |
4292 | | |
4293 | 0 | default: |
4294 | 0 | errno = EFTYPE; |
4295 | 0 | return -3; |
4296 | 17.6M | } |
4297 | | |
4298 | 349k | pass_filter = (ret >= 0 && fp->filter) |
4299 | 349k | ? sam_passes_filter(h, b, fp->filter) |
4300 | 349k | : 1; |
4301 | 349k | } while (pass_filter == 0); |
4302 | | |
4303 | 349k | return pass_filter < 0 ? -2 : ret; |
4304 | 17.6M | } |
4305 | | |
// Write b[k]+33 into a[k] for the first len bytes, i.e. shift raw quality
// values into their printable form.
//
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
// this code isn't vectorised and runs far slower than is necessary (even
// with the restrict keyword being used).
static inline void HTS_OPT3
add33(uint8_t *a, const uint8_t * b, int32_t len) {
    // The loop counter is unsigned, so the bound is compared as unsigned too
    const uint32_t count = (uint32_t) len;
    uint32_t k;

    for (k = 0; k < count; k++) {
        a[k] = b[k] + 33;
    }
}
4315 | | |
4316 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4317 | 5.87M | { |
4318 | 5.87M | int i, r = 0; |
4319 | 5.87M | uint8_t *s, *end; |
4320 | 5.87M | const bam1_core_t *c = &b->core; |
4321 | | |
4322 | 5.87M | if (c->l_qname == 0) |
4323 | 0 | return -1; |
4324 | 5.87M | r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); |
4325 | 5.87M | r |= kputc_('\t', str); // query name |
4326 | 5.87M | r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag |
4327 | 5.87M | if (c->tid >= 0) { // chr |
4328 | 76.2k | r |= kputs(h->target_name[c->tid] , str); |
4329 | 76.2k | r |= kputc_('\t', str); |
4330 | 5.80M | } else r |= kputsn_("*\t", 2, str); |
4331 | 5.87M | r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos |
4332 | 5.87M | r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual |
4333 | 5.87M | if (c->n_cigar) { // cigar |
4334 | 100k | uint32_t *cigar = bam_get_cigar(b); |
4335 | 4.33M | for (i = 0; i < c->n_cigar; ++i) { |
4336 | 4.23M | r |= kputw(bam_cigar_oplen(cigar[i]), str); |
4337 | 4.23M | r |= kputc_(bam_cigar_opchr(cigar[i]), str); |
4338 | 4.23M | } |
4339 | 5.77M | } else r |= kputc_('*', str); |
4340 | 5.87M | r |= kputc_('\t', str); |
4341 | 5.87M | if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr |
4342 | 6.14k | else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str); |
4343 | 5.64k | else { |
4344 | 5.64k | r |= kputs(h->target_name[c->mtid], str); |
4345 | 5.64k | r |= kputc_('\t', str); |
4346 | 5.64k | } |
4347 | 5.87M | r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos |
4348 | 5.87M | r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len |
4349 | 5.87M | if (c->l_qseq) { // seq and qual |
4350 | 175k | uint8_t *s = bam_get_seq(b); |
4351 | 175k | if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; |
4352 | 175k | char *cp = str->s + str->l; |
4353 | | |
4354 | | // Sequence, 2 bases at a time |
4355 | 175k | nibble2base(s, cp, c->l_qseq); |
4356 | 175k | cp[c->l_qseq] = '\t'; |
4357 | 175k | cp += c->l_qseq+1; |
4358 | | |
4359 | | // Quality |
4360 | 175k | s = bam_get_qual(b); |
4361 | 175k | i = 0; |
4362 | 175k | if (s[0] == 0xff) { |
4363 | 175k | cp[i++] = '*'; |
4364 | 175k | } else { |
4365 | 485 | add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33; |
4366 | 485 | i = c->l_qseq; |
4367 | 485 | } |
4368 | 175k | cp[i] = 0; |
4369 | 175k | cp += i; |
4370 | 175k | str->l = cp - str->s; |
4371 | 5.70M | } else r |= kputsn_("*\t*", 3, str); |
4372 | | |
4373 | 5.87M | s = bam_get_aux(b); // aux |
4374 | 5.87M | end = b->data + b->l_data; |
4375 | | |
4376 | 7.17M | while (end - s >= 4) { |
4377 | 1.29M | r |= kputc_('\t', str); |
4378 | 1.29M | if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL) |
4379 | 22 | goto bad_aux; |
4380 | 1.29M | } |
4381 | 5.87M | r |= kputsn("", 0, str); // nul terminate |
4382 | 5.87M | if (r < 0) goto mem_err; |
4383 | | |
4384 | 5.87M | return str->l; |
4385 | | |
4386 | 22 | bad_aux: |
4387 | 22 | hts_log_error("Corrupted aux data for read %.*s flag %d", |
4388 | 22 | b->core.l_qname, bam_get_qname(b), b->core.flag); |
4389 | 22 | errno = EINVAL; |
4390 | 22 | return -1; |
4391 | | |
4392 | 0 | mem_err: |
4393 | 0 | hts_log_error("Out of memory"); |
4394 | 0 | errno = ENOMEM; |
4395 | 0 | return -1; |
4396 | 5.87M | } |
4397 | | |
4398 | | int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4399 | 5.87M | { |
4400 | 5.87M | str->l = 0; |
4401 | 5.87M | return sam_format1_append(h, b, str); |
4402 | 5.87M | } |
4403 | | |
4404 | | static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); |
4405 | | int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) |
4406 | 0 | { |
4407 | 0 | unsigned flag = b->core.flag; |
4408 | 0 | int i, e = 0, len = b->core.l_qseq; |
4409 | 0 | uint8_t *seq, *qual; |
4410 | |
|
4411 | 0 | str->l = 0; |
4412 | | |
4413 | | // Name |
4414 | 0 | if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) |
4415 | 0 | return -1; |
4416 | | |
4417 | | // UMI tag |
4418 | 0 | if (x && *x->UMI[0]) { |
4419 | | // Temporary copy of '#num' if present |
4420 | 0 | char plex[256]; |
4421 | 0 | size_t len = str->l; |
4422 | 0 | while (len && str->s[len] != ':' && str->s[len] != '#') |
4423 | 0 | len--; |
4424 | |
|
4425 | 0 | if (str->s[len] == '#' && str->l - len < 255) { |
4426 | 0 | memcpy(plex, &str->s[len], str->l - len); |
4427 | 0 | plex[str->l - len] = 0; |
4428 | 0 | str->l = len; |
4429 | 0 | } else { |
4430 | 0 | *plex = 0; |
4431 | 0 | } |
4432 | |
|
4433 | 0 | uint8_t *bc = NULL; |
4434 | 0 | int n; |
4435 | 0 | for (n = 0; !bc && n < UMI_TAGS; n++) |
4436 | 0 | bc = bam_aux_get(b, x->UMI[n]); |
4437 | 0 | if (bc && *bc == 'Z') { |
4438 | 0 | int err = kputc(':', str) < 0; |
4439 | | // Replace any non-alpha with '+' |
4440 | 0 | while (*++bc) |
4441 | 0 | err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0; |
4442 | 0 | if (err) |
4443 | 0 | return -1; |
4444 | 0 | } |
4445 | | |
4446 | 0 | if (*plex && kputs(plex, str) < 0) |
4447 | 0 | return -1; |
4448 | 0 | } |
4449 | | |
4450 | | // /1 or /2 suffix |
4451 | 0 | if (x && x->rnum && (flag & BAM_FPAIRED)) { |
4452 | 0 | int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); |
4453 | 0 | if (r12 == BAM_FREAD1) { |
4454 | 0 | if (kputs("/1", str) == EOF) |
4455 | 0 | return -1; |
4456 | 0 | } else if (r12 == BAM_FREAD2) { |
4457 | 0 | if (kputs("/2", str) == EOF) |
4458 | 0 | return -1; |
4459 | 0 | } |
4460 | 0 | } |
4461 | | |
4462 | | // Illumina CASAVA tag. |
4463 | | // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero> |
4464 | 0 | if (x && x->casava) { |
4465 | 0 | int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; |
4466 | 0 | char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; |
4467 | 0 | uint8_t *bc = bam_aux_get(b, x->BC); |
4468 | 0 | if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, |
4469 | 0 | bc ? (char *)bc+1 : "0") < 0) |
4470 | 0 | return -1; |
4471 | | |
4472 | 0 | if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { |
4473 | 0 | hts_log_warning("BC tag starts with non-sequence base; using '0'"); |
4474 | 0 | str->l -= strlen((char *)bc)-2; // limit to 1 char |
4475 | 0 | str->s[str->l-1] = '0'; |
4476 | 0 | str->s[str->l] = 0; |
4477 | 0 | bc = NULL; |
4478 | 0 | } |
4479 | | |
4480 | | // Replace any non-alpha with '+'. Ie seq-seq to seq+seq |
4481 | 0 | if (bc) { |
4482 | 0 | int l = strlen((char *)bc+1); |
4483 | 0 | char *c = (char *)str->s + str->l - l; |
4484 | 0 | for (i = 0; i < l; i++) { |
4485 | 0 | if (!isalpha_c(c[i])) |
4486 | 0 | c[i] = '+'; |
4487 | 0 | else if (islower_c(c[i])) |
4488 | 0 | c[i] = toupper_c(c[i]); |
4489 | 0 | } |
4490 | 0 | } |
4491 | 0 | } |
4492 | | |
4493 | | // Aux tags |
4494 | 0 | if (x && x->aux) { |
4495 | 0 | uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; |
4496 | 0 | while (s && end - s >= 4) { |
4497 | 0 | int tt = s[0]*256 + s[1]; |
4498 | 0 | if (x->tags == NULL || |
4499 | 0 | kh_get(tag, x->tags, tt) != kh_end(x->tags)) { |
4500 | 0 | e |= kputc_('\t', str) < 0; |
4501 | 0 | if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) |
4502 | 0 | return -1; |
4503 | 0 | } else { |
4504 | 0 | s = skip_aux(s+2, end); |
4505 | 0 | } |
4506 | 0 | } |
4507 | 0 | e |= kputsn("", 0, str) < 0; // nul terminate |
4508 | 0 | } |
4509 | | |
4510 | 0 | if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; |
4511 | 0 | e |= kputc_('\n', str) < 0; |
4512 | | |
4513 | | // Seq line |
4514 | 0 | seq = bam_get_seq(b); |
4515 | 0 | if (flag & BAM_FREVERSE) |
4516 | 0 | for (i = len-1; i >= 0; i--) |
4517 | 0 | e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; |
4518 | 0 | else |
4519 | 0 | for (i = 0; i < len; i++) |
4520 | 0 | e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; |
4521 | | |
4522 | | |
4523 | | // Qual line |
4524 | 0 | if (x->nprefix == '@') { |
4525 | 0 | kputsn("\n+\n", 3, str); |
4526 | 0 | qual = bam_get_qual(b); |
4527 | 0 | if (qual[0] == 0xff) |
4528 | 0 | for (i = 0; i < len; i++) |
4529 | 0 | e |= kputc_('B', str) < 0; |
4530 | 0 | else if (flag & BAM_FREVERSE) |
4531 | 0 | for (i = len-1; i >= 0; i--) |
4532 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4533 | 0 | else |
4534 | 0 | for (i = 0; i < len; i++) |
4535 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4536 | |
|
4537 | 0 | } |
4538 | 0 | e |= kputc('\n', str) < 0; |
4539 | |
|
4540 | 0 | return e ? -1 : str->l; |
4541 | 0 | } |
4542 | | |
4543 | | // Sadly we need to be able to modify the bam_hdr here so we can |
4544 | | // reference count the structure. |
4545 | | int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) |
4546 | 17.6M | { |
4547 | 17.6M | switch (fp->format.format) { |
4548 | 0 | case binary_format: |
4549 | 0 | fp->format.category = sequence_data; |
4550 | 0 | fp->format.format = bam; |
4551 | | /* fall-through */ |
4552 | 5.87M | case bam: |
4553 | 5.87M | return bam_write_idx1(fp, h, b); |
4554 | | |
4555 | 5.87M | case cram: |
4556 | 5.87M | return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); |
4557 | | |
4558 | 0 | case text_format: |
4559 | 0 | fp->format.category = sequence_data; |
4560 | 0 | fp->format.format = sam; |
4561 | | /* fall-through */ |
4562 | 5.87M | case sam: |
4563 | 5.87M | if (fp->state) { |
4564 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4565 | | |
4566 | | // Threaded output |
4567 | 0 | if (!fd->h) { |
4568 | | // NB: discard const. We don't actually modify sam_hdr_t here, |
4569 | | // just data pointed to by it (which is a bit weasely still), |
4570 | | // but out cached pointer must be non-const as we want to |
4571 | | // destroy it later on and sam_hdr_destroy takes non-const. |
4572 | | // |
4573 | | // We do this because some tools do sam_hdr_destroy; sam_close |
4574 | | // while others do sam_close; sam_hdr_destroy. The former is |
4575 | | // an issue as we need the header still when flushing. |
4576 | 0 | fd->h = (sam_hdr_t *)h; |
4577 | 0 | fd->h->ref_count++; |
4578 | |
|
4579 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, |
4580 | 0 | fp) != 0) |
4581 | 0 | return -2; |
4582 | 0 | fd->dispatcher_set = 1; |
4583 | 0 | } |
4584 | | |
4585 | 0 | if (fd->h != h) { |
4586 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4587 | 0 | return -2; |
4588 | 0 | } |
4589 | | |
4590 | | // Find a suitable BAM array to copy to |
4591 | 0 | sp_bams *gb = fd->curr_bam; |
4592 | 0 | if (!gb) { |
4593 | 0 | pthread_mutex_lock(&fd->lines_m); |
4594 | 0 | if (fd->bams) { |
4595 | 0 | fd->curr_bam = gb = fd->bams; |
4596 | 0 | fd->bams = gb->next; |
4597 | 0 | gb->next = NULL; |
4598 | 0 | gb->nbams = 0; |
4599 | 0 | gb->bam_mem = 0; |
4600 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4601 | 0 | } else { |
4602 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4603 | 0 | if (!(gb = calloc(1, sizeof(*gb)))) return -1; |
4604 | 0 | if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { |
4605 | 0 | free(gb); |
4606 | 0 | return -1; |
4607 | 0 | } |
4608 | 0 | gb->nbams = 0; |
4609 | 0 | gb->abams = SAM_NBAM; |
4610 | 0 | gb->bam_mem = 0; |
4611 | 0 | gb->fd = fd; |
4612 | 0 | fd->curr_idx = 0; |
4613 | 0 | fd->curr_bam = gb; |
4614 | 0 | } |
4615 | 0 | } |
4616 | | |
4617 | 0 | if (!bam_copy1(&gb->bams[gb->nbams++], b)) |
4618 | 0 | return -2; |
4619 | 0 | gb->bam_mem += b->l_data + sizeof(*b); |
4620 | | |
4621 | | // Dispatch if full |
4622 | 0 | if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { |
4623 | 0 | gb->serial = fd->serial++; |
4624 | 0 | pthread_mutex_lock(&fd->command_m); |
4625 | 0 | if (fd->errcode != 0) { |
4626 | 0 | pthread_mutex_unlock(&fd->command_m); |
4627 | 0 | return -fd->errcode; |
4628 | 0 | } |
4629 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb, |
4630 | 0 | cleanup_sp_bams, |
4631 | 0 | cleanup_sp_lines, 0) < 0) { |
4632 | 0 | pthread_mutex_unlock(&fd->command_m); |
4633 | 0 | return -1; |
4634 | 0 | } |
4635 | 0 | pthread_mutex_unlock(&fd->command_m); |
4636 | 0 | fd->curr_bam = NULL; |
4637 | 0 | } |
4638 | | |
4639 | | // Dummy value as we don't know how long it really is. |
4640 | | // We could track file sizes via a SAM_state field, but I don't think |
4641 | | // it is necessary. |
4642 | 0 | return 1; |
4643 | 5.87M | } else { |
4644 | 5.87M | if (sam_format1(h, b, &fp->line) < 0) return -1; |
4645 | 5.87M | kputc('\n', &fp->line); |
4646 | 5.87M | if (fp->is_bgzf) { |
4647 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4648 | 0 | return -1; |
4649 | 0 | if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4650 | 5.87M | } else { |
4651 | 5.87M | if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4652 | 5.87M | } |
4653 | | |
4654 | 5.87M | if (fp->idx) { |
4655 | 0 | if (fp->format.compression == bgzf) { |
4656 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4657 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4658 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4659 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4660 | 0 | return -1; |
4661 | 0 | } |
4662 | 0 | } else { |
4663 | 0 | if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4664 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4665 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4666 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4667 | 0 | return -1; |
4668 | 0 | } |
4669 | 0 | } |
4670 | 0 | } |
4671 | | |
4672 | 5.87M | return fp->line.l; |
4673 | 5.87M | } |
4674 | | |
4675 | | |
4676 | 0 | case fasta_format: |
4677 | 0 | case fastq_format: { |
4678 | 0 | fastq_state *x = (fastq_state *)fp->state; |
4679 | 0 | if (!x) { |
4680 | 0 | if (!(fp->state = fastq_state_init(fp->format.format |
4681 | 0 | == fastq_format ? '@' : '>'))) |
4682 | 0 | return -2; |
4683 | 0 | } |
4684 | | |
4685 | 0 | if (fastq_format1(fp->state, b, &fp->line) < 0) |
4686 | 0 | return -1; |
4687 | 0 | if (fp->is_bgzf) { |
4688 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4689 | 0 | return -1; |
4690 | 0 | if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) |
4691 | 0 | return -1; |
4692 | 0 | } else { |
4693 | 0 | if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) |
4694 | 0 | return -1; |
4695 | 0 | } |
4696 | 0 | return fp->line.l; |
4697 | 0 | } |
4698 | | |
4699 | 0 | default: |
4700 | 0 | errno = EBADF; |
4701 | 0 | return -1; |
4702 | 17.6M | } |
4703 | 17.6M | } |
4704 | | |
4705 | | /************************ |
4706 | | *** Auxiliary fields *** |
4707 | | ************************/ |
4708 | | #ifndef HTS_LITTLE_ENDIAN |
// Copy len bytes of in[] to out[] as a run of type_t values, storing each one
// with store_le.  Uses (and advances) the in/out/len names of the function it
// is expanded in.
#define aux_val_to_le(type_t, store_le) do {                            \
        type_t v;                                                       \
        size_t i;                                                       \
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
            memcpy(&v, in + i, sizeof(type_t));                         \
            store_le(v, out);                                           \
        }                                                               \
    } while (0)

// Convert len bytes of host-order aux data of the given type into the
// little-endian form stored in the record, writing the result to out.
//
// Returns 0 on success,
//        -1 if the type is not recognised, len is not a whole number of
//           elements, or a 'B' array is too short to hold its header
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    const int elem_sz = aux_type2size(type);

    // Multi-byte values must fill the buffer with whole elements
    if (elem_sz >= 2 && elem_sz <= 8 && (len & (elem_sz - 1)) != 0)
        return -1;

    if (elem_sz == 'B') {
        // Array: sub-type byte, 32-bit count, then the elements, which are
        // converted by recursing with the sub-type.
        uint32_t count;
        if (len < 5)
            return -1;
        memcpy(&count, in + 1, 4);
        out[0] = in[0];
        u32_to_le(count, out + 1);
        return aux_to_le(in[0], out + 5, in + 5, len - 5);
    }

    switch (elem_sz) {
    case 1: case 'H': case 'Z':  // Byte data needs no swapping
        memcpy(out, in, len);
        return 0;

    case 2:
        aux_val_to_le(uint16_t, u16_to_le);
        return 0;

    case 4:
        aux_val_to_le(uint32_t, u32_to_le);
        return 0;

    case 8:
        aux_val_to_le(uint64_t, u64_to_le);
        return 0;

    default:  // Unknown type code
        return -1;
    }
}

#undef aux_val_to_le
4751 | | #endif |
4752 | | |
4753 | | int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data) |
4754 | 0 | { |
4755 | 0 | uint32_t new_len; |
4756 | |
|
4757 | 0 | assert(b->l_data >= 0); |
4758 | 0 | new_len = b->l_data + 3 + len; |
4759 | 0 | if (new_len > INT32_MAX || new_len < b->l_data) goto nomem; |
4760 | | |
4761 | 0 | if (realloc_bam_data(b, new_len) < 0) return -1; |
4762 | | |
4763 | 0 | b->data[b->l_data] = tag[0]; |
4764 | 0 | b->data[b->l_data + 1] = tag[1]; |
4765 | 0 | b->data[b->l_data + 2] = type; |
4766 | |
|
4767 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4768 | 0 | memcpy(b->data + b->l_data + 3, data, len); |
4769 | | #else |
4770 | | if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) { |
4771 | | errno = EINVAL; |
4772 | | return -1; |
4773 | | } |
4774 | | #endif |
4775 | |
|
4776 | 0 | b->l_data = new_len; |
4777 | |
|
4778 | 0 | return 0; |
4779 | | |
4780 | 0 | nomem: |
4781 | 0 | errno = ENOMEM; |
4782 | 0 | return -1; |
4783 | 0 | } |
4784 | | |
// Step over one aux value.  s points at the value's type byte and end is one
// past the last byte of the record's data.
//
// Returns a pointer to the byte following the value,
//         end if s is already at/after end or a 'Z'/'H' string has no
//             terminating NUL before end,
//         NULL if the type (or a 'B' array's element type) has a size of 0
//              or the value would run past end
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        s = memchr(s, 0, end-s);
        return s ? s+1 : end;
    case 'B': {
        uint64_t bytes;
        // Need the element type byte plus the 32-bit element count
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        if (size == 0) return NULL;
        // Multiply in 64 bits: in 32 bits size * n can wrap, letting an
        // oversized count pass the bounds check below.
        bytes = (uint64_t) size * n;
        if ((uint64_t) (end - s) < bytes) return NULL;
        return s + bytes;
    }
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4810 | | |
4811 | | uint8_t *bam_aux_first(const bam1_t *b) |
4812 | 6.04M | { |
4813 | 6.04M | uint8_t *s = bam_get_aux(b); |
4814 | 6.04M | uint8_t *end = b->data + b->l_data; |
4815 | 6.04M | if (end - s <= 2) { errno = ENOENT; return NULL; } |
4816 | 241k | return s+2; |
4817 | 6.04M | } |
4818 | | |
4819 | | uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) |
4820 | 2.82M | { |
4821 | 2.82M | uint8_t *end = b->data + b->l_data; |
4822 | 2.82M | uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; |
4823 | 2.82M | if (next == NULL) goto bad_aux; |
4824 | 2.82M | if (end - next <= 2) { errno = ENOENT; return NULL; } |
4825 | 2.65M | return next+2; |
4826 | | |
4827 | 84 | bad_aux: |
4828 | 84 | hts_log_error("Corrupted aux data for read %s flag %d", |
4829 | 84 | bam_get_qname(b), b->core.flag); |
4830 | 84 | errno = EINVAL; |
4831 | 84 | return NULL; |
4832 | 2.82M | } |
4833 | | |
4834 | | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) |
4835 | 6.04M | { |
4836 | 6.04M | uint8_t *s; |
4837 | 8.87M | for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) |
4838 | 2.89M | if (s[-2] == tag[0] && s[-1] == tag[1]) { |
4839 | | // Check the tag value is valid and complete |
4840 | 73.6k | uint8_t *e = skip_aux(s, b->data + b->l_data); |
4841 | 73.6k | if (e == NULL) goto bad_aux; |
4842 | 73.6k | if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; |
4843 | | |
4844 | 73.6k | return s; |
4845 | 73.6k | } |
4846 | | |
4847 | | // errno now as set by bam_aux_first()/bam_aux_next() |
4848 | 5.97M | return NULL; |
4849 | | |
4850 | 0 | bad_aux: |
4851 | 0 | hts_log_error("Corrupted aux data for read %s flag %d", |
4852 | 0 | bam_get_qname(b), b->core.flag); |
4853 | 0 | errno = EINVAL; |
4854 | 0 | return NULL; |
4855 | 6.04M | } |
4856 | | |
4857 | | int bam_aux_del(bam1_t *b, uint8_t *s) |
4858 | 0 | { |
4859 | 0 | s = bam_aux_remove(b, s); |
4860 | 0 | return (s || errno == ENOENT)? 0 : -1; |
4861 | 0 | } |
4862 | | |
4863 | | uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) |
4864 | 0 | { |
4865 | 0 | uint8_t *end = b->data + b->l_data; |
4866 | 0 | uint8_t *next = skip_aux(s, end); |
4867 | 0 | if (next == NULL) goto bad_aux; |
4868 | | |
4869 | 0 | b->l_data -= next - (s-2); |
4870 | 0 | if (next >= end) { errno = ENOENT; return NULL; } |
4871 | | |
4872 | 0 | memmove(s-2, next, end - next); |
4873 | 0 | return s; |
4874 | | |
4875 | 0 | bad_aux: |
4876 | 0 | hts_log_error("Corrupted aux data for read %s flag %d", |
4877 | 0 | bam_get_qname(b), b->core.flag); |
4878 | 0 | errno = EINVAL; |
4879 | 0 | return NULL; |
4880 | 0 | } |
4881 | | |
4882 | | int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) |
4883 | 0 | { |
4884 | | // FIXME: This is not at all efficient! |
4885 | 0 | size_t ln = len >= 0 ? len : strlen(data) + 1; |
4886 | 0 | size_t old_ln = 0; |
4887 | 0 | int need_nul = ln == 0 || data[ln - 1] != '\0'; |
4888 | 0 | int save_errno = errno; |
4889 | 0 | int new_tag = 0; |
4890 | 0 | uint8_t *s = bam_aux_get(b,tag), *e; |
4891 | |
|
4892 | 0 | if (s) { // Replacing existing tag |
4893 | 0 | char type = *s; |
4894 | 0 | if (type != 'Z') { |
4895 | 0 | hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type); |
4896 | 0 | errno = EINVAL; |
4897 | 0 | return -1; |
4898 | 0 | } |
4899 | 0 | s++; |
4900 | 0 | e = memchr(s, '\0', b->data + b->l_data - s); |
4901 | 0 | old_ln = (e ? e - s : b->data + b->l_data - s) + 1; |
4902 | 0 | s -= 3; |
4903 | 0 | } else { |
4904 | 0 | if (errno != ENOENT) { // Invalid aux data, give up |
4905 | 0 | return -1; |
4906 | 0 | } else { // Tag doesn't exist - put it on the end |
4907 | 0 | errno = save_errno; |
4908 | 0 | s = b->data + b->l_data; |
4909 | 0 | new_tag = 3; |
4910 | 0 | } |
4911 | 0 | } |
4912 | | |
4913 | 0 | if (old_ln < ln + need_nul + new_tag) { |
4914 | 0 | ptrdiff_t s_offset = s - b->data; |
4915 | 0 | if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0) |
4916 | 0 | return -1; |
4917 | 0 | s = b->data + s_offset; |
4918 | 0 | } |
4919 | 0 | if (!new_tag) { |
4920 | 0 | memmove(s + 3 + ln + need_nul, |
4921 | 0 | s + 3 + old_ln, |
4922 | 0 | b->l_data - (s + 3 - b->data) - old_ln); |
4923 | 0 | } |
4924 | 0 | b->l_data += new_tag + ln + need_nul - old_ln; |
4925 | |
|
4926 | 0 | s[0] = tag[0]; |
4927 | 0 | s[1] = tag[1]; |
4928 | 0 | s[2] = 'Z'; |
4929 | 0 | memmove(s+3,data,ln); |
4930 | 0 | if (need_nul) s[3 + ln] = '\0'; |
4931 | 0 | return 0; |
4932 | 0 | } |
4933 | | |
4934 | | int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val) |
4935 | 0 | { |
4936 | 0 | uint32_t sz, old_sz = 0, new = 0; |
4937 | 0 | uint8_t *s, type; |
4938 | |
|
4939 | 0 | if (val < INT32_MIN || val > UINT32_MAX) { |
4940 | 0 | errno = EOVERFLOW; |
4941 | 0 | return -1; |
4942 | 0 | } |
4943 | 0 | if (val < INT16_MIN) { type = 'i'; sz = 4; } |
4944 | 0 | else if (val < INT8_MIN) { type = 's'; sz = 2; } |
4945 | 0 | else if (val < 0) { type = 'c'; sz = 1; } |
4946 | 0 | else if (val < UINT8_MAX) { type = 'C'; sz = 1; } |
4947 | 0 | else if (val < UINT16_MAX) { type = 'S'; sz = 2; } |
4948 | 0 | else { type = 'I'; sz = 4; } |
4949 | |
|
4950 | 0 | s = bam_aux_get(b, tag); |
4951 | 0 | if (s) { // Tag present - how big was the old one? |
4952 | 0 | switch (*s) { |
4953 | 0 | case 'c': case 'C': old_sz = 1; break; |
4954 | 0 | case 's': case 'S': old_sz = 2; break; |
4955 | 0 | case 'i': case 'I': old_sz = 4; break; |
4956 | 0 | default: errno = EINVAL; return -1; // Not an integer |
4957 | 0 | } |
4958 | 0 | } else { |
4959 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
4960 | 0 | s = b->data + b->l_data; |
4961 | 0 | new = 1; |
4962 | 0 | } else { // Invalid aux data, give up. |
4963 | 0 | return -1; |
4964 | 0 | } |
4965 | 0 | } |
4966 | | |
4967 | 0 | if (new || old_sz < sz) { |
4968 | | // Make room for new tag |
4969 | 0 | ptrdiff_t s_offset = s - b->data; |
4970 | 0 | if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0) |
4971 | 0 | return -1; |
4972 | 0 | s = b->data + s_offset; |
4973 | 0 | if (new) { // Add tag id |
4974 | 0 | *s++ = tag[0]; |
4975 | 0 | *s++ = tag[1]; |
4976 | 0 | } else { // Shift following data so we have space |
4977 | 0 | memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz); |
4978 | 0 | } |
4979 | 0 | } else { |
4980 | | // Reuse old space. Data value may be bigger than necessary but |
4981 | | // we avoid having to move everything else |
4982 | 0 | sz = old_sz; |
4983 | 0 | type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz]; |
4984 | 0 | assert(type > 0); |
4985 | 0 | } |
4986 | 0 | *s++ = type; |
4987 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4988 | 0 | memcpy(s, &val, sz); |
4989 | | #else |
4990 | | switch (sz) { |
4991 | | case 4: u32_to_le(val, s); break; |
4992 | | case 2: u16_to_le(val, s); break; |
4993 | | default: *s = val; break; |
4994 | | } |
4995 | | #endif |
4996 | 0 | b->l_data += (new ? 3 : 0) + sz - old_sz; |
4997 | 0 | return 0; |
4998 | 0 | } |
4999 | | |
5000 | | int bam_aux_update_float(bam1_t *b, const char tag[2], float val) |
5001 | 0 | { |
5002 | 0 | uint8_t *s = bam_aux_get(b, tag); |
5003 | 0 | int shrink = 0, new = 0; |
5004 | |
|
5005 | 0 | if (s) { // Tag present - what was it? |
5006 | 0 | switch (*s) { |
5007 | 0 | case 'f': break; |
5008 | 0 | case 'd': shrink = 1; break; |
5009 | 0 | default: errno = EINVAL; return -1; // Not a float |
5010 | 0 | } |
5011 | 0 | } else { |
5012 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5013 | 0 | new = 1; |
5014 | 0 | } else { // Invalid aux data, give up. |
5015 | 0 | return -1; |
5016 | 0 | } |
5017 | 0 | } |
5018 | | |
5019 | 0 | if (new) { // Ensure there's room |
5020 | 0 | if (possibly_expand_bam_data(b, 3 + 4) < 0) |
5021 | 0 | return -1; |
5022 | 0 | s = b->data + b->l_data; |
5023 | 0 | *s++ = tag[0]; |
5024 | 0 | *s++ = tag[1]; |
5025 | 0 | } else if (shrink) { // Convert non-standard double tag to float |
5026 | 0 | memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data)); |
5027 | 0 | b->l_data -= 4; |
5028 | 0 | } |
5029 | 0 | *s++ = 'f'; |
5030 | 0 | float_to_le(val, s); |
5031 | 0 | if (new) b->l_data += 7; |
5032 | |
|
5033 | 0 | return 0; |
5034 | 0 | } |
5035 | | |
5036 | | int bam_aux_update_array(bam1_t *b, const char tag[2], |
5037 | | uint8_t type, uint32_t items, void *data) |
5038 | 0 | { |
5039 | 0 | uint8_t *s = bam_aux_get(b, tag); |
5040 | 0 | size_t old_sz = 0, new_sz; |
5041 | 0 | int new = 0; |
5042 | |
|
5043 | 0 | if (s) { // Tag present |
5044 | 0 | if (*s != 'B') { errno = EINVAL; return -1; } |
5045 | 0 | old_sz = aux_type2size(s[1]); |
5046 | 0 | if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; } |
5047 | 0 | old_sz *= le_to_u32(s + 2); |
5048 | 0 | } else { |
5049 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5050 | 0 | s = b->data + b->l_data; |
5051 | 0 | new = 1; |
5052 | 0 | } else { // Invalid aux data, give up. |
5053 | 0 | return -1; |
5054 | 0 | } |
5055 | 0 | } |
5056 | | |
5057 | 0 | new_sz = aux_type2size(type); |
5058 | 0 | if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; } |
5059 | 0 | if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; } |
5060 | 0 | new_sz *= items; |
5061 | |
|
5062 | 0 | if (new || old_sz < new_sz) { |
5063 | | // Make room for new tag |
5064 | 0 | ptrdiff_t s_offset = s - b->data; |
5065 | 0 | if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0) |
5066 | 0 | return -1; |
5067 | 0 | s = b->data + s_offset; |
5068 | 0 | } |
5069 | 0 | if (new) { // Add tag id and type |
5070 | 0 | *s++ = tag[0]; |
5071 | 0 | *s++ = tag[1]; |
5072 | 0 | *s = 'B'; |
5073 | 0 | b->l_data += 8 + new_sz; |
5074 | 0 | } else if (old_sz != new_sz) { // shift following data if necessary |
5075 | 0 | memmove(s + 6 + new_sz, s + 6 + old_sz, |
5076 | 0 | b->l_data - ((s + 6 + old_sz) - b->data)); |
5077 | 0 | b->l_data -= old_sz; |
5078 | 0 | b->l_data += new_sz; |
5079 | 0 | } |
5080 | |
|
5081 | 0 | s[1] = type; |
5082 | 0 | u32_to_le(items, s + 2); |
5083 | 0 | if (new_sz > 0) { |
5084 | 0 | #ifdef HTS_LITTLE_ENDIAN |
5085 | 0 | memcpy(s + 6, data, new_sz); |
5086 | | #else |
5087 | | return aux_to_le(type, s + 6, data, new_sz); |
5088 | | #endif |
5089 | 0 | } |
5090 | 0 | return 0; |
5091 | 0 | } |
5092 | | |
// Decode element idx of a run of little-endian integers of the given aux
// type starting at s.
//
// Returns the value widened to int64_t, or 0 with errno set to EINVAL if
//         type is not one of the integer codes c, C, s, S, i, I.
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    errno = EINVAL;
    return 0;
}
5108 | | |
// Read an integer aux value.  s points at the field's type byte.
//
// Returns the value, or 0 with errno set to EINVAL if the field is not an
//         integer type.
int64_t bam_aux2i(const uint8_t *s)
{
    const int type = s[0];
    const uint8_t *value = s + 1;

    return get_int_aux_val(type, value, 0);
}
5115 | | |
// Read a numeric aux value as a double.  s points at the field's type byte.
// 'd' and 'f' fields are decoded as floating point; anything else is passed
// to the integer decoder.
//
// Returns the value, or 0 with errno set to EINVAL if the field is neither
//         floating point nor an integer type.
double bam_aux2f(const uint8_t *s)
{
    const int type = s[0];
    const uint8_t *value = s + 1;

    switch (type) {
    case 'd':
        return le_to_double(value);
    case 'f':
        return le_to_float(value);
    default:
        return get_int_aux_val(type, value, 0);
    }
}
5124 | | |
// Read a single-character ('A') aux value.  s points at the field's type
// byte.
//
// Returns the character, or 0 with errno set to EINVAL if the field is not
//         of type 'A'.
char bam_aux2A(const uint8_t *s)
{
    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }
    return *(char*)(s + 1);
}
5133 | | |
// Get the text of a 'Z' or 'H' aux value.  s points at the field's type
// byte.
//
// Returns a pointer to the first character of the value (inside the record,
//         not a copy), or NULL with errno set to EINVAL if the field is
//         neither 'Z' nor 'H'.
char *bam_aux2Z(const uint8_t *s)
{
    const int type = s[0];

    if (type != 'Z' && type != 'H') {
        errno = EINVAL;
        return 0;
    }
    return (char*)(s + 1);
}
5142 | | |
// Get the element count of a 'B' (array) aux value.  s points at the
// field's type byte; the count is the 32-bit value stored two bytes later.
//
// Returns the count, or 0 with errno set to EINVAL if the field is not an
//         array.
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);

    errno = EINVAL;
    return 0;
}
5151 | | |
// Read element idx of a 'B' (array) aux value as an integer.  s points at
// the field's type byte.
//
// Returns the element, or 0 with errno set to ERANGE if idx is not below the
//         array length (which is also the outcome for a non-array field,
//         whose length is reported as 0), or to EINVAL if the element type
//         is not an integer.
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    const uint32_t count = bam_auxB_len(s);

    if (idx < count)
        return get_int_aux_val(s[1], s + 6, idx);  // elements start at s + 6

    errno = ERANGE;
    return 0;
}
5161 | | |
// Read element idx of a 'B' (array) aux value as a double.  s points at the
// field's type byte.  'f' arrays are decoded as floats; any other element
// type is passed to the integer decoder.
//
// Returns the element, or 0.0 with errno set to ERANGE if idx is not below
//         the array length, or 0 with errno set to EINVAL if the element
//         type is neither 'f' nor an integer.
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    const uint32_t count = bam_auxB_len(s);

    if (idx >= count) {
        errno = ERANGE;
        return 0.0;
    }

    if (s[1] != 'f')
        return get_int_aux_val(s[1], s + 6, idx);

    return le_to_float(s + 6 + 4 * idx);
}
5172 | | |
5173 | | int sam_open_mode(char *mode, const char *fn, const char *format) |
5174 | 0 | { |
5175 | | // TODO Parse "bam5" etc for compression level |
5176 | 0 | if (format == NULL) { |
5177 | | // Try to pick a format based on the filename extension |
5178 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5179 | 0 | if (find_file_extension(fn, extension) < 0) return -1; |
5180 | 0 | return sam_open_mode(mode, fn, extension); |
5181 | 0 | } |
5182 | 0 | else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b"); |
5183 | 0 | else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c"); |
5184 | 0 | else if (strcasecmp(format, "sam") == 0) strcpy(mode, ""); |
5185 | 0 | else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); |
5186 | 0 | else if (strcasecmp(format, "fastq") == 0 || |
5187 | 0 | strcasecmp(format, "fq") == 0) strcpy(mode, "f"); |
5188 | 0 | else if (strcasecmp(format, "fastq.gz") == 0 || |
5189 | 0 | strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz"); |
5190 | 0 | else if (strcasecmp(format, "fasta") == 0 || |
5191 | 0 | strcasecmp(format, "fa") == 0) strcpy(mode, "F"); |
5192 | 0 | else if (strcasecmp(format, "fasta.gz") == 0 || |
5193 | 0 | strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz"); |
5194 | 0 | else return -1; |
5195 | | |
5196 | 0 | return 0; |
5197 | 0 | } |
5198 | | |
5199 | | // A version of sam_open_mode that can handle ,key=value options. |
5200 | | // The format string is allocated and returned, to be freed by the caller. |
5201 | | // Prefix should be "r" or "w", |
5202 | | char *sam_open_mode_opts(const char *fn, |
5203 | | const char *mode, |
5204 | | const char *format) |
5205 | 0 | { |
5206 | 0 | char *mode_opts = malloc((format ? strlen(format) : 1) + |
5207 | 0 | (mode ? strlen(mode) : 1) + 12); |
5208 | 0 | char *opts, *cp; |
5209 | 0 | int format_len; |
5210 | |
|
5211 | 0 | if (!mode_opts) |
5212 | 0 | return NULL; |
5213 | | |
5214 | 0 | strcpy(mode_opts, mode ? mode : "r"); |
5215 | 0 | cp = mode_opts + strlen(mode_opts); |
5216 | |
|
5217 | 0 | if (format == NULL) { |
5218 | | // Try to pick a format based on the filename extension |
5219 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5220 | 0 | if (find_file_extension(fn, extension) < 0) { |
5221 | 0 | free(mode_opts); |
5222 | 0 | return NULL; |
5223 | 0 | } |
5224 | 0 | if (sam_open_mode(cp, fn, extension) == 0) { |
5225 | 0 | return mode_opts; |
5226 | 0 | } else { |
5227 | 0 | free(mode_opts); |
5228 | 0 | return NULL; |
5229 | 0 | } |
5230 | 0 | } |
5231 | | |
5232 | 0 | if ((opts = strchr(format, ','))) { |
5233 | 0 | format_len = opts-format; |
5234 | 0 | } else { |
5235 | 0 | opts=""; |
5236 | 0 | format_len = strlen(format); |
5237 | 0 | } |
5238 | |
|
5239 | 0 | if (strncmp(format, "bam", format_len) == 0) { |
5240 | 0 | *cp++ = 'b'; |
5241 | 0 | } else if (strncmp(format, "cram", format_len) == 0) { |
5242 | 0 | *cp++ = 'c'; |
5243 | 0 | } else if (strncmp(format, "cram2", format_len) == 0) { |
5244 | 0 | *cp++ = 'c'; |
5245 | 0 | strcpy(cp, ",VERSION=2.1"); |
5246 | 0 | cp += 12; |
5247 | 0 | } else if (strncmp(format, "cram3", format_len) == 0) { |
5248 | 0 | *cp++ = 'c'; |
5249 | 0 | strcpy(cp, ",VERSION=3.0"); |
5250 | 0 | cp += 12; |
5251 | 0 | } else if (strncmp(format, "sam", format_len) == 0) { |
5252 | 0 | ; // format mode="" |
5253 | 0 | } else if (strncmp(format, "sam.gz", format_len) == 0) { |
5254 | 0 | *cp++ = 'z'; |
5255 | 0 | } else if (strncmp(format, "fastq", format_len) == 0 || |
5256 | 0 | strncmp(format, "fq", format_len) == 0) { |
5257 | 0 | *cp++ = 'f'; |
5258 | 0 | } else if (strncmp(format, "fastq.gz", format_len) == 0 || |
5259 | 0 | strncmp(format, "fq.gz", format_len) == 0) { |
5260 | 0 | *cp++ = 'f'; |
5261 | 0 | *cp++ = 'z'; |
5262 | 0 | } else if (strncmp(format, "fasta", format_len) == 0 || |
5263 | 0 | strncmp(format, "fa", format_len) == 0) { |
5264 | 0 | *cp++ = 'F'; |
5265 | 0 | } else if (strncmp(format, "fasta.gz", format_len) == 0 || |
5266 | 0 | strncmp(format, "fa", format_len) == 0) { |
5267 | 0 | *cp++ = 'F'; |
5268 | 0 | *cp++ = 'z'; |
5269 | 0 | } else { |
5270 | 0 | free(mode_opts); |
5271 | 0 | return NULL; |
5272 | 0 | } |
5273 | | |
5274 | 0 | strcpy(cp, opts); |
5275 | |
|
5276 | 0 | return mode_opts; |
5277 | 0 | } |
5278 | | |
5279 | 0 | #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) |
5280 | | int bam_str2flag(const char *str) |
5281 | 0 | { |
5282 | 0 | char *end, *beg = (char*) str; |
5283 | 0 | long int flag = strtol(str, &end, 0); |
5284 | 0 | if ( end!=str ) return flag; // the conversion was successful |
5285 | 0 | flag = 0; |
5286 | 0 | while ( *str ) |
5287 | 0 | { |
5288 | 0 | end = beg; |
5289 | 0 | while ( *end && *end!=',' ) end++; |
5290 | 0 | if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED; |
5291 | 0 | else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR; |
5292 | 0 | else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP; |
5293 | 0 | else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP; |
5294 | 0 | else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE; |
5295 | 0 | else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE; |
5296 | 0 | else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1; |
5297 | 0 | else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2; |
5298 | 0 | else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY; |
5299 | 0 | else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL; |
5300 | 0 | else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP; |
5301 | 0 | else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY; |
5302 | 0 | else return -1; |
5303 | 0 | if ( !*end ) break; |
5304 | 0 | beg = end + 1; |
5305 | 0 | } |
5306 | 0 | return flag; |
5307 | 0 | } |
5308 | | |
5309 | | char *bam_flag2str(int flag) |
5310 | 0 | { |
5311 | 0 | kstring_t str = {0,0,0}; |
5312 | 0 | if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED"); |
5313 | 0 | if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR"); |
5314 | 0 | if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP"); |
5315 | 0 | if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP"); |
5316 | 0 | if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE"); |
5317 | 0 | if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE"); |
5318 | 0 | if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1"); |
5319 | 0 | if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2"); |
5320 | 0 | if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY"); |
5321 | 0 | if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL"); |
5322 | 0 | if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP"); |
5323 | 0 | if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY"); |
5324 | 0 | if ( str.l == 0 ) kputsn("", 0, &str); |
5325 | 0 | return str.s; |
5326 | 0 | } |
5327 | | |
5328 | | |
5329 | | /************************** |
5330 | | *** Pileup and Mpileup *** |
5331 | | **************************/ |
5332 | | |
5333 | | #if !defined(BAM_NO_PILEUP) |
5334 | | |
5335 | | #include <assert.h> |
5336 | | |
5337 | | /******************* |
5338 | | *** Memory pool *** |
5339 | | *******************/ |
5340 | | |
5341 | | typedef struct { |
5342 | | int k, y; |
5343 | | hts_pos_t x, end; |
5344 | | } cstate_t; |
5345 | | |
5346 | | static cstate_t g_cstate_null = { -1, 0, 0, 0 }; |
5347 | | |
5348 | | typedef struct __linkbuf_t { |
5349 | | bam1_t b; |
5350 | | hts_pos_t beg, end; |
5351 | | cstate_t s; |
5352 | | struct __linkbuf_t *next; |
5353 | | bam_pileup_cd cd; |
5354 | | } lbnode_t; |
5355 | | |
5356 | | typedef struct { |
5357 | | int cnt, n, max; |
5358 | | lbnode_t **buf; |
5359 | | } mempool_t; |
5360 | | |
5361 | | static mempool_t *mp_init(void) |
5362 | 0 | { |
5363 | 0 | mempool_t *mp; |
5364 | 0 | mp = (mempool_t*)calloc(1, sizeof(mempool_t)); |
5365 | 0 | return mp; |
5366 | 0 | } |
5367 | | static void mp_destroy(mempool_t *mp) |
5368 | 0 | { |
5369 | 0 | int k; |
5370 | 0 | for (k = 0; k < mp->n; ++k) { |
5371 | 0 | free(mp->buf[k]->b.data); |
5372 | 0 | free(mp->buf[k]); |
5373 | 0 | } |
5374 | 0 | free(mp->buf); |
5375 | 0 | free(mp); |
5376 | 0 | } |
5377 | | static inline lbnode_t *mp_alloc(mempool_t *mp) |
5378 | 0 | { |
5379 | 0 | ++mp->cnt; |
5380 | 0 | if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); |
5381 | 0 | else return mp->buf[--mp->n]; |
5382 | 0 | } |
5383 | | static inline void mp_free(mempool_t *mp, lbnode_t *p) |
5384 | 0 | { |
5385 | 0 | --mp->cnt; p->next = 0; // clear lbnode_t::next here |
5386 | 0 | if (mp->n == mp->max) { |
5387 | 0 | mp->max = mp->max? mp->max<<1 : 256; |
5388 | 0 | mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); |
5389 | 0 | } |
5390 | 0 | mp->buf[mp->n++] = p; |
5391 | 0 | } |
5392 | | |
5393 | | /********************** |
5394 | | *** CIGAR resolver *** |
5395 | | **********************/ |
5396 | | |
5397 | | /* s->k: the index of the CIGAR operator that has just been processed. |
5398 | | s->x: the reference coordinate of the start of s->k |
5399 | | s->y: the query coordinate of the start of s->k |
5400 | | */ |
5401 | | static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) |
5402 | 0 | { |
5403 | 0 | #define _cop(c) ((c)&BAM_CIGAR_MASK) |
5404 | 0 | #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) |
5405 | |
|
5406 | 0 | bam1_t *b = p->b; |
5407 | 0 | bam1_core_t *c = &b->core; |
5408 | 0 | uint32_t *cigar = bam_get_cigar(b); |
5409 | 0 | int k; |
5410 | | // determine the current CIGAR operation |
5411 | | //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); |
5412 | 0 | if (s->k == -1) { // never processed |
5413 | 0 | p->qpos = 0; |
5414 | 0 | if (c->n_cigar == 1) { // just one operation, save a loop |
5415 | 0 | if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; |
5416 | 0 | } else { // find the first match or deletion |
5417 | 0 | for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { |
5418 | 0 | int op = _cop(cigar[k]); |
5419 | 0 | int l = _cln(cigar[k]); |
5420 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || |
5421 | 0 | op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5422 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5423 | 0 | } |
5424 | 0 | assert(k < c->n_cigar); |
5425 | 0 | s->k = k; |
5426 | 0 | } |
5427 | 0 | } else { // the read has been processed before |
5428 | 0 | int op, l = _cln(cigar[s->k]); |
5429 | 0 | if (pos - s->x >= l) { // jump to the next operation |
5430 | 0 | assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case |
5431 | 0 | op = _cop(cigar[s->k+1]); |
5432 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop |
5433 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5434 | 0 | s->x += l; |
5435 | 0 | ++s->k; |
5436 | 0 | } else { // find the next M/D/N/=/X |
5437 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5438 | 0 | s->x += l; |
5439 | 0 | for (k = s->k + 1; k < c->n_cigar; ++k) { |
5440 | 0 | op = _cop(cigar[k]), l = _cln(cigar[k]); |
5441 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5442 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5443 | 0 | } |
5444 | 0 | s->k = k; |
5445 | 0 | } |
5446 | 0 | assert(s->k < c->n_cigar); // otherwise a bug |
5447 | 0 | } // else, do nothing |
5448 | 0 | } |
5449 | 0 | { // collect pileup information |
5450 | 0 | int op, l; |
5451 | 0 | op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); |
5452 | 0 | p->is_del = p->indel = p->is_refskip = 0; |
5453 | 0 | if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation |
5454 | 0 | int op2 = _cop(cigar[s->k+1]); |
5455 | 0 | int l2 = _cln(cigar[s->k+1]); |
5456 | 0 | if (op2 == BAM_CDEL && op != BAM_CDEL) { |
5457 | | // At start of a new deletion, merge e.g. 1D2D to 3D. |
5458 | | // Within a deletion (the 2D in 1D2D) we keep p->indel=0 |
5459 | | // and rely on is_del=1 as we would for 3D. |
5460 | 0 | p->indel = -(int)l2; |
5461 | 0 | for (k = s->k+2; k < c->n_cigar; ++k) { |
5462 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5463 | 0 | if (op2 == BAM_CDEL) p->indel -= l2; |
5464 | 0 | else break; |
5465 | 0 | } |
5466 | 0 | } else if (op2 == BAM_CINS) { |
5467 | 0 | p->indel = l2; |
5468 | 0 | for (k = s->k+2; k < c->n_cigar; ++k) { |
5469 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5470 | 0 | if (op2 == BAM_CINS) p->indel += l2; |
5471 | 0 | else if (op2 != BAM_CPAD) break; |
5472 | 0 | } |
5473 | 0 | } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { |
5474 | 0 | int l3 = 0; |
5475 | 0 | for (k = s->k + 2; k < c->n_cigar; ++k) { |
5476 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5477 | 0 | if (op2 == BAM_CINS) l3 += l2; |
5478 | 0 | else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; |
5479 | 0 | } |
5480 | 0 | if (l3 > 0) p->indel = l3; |
5481 | 0 | } |
5482 | 0 | } |
5483 | 0 | if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { |
5484 | 0 | p->qpos = s->y + (pos - s->x); |
5485 | 0 | } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { |
5486 | 0 | p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! |
5487 | 0 | p->is_refskip = (op == BAM_CREF_SKIP); |
5488 | 0 | } // cannot be other operations; otherwise a bug |
5489 | 0 | p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); |
5490 | 0 | } |
5491 | 0 | p->cigar_ind = s->k; |
5492 | 0 | return 1; |
5493 | 0 | } |
5494 | | |
5495 | | /******************************* |
5496 | | *** Expansion of insertions *** |
5497 | | *******************************/ |
5498 | | |
5499 | | /* |
5500 | | * Fills out the kstring with the padded insertion sequence for the current |
5501 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5502 | | * |
5503 | | * This variant handles base modifications, but only when "m" is non-NULL. |
5504 | | * |
5505 | | * Returns the number of inserted base on success, with string length being |
5506 | | * accessable via ins->l; |
5507 | | * -1 on failure. |
5508 | | */ |
5509 | | int bam_plp_insertion_mod(const bam_pileup1_t *p, |
5510 | | hts_base_mod_state *m, |
5511 | 0 | kstring_t *ins, int *del_len) { |
5512 | 0 | int j, k, indel, nb = 0; |
5513 | 0 | uint32_t *cigar; |
5514 | |
|
5515 | 0 | if (p->indel <= 0) { |
5516 | 0 | if (ks_resize(ins, 1) < 0) |
5517 | 0 | return -1; |
5518 | 0 | ins->l = 0; |
5519 | 0 | ins->s[0] = '\0'; |
5520 | 0 | return 0; |
5521 | 0 | } |
5522 | | |
5523 | 0 | if (del_len) |
5524 | 0 | *del_len = 0; |
5525 | | |
5526 | | // Measure indel length including pads |
5527 | 0 | indel = 0; |
5528 | 0 | k = p->cigar_ind+1; |
5529 | 0 | cigar = bam_get_cigar(p->b); |
5530 | 0 | while (k < p->b->core.n_cigar) { |
5531 | 0 | switch (cigar[k] & BAM_CIGAR_MASK) { |
5532 | 0 | case BAM_CPAD: |
5533 | 0 | case BAM_CINS: |
5534 | 0 | indel += (cigar[k] >> BAM_CIGAR_SHIFT); |
5535 | 0 | break; |
5536 | 0 | default: |
5537 | 0 | k = p->b->core.n_cigar; |
5538 | 0 | break; |
5539 | 0 | } |
5540 | 0 | k++; |
5541 | 0 | } |
5542 | 0 | nb = ins->l = indel; |
5543 | | |
5544 | | // Produce sequence |
5545 | 0 | if (ks_resize(ins, indel+1) < 0) |
5546 | 0 | return -1; |
5547 | 0 | indel = 0; |
5548 | 0 | k = p->cigar_ind+1; |
5549 | 0 | j = 1; |
5550 | 0 | while (k < p->b->core.n_cigar) { |
5551 | 0 | int l, c; |
5552 | 0 | switch (cigar[k] & BAM_CIGAR_MASK) { |
5553 | 0 | case BAM_CPAD: |
5554 | 0 | for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++) |
5555 | 0 | ins->s[indel++] = '*'; |
5556 | 0 | break; |
5557 | 0 | case BAM_CINS: |
5558 | 0 | for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { |
5559 | 0 | c = p->qpos + j - p->is_del < p->b->core.l_qseq |
5560 | 0 | ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), |
5561 | 0 | p->qpos + j - p->is_del)] |
5562 | 0 | : 'N'; |
5563 | 0 | ins->s[indel++] = c; |
5564 | 0 | int nm; |
5565 | 0 | hts_base_mod mod[256]; |
5566 | 0 | if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, |
5567 | 0 | m, mod, 256)) > 0) { |
5568 | 0 | int o_indel = indel; |
5569 | 0 | if (ks_resize(ins, ins->l + nm*16+3) < 0) |
5570 | 0 | return -1; |
5571 | 0 | ins->s[indel++] = '['; |
5572 | 0 | int j; |
5573 | 0 | for (j = 0; j < nm; j++) { |
5574 | 0 | char qual[20]; |
5575 | 0 | if (mod[j].qual >= 0) |
5576 | 0 | snprintf(qual, sizeof(qual), "%d", mod[j].qual); |
5577 | 0 | else |
5578 | 0 | *qual=0; |
5579 | 0 | if (mod[j].modified_base < 0) |
5580 | | // ChEBI |
5581 | 0 | indel += snprintf(&ins->s[indel], ins->m - indel, |
5582 | 0 | "%c(%d)%s", |
5583 | 0 | "+-"[mod[j].strand], |
5584 | 0 | -mod[j].modified_base, |
5585 | 0 | qual); |
5586 | 0 | else |
5587 | 0 | indel += snprintf(&ins->s[indel], ins->m - indel, |
5588 | 0 | "%c%c%s", |
5589 | 0 | "+-"[mod[j].strand], |
5590 | 0 | mod[j].modified_base, |
5591 | 0 | qual); |
5592 | 0 | } |
5593 | 0 | ins->s[indel++] = ']'; |
5594 | 0 | ins->l += indel - o_indel; // grow by amount we used |
5595 | 0 | } |
5596 | 0 | } |
5597 | 0 | break; |
5598 | 0 | case BAM_CDEL: |
5599 | | // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style |
5600 | 0 | if (del_len) |
5601 | 0 | *del_len = cigar[k]>>BAM_CIGAR_SHIFT; |
5602 | | // fall through |
5603 | 0 | default: |
5604 | 0 | k = p->b->core.n_cigar; |
5605 | 0 | break; |
5606 | 0 | } |
5607 | 0 | k++; |
5608 | 0 | } |
5609 | 0 | ins->s[indel] = '\0'; |
5610 | 0 | ins->l = indel; // string length |
5611 | |
|
5612 | 0 | return nb; // base length |
5613 | 0 | } |
5614 | | |
5615 | | /* |
5616 | | * Fills out the kstring with the padded insertion sequence for the current |
5617 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5618 | | * |
5619 | | * This is the original interface with no capability for reporting base |
5620 | | * modifications. |
5621 | | * |
5622 | | * Returns the length of insertion string on success; |
5623 | | * -1 on failure. |
5624 | | */ |
5625 | 0 | int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { |
5626 | 0 | return bam_plp_insertion_mod(p, NULL, ins, del_len); |
5627 | 0 | } |
5628 | | |
5629 | | /*********************** |
5630 | | *** Pileup iterator *** |
5631 | | ***********************/ |
5632 | | |
5633 | | // Dictionary of overlapping reads |
5634 | | KHASH_MAP_INIT_STR(olap_hash, lbnode_t *) |
5635 | | typedef khash_t(olap_hash) olap_hash_t; |
5636 | | |
5637 | | struct bam_plp_s { |
5638 | | mempool_t *mp; |
5639 | | lbnode_t *head, *tail; |
5640 | | int32_t tid, max_tid; |
5641 | | hts_pos_t pos, max_pos; |
5642 | | int is_eof, max_plp, error, maxcnt; |
5643 | | uint64_t id; |
5644 | | bam_pileup1_t *plp; |
5645 | | // for the "auto" interface only |
5646 | | bam1_t *b; |
5647 | | bam_plp_auto_f func; |
5648 | | void *data; |
5649 | | olap_hash_t *overlaps; |
5650 | | |
5651 | | // For notification of creation and destruction events |
5652 | | // and associated client-owned pointer. |
5653 | | int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd); |
5654 | | int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd); |
5655 | | }; |
5656 | | |
5657 | | bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) |
5658 | 0 | { |
5659 | 0 | bam_plp_t iter; |
5660 | 0 | iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s)); |
5661 | 0 | iter->mp = mp_init(); |
5662 | 0 | iter->head = iter->tail = mp_alloc(iter->mp); |
5663 | 0 | iter->max_tid = iter->max_pos = -1; |
5664 | 0 | iter->maxcnt = 8000; |
5665 | 0 | if (func) { |
5666 | 0 | iter->func = func; |
5667 | 0 | iter->data = data; |
5668 | 0 | iter->b = bam_init1(); |
5669 | 0 | } |
5670 | 0 | return iter; |
5671 | 0 | } |
5672 | | |
5673 | | int bam_plp_init_overlaps(bam_plp_t iter) |
5674 | 0 | { |
5675 | 0 | iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads |
5676 | 0 | return iter->overlaps ? 0 : -1; |
5677 | 0 | } |
5678 | | |
5679 | | void bam_plp_destroy(bam_plp_t iter) |
5680 | 0 | { |
5681 | 0 | lbnode_t *p, *pnext; |
5682 | 0 | if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps); |
5683 | 0 | for (p = iter->head; p != NULL; p = pnext) { |
5684 | 0 | if (iter->plp_destruct && p != iter->tail) |
5685 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
5686 | 0 | pnext = p->next; |
5687 | 0 | mp_free(iter->mp, p); |
5688 | 0 | } |
5689 | 0 | mp_destroy(iter->mp); |
5690 | 0 | if (iter->b) bam_destroy1(iter->b); |
5691 | 0 | free(iter->plp); |
5692 | 0 | free(iter); |
5693 | 0 | } |
5694 | | |
5695 | | void bam_plp_constructor(bam_plp_t plp, |
5696 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5697 | 0 | plp->plp_construct = func; |
5698 | 0 | } |
5699 | | |
5700 | | void bam_plp_destructor(bam_plp_t plp, |
5701 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5702 | 0 | plp->plp_destruct = func; |
5703 | 0 | } |
5704 | | |
5705 | | //--------------------------------- |
5706 | | //--- Tweak overlapping reads |
5707 | | //--------------------------------- |
5708 | | |
5709 | | /** |
5710 | | * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index |
5711 | | * cigar_iref2iseq_next() - get the next CMATCH base |
5712 | | * @cigar: pointer to current cigar block (rw) |
5713 | | * @cigar_max: pointer just beyond the last cigar block |
5714 | | * @icig: position within the current cigar block (rw) |
5715 | | * @iseq: position in the sequence (rw) |
5716 | | * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) |
5717 | | * |
5718 | | * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, |
5719 | | * or -2 on error. |
5720 | | */ |
5721 | | static inline int cigar_iref2iseq_set(const uint32_t **cigar, |
5722 | | const uint32_t *cigar_max, |
5723 | | hts_pos_t *icig, |
5724 | | hts_pos_t *iseq, |
5725 | | hts_pos_t *iref) |
5726 | 0 | { |
5727 | 0 | hts_pos_t pos = *iref; |
5728 | 0 | if ( pos < 0 ) return -1; |
5729 | 0 | *icig = 0; |
5730 | 0 | *iseq = 0; |
5731 | 0 | *iref = 0; |
5732 | 0 | while ( *cigar<cigar_max ) |
5733 | 0 | { |
5734 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5735 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5736 | |
|
5737 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5738 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } |
5739 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5740 | 0 | { |
5741 | 0 | pos -= ncig; |
5742 | 0 | if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; } |
5743 | 0 | (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig; |
5744 | 0 | continue; |
5745 | 0 | } |
5746 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5747 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) |
5748 | 0 | { |
5749 | 0 | pos -= ncig; |
5750 | 0 | if ( pos<0 ) pos = 0; |
5751 | 0 | (*cigar)++; *icig = 0; *iref += ncig; |
5752 | 0 | continue; |
5753 | 0 | } |
5754 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5755 | 0 | return -2; |
5756 | 0 | } |
5757 | 0 | *iseq = -1; |
5758 | 0 | return -1; |
5759 | 0 | } |
5760 | | static inline int cigar_iref2iseq_next(const uint32_t **cigar, |
5761 | | const uint32_t *cigar_max, |
5762 | | hts_pos_t *icig, |
5763 | | hts_pos_t *iseq, |
5764 | | hts_pos_t *iref) |
5765 | 0 | { |
5766 | 0 | while ( *cigar < cigar_max ) |
5767 | 0 | { |
5768 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5769 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5770 | |
|
5771 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5772 | 0 | { |
5773 | 0 | if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } |
5774 | 0 | (*iseq)++; (*icig)++; (*iref)++; |
5775 | 0 | return BAM_CMATCH; |
5776 | 0 | } |
5777 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } |
5778 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5779 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5780 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } |
5781 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5782 | 0 | return -2; |
5783 | 0 | } |
5784 | 0 | *iseq = -1; |
5785 | 0 | *iref = -1; |
5786 | 0 | return -1; |
5787 | 0 | } |
5788 | | |
5789 | | // Given overlapping read 'a' (left) and 'b' (right) on the same |
5790 | | // template, adjust quality values to zero for either a or b. |
5791 | | // Note versions 1.12 and earlier always removed quality from 'b' for |
5792 | | // matching bases. Now we select a or b semi-randomly based on name hash. |
5793 | | // Returns 0 on success, |
5794 | | // -1 on failure |
5795 | | static int tweak_overlap_quality(bam1_t *a, bam1_t *b) |
5796 | 0 | { |
5797 | 0 | const uint32_t *a_cigar = bam_get_cigar(a), |
5798 | 0 | *a_cigar_max = a_cigar + a->core.n_cigar; |
5799 | 0 | const uint32_t *b_cigar = bam_get_cigar(b), |
5800 | 0 | *b_cigar_max = b_cigar + b->core.n_cigar; |
5801 | 0 | hts_pos_t a_icig = 0, a_iseq = 0; |
5802 | 0 | hts_pos_t b_icig = 0, b_iseq = 0; |
5803 | 0 | uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); |
5804 | 0 | uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); |
5805 | |
|
5806 | 0 | hts_pos_t iref = b->core.pos; |
5807 | 0 | hts_pos_t a_iref = iref - a->core.pos; |
5808 | 0 | hts_pos_t b_iref = iref - b->core.pos; |
5809 | |
|
5810 | 0 | int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, |
5811 | 0 | &a_icig, &a_iseq, &a_iref); |
5812 | 0 | if ( a_ret<0 ) |
5813 | | // no overlap or error |
5814 | 0 | return a_ret<-1 ? -1:0; |
5815 | | |
5816 | 0 | int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, |
5817 | 0 | &b_icig, &b_iseq, &b_iref); |
5818 | 0 | if ( b_ret<0 ) |
5819 | | // no overlap or error |
5820 | 0 | return b_ret<-1 ? -1:0; |
5821 | | |
5822 | | // Determine which seq is the one getting modified qualities. |
5823 | 0 | uint8_t amul, bmul; |
5824 | 0 | if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { |
5825 | 0 | amul = 1; |
5826 | 0 | bmul = 0; |
5827 | 0 | } else { |
5828 | 0 | amul = 0; |
5829 | 0 | bmul = 1; |
5830 | 0 | } |
5831 | | |
5832 | | // Loop over the overlapping region nulling qualities in either |
5833 | | // seq a or b. |
5834 | 0 | int err = 0; |
5835 | 0 | while ( 1 ) { |
5836 | | // Step to next matching reference position in a and b |
5837 | 0 | while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) |
5838 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5839 | 0 | &a_icig, &a_iseq, &a_iref); |
5840 | 0 | if ( a_ret<0 ) { // done |
5841 | 0 | err = a_ret<-1?-1:0; |
5842 | 0 | break; |
5843 | 0 | } |
5844 | | |
5845 | 0 | while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) |
5846 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, |
5847 | 0 | &b_iseq, &b_iref); |
5848 | 0 | if ( b_ret<0 ) { // done |
5849 | 0 | err = b_ret<-1?-1:0; |
5850 | 0 | break; |
5851 | 0 | } |
5852 | | |
5853 | 0 | if ( iref < a_iref + a->core.pos ) |
5854 | 0 | iref = a_iref + a->core.pos; |
5855 | |
|
5856 | 0 | if ( iref < b_iref + b->core.pos ) |
5857 | 0 | iref = b_iref + b->core.pos; |
5858 | |
|
5859 | 0 | iref++; |
5860 | | |
5861 | | // If A or B has a deletion then we catch up the other to this point. |
5862 | | // We also amend quality values using the same rules for mismatch. |
5863 | 0 | if (a_iref+a->core.pos != b_iref+b->core.pos) { |
5864 | 0 | if (a_iref+a->core.pos < b_iref+b->core.pos |
5865 | 0 | && b_cigar > bam_get_cigar(b) |
5866 | 0 | && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) { |
5867 | | // Del in B means it's moved on further than A |
5868 | 0 | do { |
5869 | 0 | a_qual[a_iseq] = amul |
5870 | 0 | ? a_qual[a_iseq]*0.8 |
5871 | 0 | : 0; |
5872 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5873 | 0 | &a_icig, &a_iseq, &a_iref); |
5874 | 0 | if (a_ret < 0) |
5875 | 0 | return -(a_ret<-1); // 0 or -1 |
5876 | 0 | } while (a_iref + a->core.pos < b_iref+b->core.pos); |
5877 | 0 | } else if (a_cigar > bam_get_cigar(a) |
5878 | 0 | && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) { |
5879 | | // Del in A means it's moved on further than B |
5880 | 0 | do { |
5881 | 0 | b_qual[b_iseq] = bmul |
5882 | 0 | ? b_qual[b_iseq]*0.8 |
5883 | 0 | : 0; |
5884 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, |
5885 | 0 | &b_icig, &b_iseq, &b_iref); |
5886 | 0 | if (b_ret < 0) |
5887 | 0 | return -(b_ret<-1); // 0 or -1 |
5888 | 0 | } while (b_iref + b->core.pos < a_iref+a->core.pos); |
5889 | 0 | } else { |
5890 | | // Anything else, eg ref-skip, we don't support here |
5891 | 0 | continue; |
5892 | 0 | } |
5893 | 0 | } |
5894 | | |
5895 | | // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld " |
5896 | | // "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n", |
5897 | | // a_cigar-bam_get_cigar(a), a_icig, |
5898 | | // b_cigar-bam_get_cigar(b), b_icig, |
5899 | | // iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1, |
5900 | | // a_iseq, b_iseq); |
5901 | | |
5902 | 0 | if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) |
5903 | | // Fell off end of sequence, bad CIGAR? |
5904 | 0 | return -1; |
5905 | | |
5906 | | // We're finally at the same ref base in both a and b. |
5907 | | // Check if the bases match (confident) or mismatch |
5908 | | // (not so confident). |
5909 | 0 | if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { |
5910 | | // We are very confident about this base. Use sum of quals |
5911 | 0 | int qual = a_qual[a_iseq] + b_qual[b_iseq]; |
5912 | 0 | a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); |
5913 | 0 | b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; |
5914 | 0 | } else { |
5915 | | // Not so confident about anymore given the mismatch. |
5916 | | // Reduce qual for lowest quality base. |
5917 | 0 | if ( a_qual[a_iseq] > b_qual[b_iseq] ) { |
5918 | | // A highest qual base; keep |
5919 | 0 | a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; |
5920 | 0 | b_qual[b_iseq] = 0; |
5921 | 0 | } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { |
5922 | | // B highest qual base; keep |
5923 | 0 | b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; |
5924 | 0 | a_qual[a_iseq] = 0; |
5925 | 0 | } else { |
5926 | | // Both equal, so pick randomly |
5927 | 0 | a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; |
5928 | 0 | b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; |
5929 | 0 | } |
5930 | 0 | } |
5931 | 0 | } |
5932 | | |
5933 | 0 | return err; |
5934 | 0 | } |
5935 | | |
5936 | | // Fix overlapping reads. Simple soft-clipping did not give good results. |
5937 | | // Lowering qualities of unwanted bases is more selective and works better. |
5938 | | // |
5939 | | // Returns 0 on success, -1 on failure |
5940 | | static int overlap_push(bam_plp_t iter, lbnode_t *node) |
5941 | 0 | { |
5942 | 0 | if ( !iter->overlaps ) return 0; |
5943 | | |
5944 | | // mapped mates and paired reads only |
5945 | 0 | if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0; |
5946 | | |
5947 | | // no overlap possible, unless some wild cigar |
5948 | 0 | if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid) |
5949 | 0 | || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq |
5950 | 0 | && node->b.core.mpos >= node->end) // for those wild cigars |
5951 | 0 | ) return 0; |
5952 | | |
5953 | 0 | khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); |
5954 | 0 | if ( kitr==kh_end(iter->overlaps) ) |
5955 | 0 | { |
5956 | | // Only add reads where the mate is still to arrive |
5957 | 0 | if (node->b.core.mpos >= node->b.core.pos || |
5958 | 0 | ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) { |
5959 | 0 | int ret; |
5960 | 0 | kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); |
5961 | 0 | if (ret < 0) return -1; |
5962 | 0 | kh_value(iter->overlaps, kitr) = node; |
5963 | 0 | } |
5964 | 0 | } |
5965 | 0 | else |
5966 | 0 | { |
5967 | 0 | lbnode_t *a = kh_value(iter->overlaps, kitr); |
5968 | 0 | int err = tweak_overlap_quality(&a->b, &node->b); |
5969 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
5970 | 0 | assert(a->end-1 == a->s.end); |
5971 | 0 | return err; |
5972 | 0 | } |
5973 | 0 | return 0; |
5974 | 0 | } |
5975 | | |
5976 | | static void overlap_remove(bam_plp_t iter, const bam1_t *b) |
5977 | 0 | { |
5978 | 0 | if ( !iter->overlaps ) return; |
5979 | | |
5980 | 0 | khiter_t kitr; |
5981 | 0 | if ( b ) |
5982 | 0 | { |
5983 | 0 | kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b)); |
5984 | 0 | if ( kitr!=kh_end(iter->overlaps) ) |
5985 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
5986 | 0 | } |
5987 | 0 | else |
5988 | 0 | { |
5989 | | // remove all |
5990 | 0 | for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++) |
5991 | 0 | if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr); |
5992 | 0 | } |
5993 | 0 | } |
5994 | | |
5995 | | |
5996 | | |
5997 | | // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns |
5998 | | // pointer to the piled records if next position is ready or NULL if there is not enough records in the |
5999 | | // buffer yet (the current position is still the maximum position across all buffered reads). |
6000 | | const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
6001 | 0 | { |
6002 | 0 | if (iter->error) { *_n_plp = -1; return NULL; } |
6003 | 0 | *_n_plp = 0; |
6004 | 0 | if (iter->is_eof && iter->head == iter->tail) return NULL; |
6005 | 0 | while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { |
6006 | 0 | int n_plp = 0; |
6007 | | // write iter->plp at iter->pos |
6008 | 0 | lbnode_t **pptr = &iter->head; |
6009 | 0 | while (*pptr != iter->tail) { |
6010 | 0 | lbnode_t *p = *pptr; |
6011 | 0 | if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove |
6012 | 0 | overlap_remove(iter, &p->b); |
6013 | 0 | if (iter->plp_destruct) |
6014 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
6015 | 0 | *pptr = p->next; mp_free(iter->mp, p); |
6016 | 0 | } |
6017 | 0 | else { |
6018 | 0 | if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup |
6019 | 0 | if (n_plp == iter->max_plp) { // then double the capacity |
6020 | 0 | iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; |
6021 | 0 | iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); |
6022 | 0 | } |
6023 | 0 | iter->plp[n_plp].b = &p->b; |
6024 | 0 | iter->plp[n_plp].cd = p->cd; |
6025 | 0 | if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... |
6026 | 0 | } |
6027 | 0 | pptr = &(*pptr)->next; |
6028 | 0 | } |
6029 | 0 | } |
6030 | 0 | *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; |
6031 | | // update iter->tid and iter->pos |
6032 | 0 | if (iter->head != iter->tail) { |
6033 | 0 | if (iter->tid > iter->head->b.core.tid) { |
6034 | 0 | hts_log_error("Unsorted input. Pileup aborts"); |
6035 | 0 | iter->error = 1; |
6036 | 0 | *_n_plp = -1; |
6037 | 0 | return NULL; |
6038 | 0 | } |
6039 | 0 | } |
6040 | 0 | if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence |
6041 | 0 | iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference |
6042 | 0 | } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid |
6043 | 0 | iter->pos = iter->head->beg; // jump to the next position |
6044 | 0 | } else ++iter->pos; // scan contiguously |
6045 | | // return |
6046 | 0 | if (n_plp) return iter->plp; |
6047 | 0 | if (iter->is_eof && iter->head == iter->tail) break; |
6048 | 0 | } |
6049 | 0 | return NULL; |
6050 | 0 | } |
6051 | | |
6052 | | const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
6053 | 0 | { |
6054 | 0 | hts_pos_t pos64 = 0; |
6055 | 0 | const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); |
6056 | 0 | if (pos64 < INT_MAX) { |
6057 | 0 | *_pos = pos64; |
6058 | 0 | } else { |
6059 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6060 | 0 | *_pos = INT_MAX; |
6061 | 0 | iter->error = 1; |
6062 | 0 | *_n_plp = -1; |
6063 | 0 | return NULL; |
6064 | 0 | } |
6065 | 0 | return p; |
6066 | 0 | } |
6067 | | |
6068 | | int bam_plp_push(bam_plp_t iter, const bam1_t *b) |
6069 | 0 | { |
6070 | 0 | if (iter->error) return -1; |
6071 | 0 | if (b) { |
6072 | 0 | if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } |
6073 | | // Skip only unmapped reads here, any additional filtering must be done in iter->func |
6074 | 0 | if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } |
6075 | 0 | if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) |
6076 | 0 | { |
6077 | 0 | overlap_remove(iter, b); |
6078 | 0 | return 0; |
6079 | 0 | } |
6080 | 0 | if (bam_copy1(&iter->tail->b, b) == NULL) |
6081 | 0 | return -1; |
6082 | 0 | iter->tail->b.id = iter->id++; |
6083 | 0 | iter->tail->beg = b->core.pos; |
6084 | | // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1 |
6085 | 0 | iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
6086 | 0 | iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t |
6087 | 0 | if (b->core.tid < iter->max_tid) { |
6088 | 0 | hts_log_error("The input is not sorted (chromosomes out of order)"); |
6089 | 0 | iter->error = 1; |
6090 | 0 | return -1; |
6091 | 0 | } |
6092 | 0 | if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { |
6093 | 0 | hts_log_error("The input is not sorted (reads out of order)"); |
6094 | 0 | iter->error = 1; |
6095 | 0 | return -1; |
6096 | 0 | } |
6097 | 0 | iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; |
6098 | 0 | if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { |
6099 | 0 | lbnode_t *next = mp_alloc(iter->mp); |
6100 | 0 | if (!next) { |
6101 | 0 | iter->error = 1; |
6102 | 0 | return -1; |
6103 | 0 | } |
6104 | 0 | if (iter->plp_construct) { |
6105 | 0 | if (iter->plp_construct(iter->data, &iter->tail->b, |
6106 | 0 | &iter->tail->cd) < 0) { |
6107 | 0 | mp_free(iter->mp, next); |
6108 | 0 | iter->error = 1; |
6109 | 0 | return -1; |
6110 | 0 | } |
6111 | 0 | } |
6112 | 0 | if (overlap_push(iter, iter->tail) < 0) { |
6113 | 0 | mp_free(iter->mp, next); |
6114 | 0 | iter->error = 1; |
6115 | 0 | return -1; |
6116 | 0 | } |
6117 | 0 | iter->tail->next = next; |
6118 | 0 | iter->tail = iter->tail->next; |
6119 | 0 | } |
6120 | 0 | } else iter->is_eof = 1; |
6121 | 0 | return 0; |
6122 | 0 | } |
6123 | | |
6124 | | const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
6125 | 0 | { |
6126 | 0 | const bam_pileup1_t *plp; |
6127 | 0 | if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } |
6128 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6129 | 0 | else { // no pileup line can be obtained; read alignments |
6130 | 0 | *_n_plp = 0; |
6131 | 0 | if (iter->is_eof) return 0; |
6132 | 0 | int ret; |
6133 | 0 | while ( (ret=iter->func(iter->data, iter->b)) >= 0) { |
6134 | 0 | if (bam_plp_push(iter, iter->b) < 0) { |
6135 | 0 | *_n_plp = -1; |
6136 | 0 | return 0; |
6137 | 0 | } |
6138 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6139 | | // otherwise no pileup line can be returned; read the next alignment. |
6140 | 0 | } |
6141 | 0 | if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } |
6142 | 0 | if (bam_plp_push(iter, 0) < 0) { |
6143 | 0 | *_n_plp = -1; |
6144 | 0 | return 0; |
6145 | 0 | } |
6146 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6147 | 0 | return 0; |
6148 | 0 | } |
6149 | 0 | } |
6150 | | |
6151 | | const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
6152 | 0 | { |
6153 | 0 | hts_pos_t pos64 = 0; |
6154 | 0 | const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); |
6155 | 0 | if (pos64 < INT_MAX) { |
6156 | 0 | *_pos = pos64; |
6157 | 0 | } else { |
6158 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6159 | 0 | *_pos = INT_MAX; |
6160 | 0 | iter->error = 1; |
6161 | 0 | *_n_plp = -1; |
6162 | 0 | return NULL; |
6163 | 0 | } |
6164 | 0 | return p; |
6165 | 0 | } |
6166 | | |
6167 | | void bam_plp_reset(bam_plp_t iter) |
6168 | 0 | { |
6169 | 0 | overlap_remove(iter, NULL); |
6170 | 0 | iter->max_tid = iter->max_pos = -1; |
6171 | 0 | iter->tid = iter->pos = 0; |
6172 | 0 | iter->is_eof = 0; |
6173 | 0 | while (iter->head != iter->tail) { |
6174 | 0 | lbnode_t *p = iter->head; |
6175 | 0 | iter->head = p->next; |
6176 | 0 | mp_free(iter->mp, p); |
6177 | 0 | } |
6178 | 0 | } |
6179 | | |
6180 | | void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) |
6181 | 0 | { |
6182 | 0 | iter->maxcnt = maxcnt; |
6183 | 0 | } |
6184 | | |
6185 | | /************************ |
6186 | | *** Mpileup iterator *** |
6187 | | ************************/ |
6188 | | |
6189 | | struct bam_mplp_s { |
6190 | | int n; |
6191 | | int32_t min_tid, *tid; |
6192 | | hts_pos_t min_pos, *pos; |
6193 | | bam_plp_t *iter; |
6194 | | int *n_plp; |
6195 | | const bam_pileup1_t **plp; |
6196 | | }; |
6197 | | |
6198 | | bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) |
6199 | 0 | { |
6200 | 0 | int i; |
6201 | 0 | bam_mplp_t iter; |
6202 | 0 | iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s)); |
6203 | 0 | iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); |
6204 | 0 | iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); |
6205 | 0 | iter->n_plp = (int*)calloc(n, sizeof(int)); |
6206 | 0 | iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); |
6207 | 0 | iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); |
6208 | 0 | iter->n = n; |
6209 | 0 | iter->min_pos = HTS_POS_MAX; |
6210 | 0 | iter->min_tid = (uint32_t)-1; |
6211 | 0 | for (i = 0; i < n; ++i) { |
6212 | 0 | iter->iter[i] = bam_plp_init(func, data[i]); |
6213 | 0 | iter->pos[i] = iter->min_pos; |
6214 | 0 | iter->tid[i] = iter->min_tid; |
6215 | 0 | } |
6216 | 0 | return iter; |
6217 | 0 | } |
6218 | | |
6219 | | int bam_mplp_init_overlaps(bam_mplp_t iter) |
6220 | 0 | { |
6221 | 0 | int i, r = 0; |
6222 | 0 | for (i = 0; i < iter->n; ++i) |
6223 | 0 | r |= bam_plp_init_overlaps(iter->iter[i]); |
6224 | 0 | return r == 0 ? 0 : -1; |
6225 | 0 | } |
6226 | | |
6227 | | void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) |
6228 | 0 | { |
6229 | 0 | int i; |
6230 | 0 | for (i = 0; i < iter->n; ++i) |
6231 | 0 | iter->iter[i]->maxcnt = maxcnt; |
6232 | 0 | } |
6233 | | |
6234 | | void bam_mplp_destroy(bam_mplp_t iter) |
6235 | 0 | { |
6236 | 0 | int i; |
6237 | 0 | for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); |
6238 | 0 | free(iter->iter); free(iter->pos); free(iter->tid); |
6239 | 0 | free(iter->n_plp); free(iter->plp); |
6240 | 0 | free(iter); |
6241 | 0 | } |
6242 | | |
6243 | | int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) |
6244 | 0 | { |
6245 | 0 | int i, ret = 0; |
6246 | 0 | hts_pos_t new_min_pos = HTS_POS_MAX; |
6247 | 0 | uint32_t new_min_tid = (uint32_t)-1; |
6248 | 0 | for (i = 0; i < iter->n; ++i) { |
6249 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6250 | 0 | int tid; |
6251 | 0 | hts_pos_t pos; |
6252 | 0 | iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); |
6253 | 0 | if ( iter->iter[i]->error ) return -1; |
6254 | 0 | if (iter->plp[i]) { |
6255 | 0 | iter->tid[i] = tid; |
6256 | 0 | iter->pos[i] = pos; |
6257 | 0 | } else { |
6258 | 0 | iter->tid[i] = 0; |
6259 | 0 | iter->pos[i] = 0; |
6260 | 0 | } |
6261 | 0 | } |
6262 | 0 | if (iter->plp[i]) { |
6263 | 0 | if (iter->tid[i] < new_min_tid) { |
6264 | 0 | new_min_tid = iter->tid[i]; |
6265 | 0 | new_min_pos = iter->pos[i]; |
6266 | 0 | } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) { |
6267 | 0 | new_min_pos = iter->pos[i]; |
6268 | 0 | } |
6269 | 0 | } |
6270 | 0 | } |
6271 | 0 | iter->min_pos = new_min_pos; |
6272 | 0 | iter->min_tid = new_min_tid; |
6273 | 0 | if (new_min_pos == HTS_POS_MAX) return 0; |
6274 | 0 | *_tid = new_min_tid; *_pos = new_min_pos; |
6275 | 0 | for (i = 0; i < iter->n; ++i) { |
6276 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6277 | 0 | n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; |
6278 | 0 | ++ret; |
6279 | 0 | } else n_plp[i] = 0, plp[i] = 0; |
6280 | 0 | } |
6281 | 0 | return ret; |
6282 | 0 | } |
6283 | | |
6284 | | int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) |
6285 | 0 | { |
6286 | 0 | hts_pos_t pos64 = 0; |
6287 | 0 | int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); |
6288 | 0 | if (ret >= 0) { |
6289 | 0 | if (pos64 < INT_MAX) { |
6290 | 0 | *_pos = pos64; |
6291 | 0 | } else { |
6292 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6293 | 0 | *_pos = INT_MAX; |
6294 | 0 | return -1; |
6295 | 0 | } |
6296 | 0 | } |
6297 | 0 | return ret; |
6298 | 0 | } |
6299 | | |
6300 | | void bam_mplp_reset(bam_mplp_t iter) |
6301 | 0 | { |
6302 | 0 | int i; |
6303 | 0 | iter->min_pos = HTS_POS_MAX; |
6304 | 0 | iter->min_tid = (uint32_t)-1; |
6305 | 0 | for (i = 0; i < iter->n; ++i) { |
6306 | 0 | bam_plp_reset(iter->iter[i]); |
6307 | 0 | iter->pos[i] = HTS_POS_MAX; |
6308 | 0 | iter->tid[i] = (uint32_t)-1; |
6309 | 0 | iter->n_plp[i] = 0; |
6310 | 0 | iter->plp[i] = NULL; |
6311 | 0 | } |
6312 | 0 | } |
6313 | | |
6314 | | void bam_mplp_constructor(bam_mplp_t iter, |
6315 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6316 | 0 | int i; |
6317 | 0 | for (i = 0; i < iter->n; ++i) |
6318 | 0 | bam_plp_constructor(iter->iter[i], func); |
6319 | 0 | } |
6320 | | |
6321 | | void bam_mplp_destructor(bam_mplp_t iter, |
6322 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6323 | 0 | int i; |
6324 | 0 | for (i = 0; i < iter->n; ++i) |
6325 | 0 | bam_plp_destructor(iter->iter[i], func); |
6326 | 0 | } |
6327 | | |
6328 | | #endif // ~!defined(BAM_NO_PILEUP) |