Line | Count | Source (jump to first uncovered line) |
1 | | /* sam.c -- SAM and BAM file I/O and manipulation. |
2 | | |
3 | | Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd. |
4 | | Copyright (C) 2010, 2012, 2013 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <strings.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <errno.h> |
34 | | #include <zlib.h> |
35 | | #include <assert.h> |
36 | | #include <signal.h> |
37 | | #include <inttypes.h> |
38 | | #include <unistd.h> |
39 | | |
40 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
41 | | #include "fuzz_settings.h" |
42 | | #endif |
43 | | |
44 | | // Suppress deprecation message for cigar_tab, which we initialise |
45 | | #include "htslib/hts_defs.h" |
46 | | #undef HTS_DEPRECATED |
47 | | #define HTS_DEPRECATED(message) |
48 | | |
49 | | #include "htslib/sam.h" |
50 | | #include "htslib/bgzf.h" |
51 | | #include "cram/cram.h" |
52 | | #include "hts_internal.h" |
53 | | #include "sam_internal.h" |
54 | | #include "htslib/hfile.h" |
55 | | #include "htslib/hts_endian.h" |
56 | | #include "htslib/hts_expr.h" |
57 | | #include "header.h" |
58 | | |
59 | | #include "htslib/khash.h" |
60 | | KHASH_DECLARE(s2i, kh_cstr_t, int64_t) |
61 | | KHASH_SET_INIT_INT(tag) |
62 | | |
63 | | #ifndef EFTYPE |
64 | 0 | #define EFTYPE ENOEXEC |
65 | | #endif |
66 | | #ifndef EOVERFLOW |
67 | | #define EOVERFLOW ERANGE |
68 | | #endif |
69 | | |
70 | | /********************** |
71 | | *** BAM header I/O *** |
72 | | **********************/ |
73 | | |
74 | | HTSLIB_EXPORT |
75 | | const int8_t bam_cigar_table[256] = { |
76 | | // 0 .. 47 |
77 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
78 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
79 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
80 | | |
81 | | // 48 .. 63 (including =) |
82 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1, |
83 | | |
84 | | // 64 .. 79 (including MIDNHB) |
85 | | -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1, |
86 | | BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1, |
87 | | |
88 | | // 80 .. 95 (including SPX) |
89 | | BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1, |
90 | | BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1, |
91 | | |
92 | | // 96 .. 127 |
93 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
94 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
95 | | |
96 | | // 128 .. 255 |
97 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
98 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
99 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
100 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
101 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
102 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
103 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
104 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
105 | | }; |
106 | | |
107 | | sam_hdr_t *sam_hdr_init(void) |
108 | 44.4k | { |
109 | 44.4k | sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); |
110 | 44.4k | if (bh == NULL) return NULL; |
111 | | |
112 | 44.4k | bh->cigar_tab = bam_cigar_table; |
113 | 44.4k | return bh; |
114 | 44.4k | } |
115 | | |
116 | | void sam_hdr_destroy(sam_hdr_t *bh) |
117 | 122k | { |
118 | 122k | int32_t i; |
119 | | |
120 | 122k | if (bh == NULL) return; |
121 | | |
122 | 56.4k | if (bh->ref_count > 0) { |
123 | 12.0k | --bh->ref_count; |
124 | 12.0k | return; |
125 | 12.0k | } |
126 | | |
127 | 44.4k | if (bh->target_name) { |
128 | 43.3k | for (i = 0; i < bh->n_targets; ++i) |
129 | 30.6k | free(bh->target_name[i]); |
130 | 12.7k | free(bh->target_name); |
131 | 12.7k | free(bh->target_len); |
132 | 12.7k | } |
133 | 44.4k | free(bh->text); |
134 | 44.4k | if (bh->hrecs) |
135 | 39.3k | sam_hrecs_free(bh->hrecs); |
136 | 44.4k | if (bh->sdict) |
137 | 44.4k | kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); |
138 | 44.4k | free(bh); |
139 | 44.4k | } |
140 | | |
141 | | // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long |
142 | | // references before sam_hdr_t::hrecs is populated |
143 | | int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) |
144 | 72 | { |
145 | 72 | const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; |
146 | 72 | khash_t(s2i) *dest_long_refs = kh_init(s2i); |
147 | 72 | int i; |
148 | 72 | if (!dest_long_refs) return -1; |
149 | | |
150 | 1.48k | for (i = 0; i < h->n_targets; i++) { |
151 | 1.41k | int ret; |
152 | 1.41k | khiter_t ksrc, kdest; |
153 | 1.41k | if (h->target_len[i] < UINT32_MAX) continue; |
154 | 502 | ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); |
155 | 502 | if (ksrc == kh_end(src_long_refs)) continue; |
156 | 502 | kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret); |
157 | 502 | if (ret < 0) { |
158 | 0 | kh_destroy(s2i, dest_long_refs); |
159 | 0 | return -1; |
160 | 0 | } |
161 | 502 | kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); |
162 | 502 | } |
163 | | |
164 | 72 | h->sdict = dest_long_refs; |
165 | 72 | return 0; |
166 | 72 | } |
167 | | |
168 | | sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) |
169 | 14.4k | { |
170 | 14.4k | if (h0 == NULL) return NULL; |
171 | 14.4k | sam_hdr_t *h; |
172 | 14.4k | if ((h = sam_hdr_init()) == NULL) return NULL; |
173 | | // copy the simple data |
174 | 14.4k | h->n_targets = 0; |
175 | 14.4k | h->ignore_sam_err = h0->ignore_sam_err; |
176 | 14.4k | h->l_text = 0; |
177 | | |
178 | | // Then the pointery stuff |
179 | | |
180 | 14.4k | if (!h0->hrecs) { |
181 | 938 | h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t)); |
182 | 938 | if (!h->target_len) goto fail; |
183 | 938 | h->target_name = (char**)calloc(h0->n_targets, sizeof(char*)); |
184 | 938 | if (!h->target_name) goto fail; |
185 | | |
186 | 938 | int i; |
187 | 2.82k | for (i = 0; i < h0->n_targets; ++i) { |
188 | 1.89k | h->target_len[i] = h0->target_len[i]; |
189 | 1.89k | h->target_name[i] = strdup(h0->target_name[i]); |
190 | 1.89k | if (!h->target_name[i]) break; |
191 | 1.89k | } |
192 | 938 | h->n_targets = i; |
193 | 938 | if (i < h0->n_targets) goto fail; |
194 | | |
195 | 938 | if (h0->sdict) { |
196 | 72 | if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; |
197 | 72 | } |
198 | 938 | } |
199 | | |
200 | 14.4k | if (h0->hrecs) { |
201 | 13.4k | kstring_t tmp = { 0, 0, NULL }; |
202 | 13.4k | if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) { |
203 | 0 | free(ks_release(&tmp)); |
204 | 0 | goto fail; |
205 | 0 | } |
206 | | |
207 | 13.4k | h->l_text = tmp.l; |
208 | 13.4k | h->text = ks_release(&tmp); |
209 | | |
210 | 13.4k | if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0) |
211 | 0 | goto fail; |
212 | 13.4k | } else { |
213 | 938 | h->l_text = h0->text ? h0->l_text : 0; |
214 | 938 | h->text = malloc(h->l_text + 1); |
215 | 938 | if (!h->text) goto fail; |
216 | 938 | if (h0->text) |
217 | 938 | memcpy(h->text, h0->text, h->l_text); |
218 | 938 | h->text[h->l_text] = '\0'; |
219 | 938 | } |
220 | | |
221 | 14.4k | return h; |
222 | | |
223 | 0 | fail: |
224 | 0 | sam_hdr_destroy(h); |
225 | 0 | return NULL; |
226 | 14.4k | } |
227 | | |
228 | | sam_hdr_t *bam_hdr_read(BGZF *fp) |
229 | 2.87k | { |
230 | 2.87k | sam_hdr_t *h; |
231 | 2.87k | uint8_t buf[4]; |
232 | 2.87k | int magic_len, has_EOF; |
233 | 2.87k | int32_t i, name_len, num_names = 0; |
234 | 2.87k | size_t bufsize; |
235 | 2.87k | ssize_t bytes; |
236 | | // check EOF |
237 | 2.87k | has_EOF = bgzf_check_EOF(fp); |
238 | 2.87k | if (has_EOF < 0) { |
239 | 0 | perror("[W::bam_hdr_read] bgzf_check_EOF"); |
240 | 2.87k | } else if (has_EOF == 0) { |
241 | 2.87k | hts_log_warning("EOF marker is absent. The input is probably truncated"); |
242 | 2.87k | } |
243 | | // read "BAM1" |
244 | 2.87k | magic_len = bgzf_read(fp, buf, 4); |
245 | 2.87k | if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) { |
246 | 0 | hts_log_error("Invalid BAM binary header"); |
247 | 0 | return 0; |
248 | 0 | } |
249 | 2.87k | h = sam_hdr_init(); |
250 | 2.87k | if (!h) goto nomem; |
251 | | |
252 | | // read plain text and the number of reference sequences |
253 | 2.87k | bytes = bgzf_read(fp, buf, 4); |
254 | 2.87k | if (bytes != 4) goto read_err; |
255 | 2.86k | h->l_text = le_to_u32(buf); |
256 | | |
257 | 2.86k | bufsize = h->l_text + 1; |
258 | 2.86k | if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed |
259 | 2.86k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
260 | 2.86k | if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem; |
261 | 2.85k | #endif |
262 | 2.85k | h->text = (char*)malloc(bufsize); |
263 | 2.85k | if (!h->text) goto nomem; |
264 | 2.85k | h->text[h->l_text] = 0; // make sure it is NULL terminated |
265 | 2.85k | bytes = bgzf_read(fp, h->text, h->l_text); |
266 | 2.85k | if (bytes != h->l_text) goto read_err; |
267 | | |
268 | 2.64k | bytes = bgzf_read(fp, &h->n_targets, 4); |
269 | 2.64k | if (bytes != 4) goto read_err; |
270 | 2.63k | if (fp->is_be) ed_swap_4p(&h->n_targets); |
271 | | |
272 | 2.63k | if (h->n_targets < 0) goto invalid; |
273 | | |
274 | | // read reference sequence names and lengths |
275 | 2.56k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
276 | 2.56k | if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t))) |
277 | 24 | goto nomem; |
278 | 2.54k | #endif |
279 | 2.54k | if (h->n_targets > 0) { |
280 | 906 | h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); |
281 | 906 | if (!h->target_name) goto nomem; |
282 | 906 | h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); |
283 | 906 | if (!h->target_len) goto nomem; |
284 | 906 | } |
285 | 1.63k | else { |
286 | 1.63k | h->target_name = NULL; |
287 | 1.63k | h->target_len = NULL; |
288 | 1.63k | } |
289 | | |
290 | 3.50k | for (i = 0; i != h->n_targets; ++i) { |
291 | 1.26k | bytes = bgzf_read(fp, &name_len, 4); |
292 | 1.26k | if (bytes != 4) goto read_err; |
293 | 1.18k | if (fp->is_be) ed_swap_4p(&name_len); |
294 | 1.18k | if (name_len <= 0) goto invalid; |
295 | | |
296 | 1.10k | h->target_name[i] = (char*)malloc(name_len); |
297 | 1.10k | if (!h->target_name[i]) goto nomem; |
298 | 1.10k | num_names++; |
299 | | |
300 | 1.10k | bytes = bgzf_read(fp, h->target_name[i], name_len); |
301 | 1.10k | if (bytes != name_len) goto read_err; |
302 | | |
303 | 993 | if (h->target_name[i][name_len - 1] != '\0') { |
304 | | /* Fix missing NUL-termination. Is this being too nice? |
305 | | We could alternatively bail out with an error. */ |
306 | 663 | char *new_name; |
307 | 663 | if (name_len == INT32_MAX) goto invalid; |
308 | 663 | new_name = realloc(h->target_name[i], name_len + 1); |
309 | 663 | if (new_name == NULL) goto nomem; |
310 | 663 | h->target_name[i] = new_name; |
311 | 663 | h->target_name[i][name_len] = '\0'; |
312 | 663 | } |
313 | | |
314 | 993 | bytes = bgzf_read(fp, &h->target_len[i], 4); |
315 | 993 | if (bytes != 4) goto read_err; |
316 | 960 | if (fp->is_be) ed_swap_4p(&h->target_len[i]); |
317 | 960 | } |
318 | 2.23k | return h; |
319 | | |
320 | 33 | nomem: |
321 | 33 | hts_log_error("Out of memory"); |
322 | 33 | goto clean; |
323 | | |
324 | 465 | read_err: |
325 | 465 | if (bytes < 0) { |
326 | 9 | hts_log_error("Error reading BGZF stream"); |
327 | 456 | } else { |
328 | 456 | hts_log_error("Truncated BAM header"); |
329 | 456 | } |
330 | 465 | goto clean; |
331 | | |
332 | 147 | invalid: |
333 | 147 | hts_log_error("Invalid BAM binary header"); |
334 | | |
335 | 645 | clean: |
336 | 645 | if (h != NULL) { |
337 | 645 | h->n_targets = num_names; // ensure we free only allocated target_names |
338 | 645 | sam_hdr_destroy(h); |
339 | 645 | } |
340 | 645 | return NULL; |
341 | 147 | } |
342 | | |
343 | | int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) |
344 | 8.98k | { |
345 | 8.98k | int32_t i, name_len, x; |
346 | 8.98k | kstring_t hdr_ks = { 0, 0, NULL }; |
347 | 8.98k | char *text; |
348 | 8.98k | uint32_t l_text; |
349 | | |
350 | 8.98k | if (!h) return -1; |
351 | | |
352 | 8.98k | if (h->hrecs) { |
353 | 8.04k | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; |
354 | 8.04k | if (hdr_ks.l > UINT32_MAX) { |
355 | 0 | hts_log_error("Header too long for BAM format"); |
356 | 0 | free(hdr_ks.s); |
357 | 0 | return -1; |
358 | 8.04k | } else if (hdr_ks.l > INT32_MAX) { |
359 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
360 | 0 | hts_log_warning("Output file may not be portable"); |
361 | 0 | } |
362 | 8.04k | text = hdr_ks.s; |
363 | 8.04k | l_text = hdr_ks.l; |
364 | 8.04k | } else { |
365 | 938 | if (h->l_text > UINT32_MAX) { |
366 | 0 | hts_log_error("Header too long for BAM format"); |
367 | 0 | return -1; |
368 | 938 | } else if (h->l_text > INT32_MAX) { |
369 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
370 | 0 | hts_log_warning("Output file may not be portable"); |
371 | 0 | } |
372 | 938 | text = h->text; |
373 | 938 | l_text = h->l_text; |
374 | 938 | } |
375 | | // write "BAM1" |
376 | 8.98k | if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; } |
377 | | // write plain text and the number of reference sequences |
378 | 8.98k | if (fp->is_be) { |
379 | 0 | x = ed_swap_4(l_text); |
380 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
381 | 0 | if (l_text) { |
382 | 0 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
383 | 0 | } |
384 | 0 | x = ed_swap_4(h->n_targets); |
385 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
386 | 8.98k | } else { |
387 | 8.98k | if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; } |
388 | 8.98k | if (l_text) { |
389 | 4.34k | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
390 | 4.34k | } |
391 | 8.98k | if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; } |
392 | 8.98k | } |
393 | 8.98k | free(hdr_ks.s); |
394 | | // write sequence names and lengths |
395 | 16.2k | for (i = 0; i != h->n_targets; ++i) { |
396 | 7.24k | char *p = h->target_name[i]; |
397 | 7.24k | name_len = strlen(p) + 1; |
398 | 7.24k | if (fp->is_be) { |
399 | 0 | x = ed_swap_4(name_len); |
400 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
401 | 7.24k | } else { |
402 | 7.24k | if (bgzf_write(fp, &name_len, 4) < 0) return -1; |
403 | 7.24k | } |
404 | 7.24k | if (bgzf_write(fp, p, name_len) < 0) return -1; |
405 | 7.24k | if (fp->is_be) { |
406 | 0 | x = ed_swap_4(h->target_len[i]); |
407 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
408 | 7.24k | } else { |
409 | 7.24k | if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1; |
410 | 7.24k | } |
411 | 7.24k | } |
412 | 8.98k | if (bgzf_flush(fp) < 0) return -1; |
413 | 8.98k | return 0; |
414 | 8.98k | } |
415 | | |
416 | | const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, |
417 | 0 | hts_pos_t *beg, hts_pos_t *end, int flags) { |
418 | 0 | return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); |
419 | 0 | } |
420 | | |
421 | | /************************* |
422 | | *** BAM alignment I/O *** |
423 | | *************************/ |
424 | | |
425 | | bam1_t *bam_init1(void) |
426 | 633k | { |
427 | 633k | return (bam1_t*)calloc(1, sizeof(bam1_t)); |
428 | 633k | } |
429 | | |
430 | | int sam_realloc_bam_data(bam1_t *b, size_t desired) |
431 | 653k | { |
432 | 653k | uint32_t new_m_data; |
433 | 653k | uint8_t *new_data; |
434 | 653k | new_m_data = desired; |
435 | 653k | kroundup32(new_m_data); // next power of 2 |
436 | 653k | new_m_data += 32; // reduces malloc arena migrations? |
437 | 653k | if (new_m_data < desired) { |
438 | 0 | errno = ENOMEM; // Not strictly true but we can't store the size |
439 | 0 | return -1; |
440 | 0 | } |
441 | 653k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
442 | 653k | if (new_m_data > FUZZ_ALLOC_LIMIT) { |
443 | 54 | errno = ENOMEM; |
444 | 54 | return -1; |
445 | 54 | } |
446 | 653k | #endif |
447 | 653k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
448 | 653k | new_data = realloc(b->data, new_m_data); |
449 | 653k | } else { |
450 | 0 | if ((new_data = malloc(new_m_data)) != NULL) { |
451 | 0 | if (b->l_data > 0) |
452 | 0 | memcpy(new_data, b->data, |
453 | 0 | b->l_data < b->m_data ? b->l_data : b->m_data); |
454 | 0 | bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); |
455 | 0 | } |
456 | 0 | } |
457 | 653k | if (!new_data) return -1; |
458 | 653k | b->data = new_data; |
459 | 653k | b->m_data = new_m_data; |
460 | 653k | return 0; |
461 | 653k | } |
462 | | |
463 | | void bam_destroy1(bam1_t *b) |
464 | 43.9M | { |
465 | 43.9M | if (b == 0) return; |
466 | 633k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
467 | 633k | free(b->data); |
468 | 633k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) { |
469 | | // In case of reuse |
470 | 0 | b->data = NULL; |
471 | 0 | b->m_data = 0; |
472 | 0 | b->l_data = 0; |
473 | 0 | } |
474 | 633k | } |
475 | | |
476 | 633k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0) |
477 | 633k | free(b); |
478 | 633k | } |
479 | | |
480 | | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) |
481 | 1.51M | { |
482 | 1.51M | if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL; |
483 | 1.51M | memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data |
484 | 1.51M | memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest |
485 | 1.51M | bdst->l_data = bsrc->l_data; |
486 | 1.51M | bdst->id = bsrc->id; |
487 | 1.51M | return bdst; |
488 | 1.51M | } |
489 | | |
490 | | bam1_t *bam_dup1(const bam1_t *bsrc) |
491 | 607k | { |
492 | 607k | if (bsrc == NULL) return NULL; |
493 | 607k | bam1_t *bdst = bam_init1(); |
494 | 607k | if (bdst == NULL) return NULL; |
495 | 607k | if (bam_copy1(bdst, bsrc) == NULL) { |
496 | 0 | bam_destroy1(bdst); |
497 | 0 | return NULL; |
498 | 0 | } |
499 | 607k | return bdst; |
500 | 607k | } |
501 | | |
502 | | static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, |
503 | | hts_pos_t *rlen, hts_pos_t *qlen) |
504 | 1.61k | { |
505 | 1.61k | int k; |
506 | 1.61k | *rlen = *qlen = 0; |
507 | 15.5k | for (k = 0; k < n_cigar; ++k) { |
508 | 13.9k | int type = bam_cigar_type(bam_cigar_op(cigar[k])); |
509 | 13.9k | int len = bam_cigar_oplen(cigar[k]); |
510 | 13.9k | if (type & 1) *qlen += len; |
511 | 13.9k | if (type & 2) *rlen += len; |
512 | 13.9k | } |
513 | 1.61k | } |
514 | | |
/* Deduct length from *limit if it fits.  Returns 0 after subtracting, or -1
 * (leaving *limit untouched) when length exceeds the remaining limit. */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
524 | | |
525 | | int bam_set1(bam1_t *bam, |
526 | | size_t l_qname, const char *qname, |
527 | | uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, |
528 | | size_t n_cigar, const uint32_t *cigar, |
529 | | int32_t mtid, hts_pos_t mpos, hts_pos_t isize, |
530 | | size_t l_seq, const char *seq, const char *qual, |
531 | | size_t l_aux) |
532 | 4.32M | { |
533 | | // use a default qname "*" if none is provided |
534 | 4.32M | if (l_qname == 0) { |
535 | 1.05M | l_qname = 1; |
536 | 1.05M | qname = "*"; |
537 | 1.05M | } |
538 | | |
539 | | // note: the qname is stored nul terminated and padded as described in the |
540 | | // documentation for the bam1_t struct. |
541 | 4.32M | size_t qname_nuls = 4 - l_qname % 4; |
542 | | |
543 | | // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). |
544 | | // can't use bam_endpos() directly as some fields not yet set up. |
545 | 4.32M | hts_pos_t rlen = 0, qlen = 0; |
546 | 4.32M | if (!(flag & BAM_FUNMAP)) { |
547 | 0 | bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); |
548 | 0 | } |
549 | 4.32M | if (rlen == 0) { |
550 | 4.32M | rlen = 1; |
551 | 4.32M | } |
552 | | |
553 | | // validate parameters |
554 | 4.32M | if (l_qname > 254) { |
555 | 78 | hts_log_error("Query name too long"); |
556 | 78 | errno = EINVAL; |
557 | 78 | return -1; |
558 | 78 | } |
559 | 4.32M | if (HTS_POS_MAX - rlen <= pos) { |
560 | 0 | hts_log_error("Read ends beyond highest supported position"); |
561 | 0 | errno = EINVAL; |
562 | 0 | return -1; |
563 | 0 | } |
564 | 4.32M | if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { |
565 | 0 | hts_log_error("Mapped query must have a CIGAR"); |
566 | 0 | errno = EINVAL; |
567 | 0 | return -1; |
568 | 0 | } |
569 | 4.32M | if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { |
570 | 0 | hts_log_error("CIGAR and query sequence are of different length"); |
571 | 0 | errno = EINVAL; |
572 | 0 | return -1; |
573 | 0 | } |
574 | | |
575 | 4.32M | size_t limit = INT32_MAX; |
576 | 4.32M | int u = subtract_check_underflow(l_qname + qname_nuls, &limit); |
577 | 4.32M | u += subtract_check_underflow(n_cigar * 4, &limit); |
578 | 4.32M | u += subtract_check_underflow((l_seq + 1) / 2, &limit); |
579 | 4.32M | u += subtract_check_underflow(l_seq, &limit); |
580 | 4.32M | u += subtract_check_underflow(l_aux, &limit); |
581 | 4.32M | if (u != 0) { |
582 | 0 | hts_log_error("Size overflow"); |
583 | 0 | errno = EINVAL; |
584 | 0 | return -1; |
585 | 0 | } |
586 | | |
587 | | // re-allocate the data buffer as needed. |
588 | 4.32M | size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; |
589 | 4.32M | if (realloc_bam_data(bam, data_len + l_aux) < 0) { |
590 | 0 | return -1; |
591 | 0 | } |
592 | | |
593 | 4.32M | bam->l_data = (int)data_len; |
594 | 4.32M | bam->core.pos = pos; |
595 | 4.32M | bam->core.tid = tid; |
596 | 4.32M | bam->core.bin = bam_reg2bin(pos, pos + rlen); |
597 | 4.32M | bam->core.qual = mapq; |
598 | 4.32M | bam->core.l_extranul = (uint8_t)(qname_nuls - 1); |
599 | 4.32M | bam->core.flag = flag; |
600 | 4.32M | bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); |
601 | 4.32M | bam->core.n_cigar = (uint32_t)n_cigar; |
602 | 4.32M | bam->core.l_qseq = (int32_t)l_seq; |
603 | 4.32M | bam->core.mtid = mtid; |
604 | 4.32M | bam->core.mpos = mpos; |
605 | 4.32M | bam->core.isize = isize; |
606 | | |
607 | 4.32M | uint8_t *cp = bam->data; |
608 | 4.32M | strncpy((char *)cp, qname, l_qname); |
609 | 4.32M | int i; |
610 | 16.6M | for (i = 0; i < qname_nuls; i++) { |
611 | 12.3M | cp[l_qname + i] = '\0'; |
612 | 12.3M | } |
613 | 4.32M | cp += l_qname + qname_nuls; |
614 | | |
615 | 4.32M | if (n_cigar > 0) { |
616 | 0 | memcpy(cp, cigar, n_cigar * 4); |
617 | 0 | } |
618 | 4.32M | cp += n_cigar * 4; |
619 | | |
620 | 363M | #define NN 16 |
621 | 4.32M | const uint8_t *useq = (uint8_t *)seq; |
622 | 34.2M | for (i = 0; i + NN < l_seq; i += NN) { |
623 | 29.9M | int j; |
624 | 29.9M | const uint8_t *u2 = useq+i; |
625 | 269M | for (j = 0; j < NN/2; j++) |
626 | 239M | cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]]; |
627 | 29.9M | cp += NN/2; |
628 | 29.9M | } |
629 | 6.20M | for (; i + 1 < l_seq; i += 2) { |
630 | 1.88M | *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]]; |
631 | 1.88M | } |
632 | | |
633 | 4.61M | for (; i < l_seq; i++) { |
634 | 297k | *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; |
635 | 297k | } |
636 | | |
637 | 4.32M | if (qual) { |
638 | 723 | memcpy(cp, qual, l_seq); |
639 | 723 | } |
640 | 4.32M | else { |
641 | 4.32M | memset(cp, '\xff', l_seq); |
642 | 4.32M | } |
643 | | |
644 | 4.32M | return (int)data_len; |
645 | 4.32M | } |
646 | | |
647 | | hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) |
648 | 1.54M | { |
649 | 1.54M | int k; |
650 | 1.54M | hts_pos_t l; |
651 | 3.73M | for (k = l = 0; k < n_cigar; ++k) |
652 | 2.18M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) |
653 | 1.94M | l += bam_cigar_oplen(cigar[k]); |
654 | 1.54M | return l; |
655 | 1.54M | } |
656 | | |
657 | | hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) |
658 | 143k | { |
659 | 143k | int k; |
660 | 143k | hts_pos_t l; |
661 | 7.34M | for (k = l = 0; k < n_cigar; ++k) |
662 | 7.20M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) |
663 | 6.63M | l += bam_cigar_oplen(cigar[k]); |
664 | 143k | return l; |
665 | 143k | } |
666 | | |
667 | | hts_pos_t bam_endpos(const bam1_t *b) |
668 | 1.40k | { |
669 | 1.40k | hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
670 | 1.40k | if (rlen == 0) rlen = 1; |
671 | 1.40k | return b->core.pos + rlen; |
672 | 1.40k | } |
673 | | |
674 | | static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG |
675 | 233k | { |
676 | 233k | bam1_core_t *c = &b->core; |
677 | | |
678 | | // Bail out as fast as possible for the easy case |
679 | 233k | uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT); |
680 | 233k | if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b)) |
681 | 153k | return 0; |
682 | | |
683 | | // The above isn't fool proof - we may have old CIGAR tags that aren't used, |
684 | | // but this is much less likely so do as a secondary check. |
685 | 80.1k | if (c->tid < 0 || c->pos < 0) |
686 | 42.1k | return 0; |
687 | | |
688 | | // Do we have a CG tag? |
689 | 37.9k | uint8_t *CG = bam_aux_get(b, "CG"); |
690 | 37.9k | int saved_errno = errno; |
691 | 37.9k | if (!CG) { |
692 | 36.0k | if (errno != ENOENT) return -1; // Bad aux data |
693 | 36.0k | errno = saved_errno; // restore errno on expected no-CG-tag case |
694 | 36.0k | return 0; |
695 | 36.0k | } |
696 | | |
697 | | // Now we start with the serious work migrating CG to CIGAR |
698 | 1.89k | uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, |
699 | 1.89k | *cigar0, CG_len, fake_bytes; |
700 | 1.89k | cigar0 = bam_get_cigar(b); |
701 | 1.89k | fake_bytes = c->n_cigar * 4; |
702 | 1.89k | if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) |
703 | 482 | return 0; // not of type B,I |
704 | 1.41k | CG_len = le_to_u32(CG + 2); |
705 | | // don't move if the real CIGAR length is shorter than the fake cigar length |
706 | 1.41k | if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; |
707 | | |
708 | | // move from the CG tag to the right position |
709 | 1.40k | cigar_st = (uint8_t*)cigar0 - b->data; |
710 | 1.40k | c->n_cigar = CG_len; |
711 | 1.40k | n_cigar4 = c->n_cigar * 4; |
712 | 1.40k | CG_st = CG - b->data - 2; |
713 | 1.40k | CG_en = CG_st + 8 + n_cigar4; |
714 | 1.40k | if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1; |
715 | | // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place |
716 | 1.40k | b->l_data = b->l_data - fake_bytes + n_cigar4; |
717 | | // insert c->n_cigar-fake_bytes empty space to make room |
718 | 1.40k | memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); |
719 | | // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR |
720 | 1.40k | memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); |
721 | 1.40k | if (ori_len > CG_en) // move data after the CG tag |
722 | 157 | memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); |
723 | 1.40k | b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) |
724 | 1.40k | if (recal_bin) |
725 | 1.40k | b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5); |
726 | 1.40k | if (give_warning) |
727 | 1.40k | hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); |
728 | 1.40k | return 1; |
729 | 1.40k | } |
730 | | |
/* Map an aux type code to a size: 1 for 'A', 'c', 'C'; 2 for 's', 'S';
 * 4 for 'i', 'I', 'f'; 8 for 'd'.  For 'Z', 'H' and 'B' the type code itself
 * is returned, and 0 is returned for anything else. */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'A' || type == 'c' || type == 'C')
        return 1;
    if (type == 's' || type == 'S')
        return 2;
    if (type == 'i' || type == 'I' || type == 'f')
        return 4;
    if (type == 'd')
        return 8;
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;
    return 0;
}
748 | | |
749 | | static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) |
750 | 0 | { |
751 | 0 | uint32_t *cigar = (uint32_t*)(data + c->l_qname); |
752 | 0 | uint32_t i; |
753 | 0 | for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); |
754 | 0 | } |
755 | | |
756 | | // Fix bad records where qname is not terminated correctly. |
757 | 1.51k | static int fixup_missing_qname_nul(bam1_t *b) { |
758 | 1.51k | bam1_core_t *c = &b->core; |
759 | | |
760 | | // Note this is called before c->l_extranul is added to c->l_qname |
761 | 1.51k | if (c->l_extranul > 0) { |
762 | 737 | b->data[c->l_qname++] = '\0'; |
763 | 737 | c->l_extranul--; |
764 | 778 | } else { |
765 | 778 | if (b->l_data > INT_MAX - 4) return -1; |
766 | 778 | if (realloc_bam_data(b, b->l_data + 4) < 0) return -1; |
767 | 778 | b->l_data += 4; |
768 | 778 | b->data[c->l_qname++] = '\0'; |
769 | 778 | c->l_extranul = 3; |
770 | 778 | } |
771 | 1.51k | return 0; |
772 | 1.51k | } |
773 | | |
/*
 * Read one alignment record from a BAM stream into b.
 *
 * Returns 4 + block_len (the number of bytes the record occupies in the
 * stream, including its length field) on success.  On failure returns:
 *   -1  no bytes could be read for the length field (normal end-of-file)
 *   -2  the length field was only partially read (truncated)
 *   -3  the 32-byte fixed-size core could not be read
 *   -4  the record is invalid, could not be read in full, or memory
 *       could not be allocated
 *
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
 * in multi-threaded handling. This may be worth considering for htslib2.
 */
int bam_read1(BGZF *fp, bam1_t *b)
{
    bam1_core_t *c = &b->core;
    int32_t block_len, ret, i;
    uint32_t new_l_data;
    uint8_t tmp[32], *x;

    b->l_data = 0;

    // Leading 4-byte record length
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
        if (ret == 0) return -1; // normal end-of-file
        else return -2; // truncated
    }
    if (fp->is_be)
        ed_swap_4p(&block_len);
    if (block_len < 32) return -4; // block_len includes core data
    if (fp->block_length - fp->block_offset > 32) {
        // Avoid bgzf_read and a temporary copy to a local buffer
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
        fp->block_offset += 32;
    } else {
        x = tmp;
        if (bgzf_read(fp, x, 32) != 32) return -3;
    }

    // Decode the 32-byte little-endian core from x
    c->tid = le_to_u32(x);
    c->pos = le_to_i32(x+4);
    uint32_t x2 = le_to_u32(x+8);
    c->bin = x2>>16;
    c->qual = x2>>8&0xff;
    c->l_qname = x2&0xff;
    // Number of NUL bytes needed to pad the name to a multiple of four
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
    uint32_t x3 = le_to_u32(x+12);
    c->flag = x3>>16;
    c->n_cigar = x3&0xffff;
    c->l_qseq = le_to_u32(x+16);
    c->mtid = le_to_u32(x+20);
    c->mpos = le_to_i32(x+24);
    c->isize = le_to_i32(x+28);

    // In-memory size: on-disk payload plus the padding added after the name
    new_l_data = block_len - 32 + c->l_extranul;
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
    // CIGAR, padded name, packed sequence and qualities must all fit
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
        return -4;
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
    b->l_data = new_l_data;

    // Read the name first so that the padding can be inserted after it
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
        if (fixup_missing_qname_nul(b) < 0) return -4;
    }
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
    c->l_qname += c->l_extranul;
    // Remainder of the record goes directly after the padded name
    if (b->l_data < c->l_qname ||
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
        return -4;
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    if (bam_tag2cigar(b, 0, 0) < 0)
        return -4;

    // TODO: consider making this conditional
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
        hts_pos_t rlen, qlen;
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
        // Sanity check for broken CIGAR alignments
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
            hts_log_error("CIGAR and query sequence lengths differ for %s",
                          bam_get_qname(b));
            return -4;
        }
    }

    return 4 + block_len;
}
855 | | |
856 | | int bam_write1(BGZF *fp, const bam1_t *b) |
857 | 1.51M | { |
858 | 1.51M | const bam1_core_t *c = &b->core; |
859 | 1.51M | uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; |
860 | 1.51M | int i, ok; |
861 | 1.51M | if (c->l_qname - c->l_extranul > 255) { |
862 | 5 | hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b)); |
863 | 5 | errno = EOVERFLOW; |
864 | 5 | return -1; |
865 | 5 | } |
866 | 1.51M | if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR |
867 | 1.51M | if (c->pos > INT_MAX || |
868 | 1.51M | c->mpos > INT_MAX || |
869 | 1.51M | c->isize < INT_MIN || c->isize > INT_MAX) { |
870 | 409 | hts_log_error("Positional data is too large for BAM format"); |
871 | 409 | return -1; |
872 | 409 | } |
873 | 1.51M | x[0] = c->tid; |
874 | 1.51M | x[1] = c->pos; |
875 | 1.51M | x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); |
876 | 1.51M | if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; |
877 | 1.51M | else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); |
878 | 1.51M | x[4] = c->l_qseq; |
879 | 1.51M | x[5] = c->mtid; |
880 | 1.51M | x[6] = c->mpos; |
881 | 1.51M | x[7] = c->isize; |
882 | 1.51M | ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); |
883 | 1.51M | if (fp->is_be) { |
884 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
885 | 0 | y = block_len; |
886 | 0 | if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0); |
887 | 0 | swap_data(c, b->l_data, b->data, 1); |
888 | 1.51M | } else { |
889 | 1.51M | if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0); |
890 | 1.51M | } |
891 | 1.51M | if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0); |
892 | 1.51M | if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0); |
893 | 1.51M | if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally |
894 | 1.51M | if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); |
895 | 1.51M | } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag |
896 | 17 | uint8_t buf[8]; |
897 | 17 | uint32_t cigar_st, cigar_en, cigar[2]; |
898 | 17 | hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); |
899 | 17 | if (cigreflen >= (1<<28)) { |
900 | | // Length of reference covered is greater than the biggest |
901 | | // CIGAR operation currently allowed. |
902 | 1 | hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos |
903 | 1 | " cannot be written in BAM. Try writing SAM or CRAM instead.\n", |
904 | 1 | bam_get_qname(b), c->n_cigar, cigreflen); |
905 | 1 | return -1; |
906 | 1 | } |
907 | 16 | cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; |
908 | 16 | cigar_en = cigar_st + c->n_cigar * 4; |
909 | 16 | cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; |
910 | 16 | cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; |
911 | 16 | u32_to_le(cigar[0], buf); |
912 | 16 | u32_to_le(cigar[1], buf + 4); |
913 | 16 | if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N |
914 | 16 | if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR |
915 | 16 | if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I |
916 | 16 | u32_to_le(c->n_cigar, buf); |
917 | 16 | if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length |
918 | 16 | if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR |
919 | 16 | } |
920 | 1.51M | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
921 | 1.51M | return ok? 4 + block_len : -1; |
922 | 1.51M | } |
923 | | |
924 | | /* |
925 | | * Write a BAM file and append to the in-memory index simultaneously. |
926 | | */ |
927 | 1.51M | static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { |
928 | 1.51M | BGZF *bfp = fp->fp.bgzf; |
929 | | |
930 | 1.51M | if (!fp->idx) |
931 | 1.51M | return bam_write1(bfp, b); |
932 | | |
933 | 0 | uint32_t block_len = b->l_data - b->core.l_extranul + 32; |
934 | 0 | if (bgzf_flush_try(bfp, 4 + block_len) < 0) |
935 | 0 | return -1; |
936 | 0 | if (!bfp->mt) |
937 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); |
938 | |
|
939 | 0 | int ret = bam_write1(bfp, b); |
940 | 0 | if (ret < 0) |
941 | 0 | return -1; |
942 | | |
943 | 0 | if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) { |
944 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
945 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
946 | 0 | ret = -1; |
947 | 0 | } |
948 | |
|
949 | 0 | return ret; |
950 | 0 | } |
951 | | |
952 | | /* |
953 | | * Set the qname in a BAM record |
954 | | */ |
955 | | int bam_set_qname(bam1_t *rec, const char *qname) |
956 | 0 | { |
957 | 0 | if (!rec) return -1; |
958 | 0 | if (!qname || !*qname) return -1; |
959 | | |
960 | 0 | size_t old_len = rec->core.l_qname; |
961 | 0 | size_t new_len = strlen(qname) + 1; |
962 | 0 | if (new_len < 1 || new_len > 255) return -1; |
963 | | |
964 | 0 | int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0; |
965 | |
|
966 | 0 | size_t new_data_len = rec->l_data - old_len + new_len + extranul; |
967 | 0 | if (realloc_bam_data(rec, new_data_len) < 0) return -1; |
968 | | |
969 | | // Make room |
970 | 0 | if (new_len + extranul != rec->core.l_qname) |
971 | 0 | memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname); |
972 | | // Copy in new name and pad if needed |
973 | 0 | memcpy(rec->data, qname, new_len); |
974 | 0 | int n; |
975 | 0 | for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0'; |
976 | |
|
977 | 0 | rec->l_data = new_data_len; |
978 | 0 | rec->core.l_qname = new_len + extranul; |
979 | 0 | rec->core.l_extranul = extranul; |
980 | |
|
981 | 0 | return 0; |
982 | 0 | } |
983 | | |
984 | | /******************** |
985 | | *** BAM indexing *** |
986 | | ********************/ |
987 | | |
988 | | static hts_idx_t *sam_index(htsFile *fp, int min_shift) |
989 | 0 | { |
990 | 0 | int n_lvls, i, fmt, ret; |
991 | 0 | bam1_t *b; |
992 | 0 | hts_idx_t *idx; |
993 | 0 | sam_hdr_t *h; |
994 | 0 | h = sam_hdr_read(fp); |
995 | 0 | if (h == NULL) return NULL; |
996 | 0 | if (min_shift > 0) { |
997 | 0 | hts_pos_t max_len = 0, s; |
998 | 0 | for (i = 0; i < h->n_targets; ++i) { |
999 | 0 | hts_pos_t len = sam_hdr_tid2len(h, i); |
1000 | 0 | if (max_len < len) max_len = len; |
1001 | 0 | } |
1002 | 0 | max_len += 256; |
1003 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
1004 | 0 | fmt = HTS_FMT_CSI; |
1005 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1006 | 0 | idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1007 | 0 | b = bam_init1(); |
1008 | 0 | while ((ret = sam_read1(fp, h, b)) >= 0) { |
1009 | 0 | ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)); |
1010 | 0 | if (ret < 0) { // unsorted or doesn't fit |
1011 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
1012 | 0 | goto err; |
1013 | 0 | } |
1014 | 0 | } |
1015 | 0 | if (ret < -1) goto err; // corrupted BAM file |
1016 | | |
1017 | 0 | hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); |
1018 | 0 | sam_hdr_destroy(h); |
1019 | 0 | bam_destroy1(b); |
1020 | 0 | return idx; |
1021 | | |
1022 | 0 | err: |
1023 | 0 | bam_destroy1(b); |
1024 | 0 | hts_idx_destroy(idx); |
1025 | 0 | return NULL; |
1026 | 0 | } |
1027 | | |
1028 | | int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) |
1029 | 0 | { |
1030 | 0 | hts_idx_t *idx; |
1031 | 0 | htsFile *fp; |
1032 | 0 | int ret = 0; |
1033 | |
|
1034 | 0 | if ((fp = hts_open(fn, "r")) == 0) return -2; |
1035 | 0 | if (nthreads) |
1036 | 0 | hts_set_threads(fp, nthreads); |
1037 | |
|
1038 | 0 | switch (fp->format.format) { |
1039 | 0 | case cram: |
1040 | |
|
1041 | 0 | ret = cram_index_build(fp->fp.cram, fn, fnidx); |
1042 | 0 | break; |
1043 | | |
1044 | 0 | case bam: |
1045 | 0 | case sam: |
1046 | 0 | if (fp->format.compression != bgzf) { |
1047 | 0 | hts_log_error("%s file \"%s\" not BGZF compressed", |
1048 | 0 | fp->format.format == bam ? "BAM" : "SAM", fn); |
1049 | 0 | ret = -1; |
1050 | 0 | break; |
1051 | 0 | } |
1052 | 0 | idx = sam_index(fp, min_shift); |
1053 | 0 | if (idx) { |
1054 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); |
1055 | 0 | if (ret < 0) ret = -4; |
1056 | 0 | hts_idx_destroy(idx); |
1057 | 0 | } |
1058 | 0 | else ret = -1; |
1059 | 0 | break; |
1060 | | |
1061 | 0 | default: |
1062 | 0 | ret = -3; |
1063 | 0 | break; |
1064 | 0 | } |
1065 | 0 | hts_close(fp); |
1066 | |
|
1067 | 0 | return ret; |
1068 | 0 | } |
1069 | | |
// Single-threaded form of sam_index_build3() with an explicit index name.
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int nthreads = 0;  // no extra threads requested

    return sam_index_build3(fn, fnidx, min_shift, nthreads);
}
1074 | | |
// Single-threaded form of sam_index_build3() that passes no index file name.
int sam_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;
    const int nthreads = 0;

    return sam_index_build3(fn, fnidx, min_shift, nthreads);
}
1079 | | |
// Real bam_index_build() function, kept so that binaries linked against
// earlier HTSlib versions still find the symbol.  Forwards to
// sam_index_build2() with no index file name.
#undef bam_index_build
int bam_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;

    return sam_index_build2(fn, fnidx, min_shift);
}
1086 | | |
1087 | | // Initialise fp->idx for the current format type. |
1088 | | // This must be called after the header has been written but no other data. |
1089 | 0 | int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) { |
1090 | 0 | fp->fnidx = fnidx; |
1091 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1092 | 0 | (fp->format.format == sam && fp->format.compression == bgzf)) { |
1093 | 0 | int n_lvls, fmt = HTS_FMT_CSI; |
1094 | 0 | if (min_shift > 0) { |
1095 | 0 | int64_t max_len = 0, s; |
1096 | 0 | int i; |
1097 | 0 | for (i = 0; i < h->n_targets; ++i) |
1098 | 0 | if (max_len < h->target_len[i]) max_len = h->target_len[i]; |
1099 | 0 | max_len += 256; |
1100 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
1101 | |
|
1102 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1103 | |
|
1104 | 0 | fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1105 | 0 | return fp->idx ? 0 : -1; |
1106 | 0 | } |
1107 | | |
1108 | 0 | if (fp->format.format == cram) { |
1109 | 0 | fp->fp.cram->idxfp = bgzf_open(fnidx, "wg"); |
1110 | 0 | return fp->fp.cram->idxfp ? 0 : -1; |
1111 | 0 | } |
1112 | | |
1113 | 0 | return -1; |
1114 | 0 | } |
1115 | | |
1116 | | // Finishes an index. Call after the last record has been written. |
1117 | | // Returns 0 on success, <0 on failure. |
1118 | 0 | int sam_idx_save(htsFile *fp) { |
1119 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1120 | 0 | fp->format.format == vcf || fp->format.format == sam) { |
1121 | 0 | int ret; |
1122 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
1123 | 0 | errno = -ret; |
1124 | 0 | return -1; |
1125 | 0 | } |
1126 | 0 | if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) |
1127 | 0 | return -1; |
1128 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); |
1129 | |
|
1130 | 0 | if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0) |
1131 | 0 | return -1; |
1132 | | |
1133 | 0 | return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx)); |
1134 | |
|
1135 | 0 | } else if (fp->format.format == cram) { |
1136 | | // flushed and closed by cram_close |
1137 | 0 | } |
1138 | | |
1139 | 0 | return 0; |
1140 | 0 | } |
1141 | | |
1142 | | static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1143 | 0 | { |
1144 | 0 | htsFile *fp = (htsFile *)fpv; |
1145 | 0 | bam1_t *b = bv; |
1146 | 0 | fp->line.l = 0; |
1147 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1148 | 0 | if (ret >= 0) { |
1149 | 0 | *tid = b->core.tid; |
1150 | 0 | *beg = b->core.pos; |
1151 | 0 | *end = bam_endpos(b); |
1152 | 0 | } |
1153 | 0 | return ret; |
1154 | 0 | } |
1155 | | |
1156 | | // This is used only with read_rest=1 iterators, so need not set tid/beg/end. |
1157 | | static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1158 | 0 | { |
1159 | 0 | htsFile *fp = (htsFile *)fpv; |
1160 | 0 | bam1_t *b = bv; |
1161 | 0 | fp->line.l = 0; |
1162 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1163 | 0 | return ret; |
1164 | 0 | } |
1165 | | |
1166 | | // Internal (for now) func used by bam_sym_lookup. This is copied from |
1167 | | // samtools/bam.c. |
1168 | | static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) |
1169 | 0 | { |
1170 | 0 | const char *rg; |
1171 | 0 | kstring_t lib = { 0, 0, NULL }; |
1172 | 0 | rg = (char *)bam_aux_get(b, "RG"); |
1173 | |
|
1174 | 0 | if (!rg) |
1175 | 0 | return NULL; |
1176 | 0 | else |
1177 | 0 | rg++; |
1178 | | |
1179 | 0 | if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) |
1180 | 0 | return NULL; |
1181 | | |
1182 | 0 | static char LB_text[1024]; |
1183 | 0 | int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; |
1184 | |
|
1185 | 0 | memcpy(LB_text, lib.s, len); |
1186 | 0 | LB_text[len] = 0; |
1187 | |
|
1188 | 0 | free(lib.s); |
1189 | |
|
1190 | 0 | return LB_text; |
1191 | 0 | } |
1192 | | |
1193 | | |
// Bam record pointer and SAM header combined, so that both can be handed to
// bam_sym_lookup through its single "data" argument.
typedef struct {
    const sam_hdr_t *h;  // header, used to resolve reference and library names
    const bam1_t *b;     // record whose fields are being looked up
} hb_pair;
1199 | | |
1200 | | // Looks up variable names in str and replaces them with their value. |
1201 | | // Also supports aux tags. |
1202 | | // |
1203 | | // Note the expression parser deliberately overallocates str size so it |
1204 | | // is safe to use memcmp over strcmp. |
1205 | | static int bam_sym_lookup(void *data, char *str, char **end, |
1206 | 0 | hts_expr_val_t *res) { |
1207 | 0 | hb_pair *hb = (hb_pair *)data; |
1208 | 0 | const bam1_t *b = hb->b; |
1209 | |
|
1210 | 0 | res->is_str = 0; |
1211 | 0 | switch(*str) { |
1212 | 0 | case 'c': |
1213 | 0 | if (memcmp(str, "cigar", 5) == 0) { |
1214 | 0 | *end = str+5; |
1215 | 0 | res->is_str = 1; |
1216 | 0 | ks_clear(&res->s); |
1217 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1218 | 0 | int i, n = b->core.n_cigar, r = 0; |
1219 | 0 | if (n) { |
1220 | 0 | for (i = 0; i < n; i++) { |
1221 | 0 | r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; |
1222 | 0 | r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; |
1223 | 0 | } |
1224 | 0 | r |= kputs("", &res->s) < 0; |
1225 | 0 | } else { |
1226 | 0 | r |= kputs("*", &res->s) < 0; |
1227 | 0 | } |
1228 | 0 | return r ? -1 : 0; |
1229 | 0 | } |
1230 | 0 | break; |
1231 | | |
1232 | 0 | case 'e': |
1233 | 0 | if (memcmp(str, "endpos", 6) == 0) { |
1234 | 0 | *end = str+6; |
1235 | 0 | res->d = bam_endpos(b); |
1236 | 0 | return 0; |
1237 | 0 | } |
1238 | 0 | break; |
1239 | | |
1240 | 0 | case 'f': |
1241 | 0 | if (memcmp(str, "flag", 4) == 0) { |
1242 | 0 | str = *end = str+4; |
1243 | 0 | if (*str != '.') { |
1244 | 0 | res->d = b->core.flag; |
1245 | 0 | return 0; |
1246 | 0 | } else { |
1247 | 0 | str++; |
1248 | 0 | if (!memcmp(str, "paired", 6)) { |
1249 | 0 | *end = str+6; |
1250 | 0 | res->d = b->core.flag & BAM_FPAIRED; |
1251 | 0 | return 0; |
1252 | 0 | } else if (!memcmp(str, "proper_pair", 11)) { |
1253 | 0 | *end = str+11; |
1254 | 0 | res->d = b->core.flag & BAM_FPROPER_PAIR; |
1255 | 0 | return 0; |
1256 | 0 | } else if (!memcmp(str, "unmap", 5)) { |
1257 | 0 | *end = str+5; |
1258 | 0 | res->d = b->core.flag & BAM_FUNMAP; |
1259 | 0 | return 0; |
1260 | 0 | } else if (!memcmp(str, "munmap", 6)) { |
1261 | 0 | *end = str+6; |
1262 | 0 | res->d = b->core.flag & BAM_FMUNMAP; |
1263 | 0 | return 0; |
1264 | 0 | } else if (!memcmp(str, "reverse", 7)) { |
1265 | 0 | *end = str+7; |
1266 | 0 | res->d = b->core.flag & BAM_FREVERSE; |
1267 | 0 | return 0; |
1268 | 0 | } else if (!memcmp(str, "mreverse", 8)) { |
1269 | 0 | *end = str+8; |
1270 | 0 | res->d = b->core.flag & BAM_FMREVERSE; |
1271 | 0 | return 0; |
1272 | 0 | } else if (!memcmp(str, "read1", 5)) { |
1273 | 0 | *end = str+5; |
1274 | 0 | res->d = b->core.flag & BAM_FREAD1; |
1275 | 0 | return 0; |
1276 | 0 | } else if (!memcmp(str, "read2", 5)) { |
1277 | 0 | *end = str+5; |
1278 | 0 | res->d = b->core.flag & BAM_FREAD2; |
1279 | 0 | return 0; |
1280 | 0 | } else if (!memcmp(str, "secondary", 9)) { |
1281 | 0 | *end = str+9; |
1282 | 0 | res->d = b->core.flag & BAM_FSECONDARY; |
1283 | 0 | return 0; |
1284 | 0 | } else if (!memcmp(str, "qcfail", 6)) { |
1285 | 0 | *end = str+6; |
1286 | 0 | res->d = b->core.flag & BAM_FQCFAIL; |
1287 | 0 | return 0; |
1288 | 0 | } else if (!memcmp(str, "dup", 3)) { |
1289 | 0 | *end = str+3; |
1290 | 0 | res->d = b->core.flag & BAM_FDUP; |
1291 | 0 | return 0; |
1292 | 0 | } else if (!memcmp(str, "supplementary", 13)) { |
1293 | 0 | *end = str+13; |
1294 | 0 | res->d = b->core.flag & BAM_FSUPPLEMENTARY; |
1295 | 0 | return 0; |
1296 | 0 | } else { |
1297 | 0 | hts_log_error("Unrecognised flag string"); |
1298 | 0 | return -1; |
1299 | 0 | } |
1300 | 0 | } |
1301 | 0 | } |
1302 | 0 | break; |
1303 | | |
1304 | 0 | case 'h': |
1305 | 0 | if (memcmp(str, "hclen", 5) == 0) { |
1306 | 0 | int hclen = 0; |
1307 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1308 | 0 | uint32_t ncigar = b->core.n_cigar; |
1309 | | |
1310 | | // left |
1311 | 0 | if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) |
1312 | 0 | hclen = bam_cigar_oplen(cigar[0]); |
1313 | | |
1314 | | // right |
1315 | 0 | if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP) |
1316 | 0 | hclen += bam_cigar_oplen(cigar[ncigar-1]); |
1317 | |
|
1318 | 0 | *end = str+5; |
1319 | 0 | res->d = hclen; |
1320 | 0 | return 0; |
1321 | 0 | } |
1322 | 0 | break; |
1323 | | |
1324 | 0 | case 'l': |
1325 | 0 | if (memcmp(str, "library", 7) == 0) { |
1326 | 0 | *end = str+7; |
1327 | 0 | res->is_str = 1; |
1328 | 0 | const char *lib = bam_get_library(hb->h, b); |
1329 | 0 | kputs(lib ? lib : "", ks_clear(&res->s)); |
1330 | 0 | return 0; |
1331 | 0 | } |
1332 | 0 | break; |
1333 | | |
1334 | 0 | case 'm': |
1335 | 0 | if (memcmp(str, "mapq", 4) == 0) { |
1336 | 0 | *end = str+4; |
1337 | 0 | res->d = b->core.qual; |
1338 | 0 | return 0; |
1339 | 0 | } else if (memcmp(str, "mpos", 4) == 0) { |
1340 | 0 | *end = str+4; |
1341 | 0 | res->d = b->core.mpos+1; |
1342 | 0 | return 0; |
1343 | 0 | } else if (memcmp(str, "mrname", 6) == 0) { |
1344 | 0 | *end = str+6; |
1345 | 0 | res->is_str = 1; |
1346 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1347 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1348 | 0 | return 0; |
1349 | 0 | } else if (memcmp(str, "mrefid", 6) == 0) { |
1350 | 0 | *end = str+6; |
1351 | 0 | res->d = b->core.mtid; |
1352 | 0 | return 0; |
1353 | 0 | } |
1354 | 0 | break; |
1355 | | |
1356 | 0 | case 'n': |
1357 | 0 | if (memcmp(str, "ncigar", 6) == 0) { |
1358 | 0 | *end = str+6; |
1359 | 0 | res->d = b->core.n_cigar; |
1360 | 0 | return 0; |
1361 | 0 | } |
1362 | 0 | break; |
1363 | | |
1364 | 0 | case 'p': |
1365 | 0 | if (memcmp(str, "pos", 3) == 0) { |
1366 | 0 | *end = str+3; |
1367 | 0 | res->d = b->core.pos+1; |
1368 | 0 | return 0; |
1369 | 0 | } else if (memcmp(str, "pnext", 5) == 0) { |
1370 | 0 | *end = str+5; |
1371 | 0 | res->d = b->core.mpos+1; |
1372 | 0 | return 0; |
1373 | 0 | } |
1374 | 0 | break; |
1375 | | |
1376 | 0 | case 'q': |
1377 | 0 | if (memcmp(str, "qlen", 4) == 0) { |
1378 | 0 | *end = str+4; |
1379 | 0 | res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); |
1380 | 0 | return 0; |
1381 | 0 | } else if (memcmp(str, "qname", 5) == 0) { |
1382 | 0 | *end = str+5; |
1383 | 0 | res->is_str = 1; |
1384 | 0 | kputs(bam_get_qname(b), ks_clear(&res->s)); |
1385 | 0 | return 0; |
1386 | 0 | } else if (memcmp(str, "qual", 4) == 0) { |
1387 | 0 | *end = str+4; |
1388 | 0 | ks_clear(&res->s); |
1389 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1390 | 0 | return -1; |
1391 | 0 | memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); |
1392 | 0 | res->s.l = b->core.l_qseq; |
1393 | 0 | res->is_str = 1; |
1394 | 0 | return 0; |
1395 | 0 | } |
1396 | 0 | break; |
1397 | | |
1398 | 0 | case 'r': |
1399 | 0 | if (memcmp(str, "rlen", 4) == 0) { |
1400 | 0 | *end = str+4; |
1401 | 0 | res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
1402 | 0 | return 0; |
1403 | 0 | } else if (memcmp(str, "rname", 5) == 0) { |
1404 | 0 | *end = str+5; |
1405 | 0 | res->is_str = 1; |
1406 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); |
1407 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1408 | 0 | return 0; |
1409 | 0 | } else if (memcmp(str, "rnext", 5) == 0) { |
1410 | 0 | *end = str+5; |
1411 | 0 | res->is_str = 1; |
1412 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1413 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1414 | 0 | return 0; |
1415 | 0 | } else if (memcmp(str, "refid", 5) == 0) { |
1416 | 0 | *end = str+5; |
1417 | 0 | res->d = b->core.tid; |
1418 | 0 | return 0; |
1419 | 0 | } |
1420 | 0 | break; |
1421 | | |
1422 | 0 | case 's': |
1423 | 0 | if (memcmp(str, "seq", 3) == 0) { |
1424 | 0 | *end = str+3; |
1425 | 0 | ks_clear(&res->s); |
1426 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1427 | 0 | return -1; |
1428 | 0 | nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); |
1429 | 0 | res->s.s[b->core.l_qseq] = 0; |
1430 | 0 | res->s.l = b->core.l_qseq; |
1431 | 0 | res->is_str = 1; |
1432 | 0 | return 0; |
1433 | 0 | } else if (memcmp(str, "sclen", 5) == 0) { |
1434 | 0 | int sclen = 0; |
1435 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1436 | 0 | int ncigar = b->core.n_cigar; |
1437 | 0 | int left = 0; |
1438 | | |
1439 | | // left |
1440 | 0 | if (ncigar > 0 |
1441 | 0 | && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) |
1442 | 0 | left = 0, sclen += bam_cigar_oplen(cigar[0]); |
1443 | 0 | else if (ncigar > 1 |
1444 | 0 | && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP |
1445 | 0 | && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) |
1446 | 0 | left = 1, sclen += bam_cigar_oplen(cigar[1]); |
1447 | | |
1448 | | // right |
1449 | 0 | if (ncigar-1 > left |
1450 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) |
1451 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-1]); |
1452 | 0 | else if (ncigar-2 > left |
1453 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP |
1454 | 0 | && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) |
1455 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-2]); |
1456 | |
|
1457 | 0 | *end = str+5; |
1458 | 0 | res->d = sclen; |
1459 | 0 | return 0; |
1460 | 0 | } |
1461 | 0 | break; |
1462 | | |
1463 | 0 | case 't': |
1464 | 0 | if (memcmp(str, "tlen", 4) == 0) { |
1465 | 0 | *end = str+4; |
1466 | 0 | res->d = b->core.isize; |
1467 | 0 | return 0; |
1468 | 0 | } |
1469 | 0 | break; |
1470 | | |
1471 | 0 | case '[': |
1472 | 0 | if (*str == '[' && str[1] && str[2] && str[3] == ']') { |
1473 | | /* aux tags */ |
1474 | 0 | *end = str+4; |
1475 | |
|
1476 | 0 | uint8_t *aux = bam_aux_get(b, str+1); |
1477 | 0 | if (aux) { |
1478 | | // we define the truth of a tag to be its presence, even if 0. |
1479 | 0 | res->is_true = 1; |
1480 | 0 | switch (*aux) { |
1481 | 0 | case 'Z': |
1482 | 0 | case 'H': |
1483 | 0 | res->is_str = 1; |
1484 | 0 | kputs((char *)aux+1, ks_clear(&res->s)); |
1485 | 0 | break; |
1486 | | |
1487 | 0 | case 'A': |
1488 | 0 | res->is_str = 1; |
1489 | 0 | kputsn((char *)aux+1, 1, ks_clear(&res->s)); |
1490 | 0 | break; |
1491 | | |
1492 | 0 | case 'i': case 'I': |
1493 | 0 | case 's': case 'S': |
1494 | 0 | case 'c': case 'C': |
1495 | 0 | res->is_str = 0; |
1496 | 0 | res->d = bam_aux2i(aux); |
1497 | 0 | break; |
1498 | | |
1499 | 0 | case 'f': |
1500 | 0 | case 'd': |
1501 | 0 | res->is_str = 0; |
1502 | 0 | res->d = bam_aux2f(aux); |
1503 | 0 | break; |
1504 | | |
1505 | 0 | default: |
1506 | 0 | hts_log_error("Aux type '%c not yet supported by filters", |
1507 | 0 | *aux); |
1508 | 0 | return -1; |
1509 | 0 | } |
1510 | 0 | return 0; |
1511 | |
|
1512 | 0 | } else { |
1513 | | // hence absent tags are always false (and strings) |
1514 | 0 | res->is_str = 1; |
1515 | 0 | res->s.l = 0; |
1516 | 0 | res->d = 0; |
1517 | 0 | res->is_true = 0; |
1518 | 0 | return 0; |
1519 | 0 | } |
1520 | 0 | } |
1521 | 0 | break; |
1522 | 0 | } |
1523 | | |
1524 | | // All successful matches in switch should return 0. |
1525 | | // So if we didn't match, it's a parse error. |
1526 | 0 | return -1; |
1527 | 0 | } |
1528 | | |
1529 | | // Returns 1 when accepted by the filter, 0 if not, -1 on error. |
1530 | | int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) |
1531 | 0 | { |
1532 | 0 | hb_pair hb = {h, b}; |
1533 | 0 | hts_expr_val_t res = HTS_EXPR_VAL_INIT; |
1534 | 0 | if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { |
1535 | 0 | hts_log_error("Couldn't process filter expression"); |
1536 | 0 | hts_expr_val_free(&res); |
1537 | 0 | return -1; |
1538 | 0 | } |
1539 | | |
1540 | 0 | int t = res.is_true; |
1541 | 0 | hts_expr_val_free(&res); |
1542 | |
|
1543 | 0 | return t; |
1544 | 0 | } |
1545 | | |
1546 | | static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1547 | 0 | { |
1548 | 0 | htsFile *fp = fpv; |
1549 | 0 | bam1_t *b = bv; |
1550 | 0 | int pass_filter, ret; |
1551 | |
|
1552 | 0 | do { |
1553 | 0 | ret = cram_get_bam_seq(fp->fp.cram, &b); |
1554 | 0 | if (ret < 0) |
1555 | 0 | return cram_eof(fp->fp.cram) ? -1 : -2; |
1556 | | |
1557 | 0 | if (bam_tag2cigar(b, 1, 1) < 0) |
1558 | 0 | return -2; |
1559 | | |
1560 | 0 | *tid = b->core.tid; |
1561 | 0 | *beg = b->core.pos; |
1562 | 0 | *end = bam_endpos(b); |
1563 | |
|
1564 | 0 | if (fp->filter) { |
1565 | 0 | pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); |
1566 | 0 | if (pass_filter < 0) |
1567 | 0 | return -2; |
1568 | 0 | } else { |
1569 | 0 | pass_filter = 1; |
1570 | 0 | } |
1571 | 0 | } while (pass_filter == 0); |
1572 | | |
1573 | 0 | return ret; |
1574 | 0 | } |
1575 | | |
1576 | | static int cram_pseek(void *fp, int64_t offset, int whence) |
1577 | 0 | { |
1578 | 0 | cram_fd *fd = (cram_fd *)fp; |
1579 | |
|
1580 | 0 | if ((0 != cram_seek(fd, offset, SEEK_SET)) |
1581 | 0 | && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) |
1582 | 0 | return -1; |
1583 | | |
1584 | 0 | fd->curr_position = offset; |
1585 | |
|
1586 | 0 | if (fd->ctr) { |
1587 | 0 | cram_free_container(fd->ctr); |
1588 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
1589 | 0 | cram_free_container(fd->ctr_mt); |
1590 | |
|
1591 | 0 | fd->ctr = NULL; |
1592 | 0 | fd->ctr_mt = NULL; |
1593 | 0 | fd->ooc = 0; |
1594 | 0 | } |
1595 | |
|
1596 | 0 | return 0; |
1597 | 0 | } |
1598 | | |
1599 | | /* |
1600 | | * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only |
1601 | | * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered |
1602 | | * container previously fetched. It was designed like this to integrate with the functionality |
1603 | | * of the iterator stepping logic. |
1604 | | */ |
1605 | | |
1606 | | static int64_t cram_ptell(void *fp) |
1607 | 0 | { |
1608 | 0 | cram_fd *fd = (cram_fd *)fp; |
1609 | 0 | cram_container *c; |
1610 | 0 | cram_slice *s; |
1611 | 0 | int64_t ret = -1L; |
1612 | |
|
1613 | 0 | if (fd) { |
1614 | 0 | if ((c = fd->ctr) != NULL) { |
1615 | 0 | if ((s = c->slice) != NULL && s->max_rec) { |
1616 | 0 | if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1)) |
1617 | 0 | fd->curr_position += c->offset + c->length; |
1618 | 0 | } |
1619 | 0 | } |
1620 | 0 | ret = fd->curr_position; |
1621 | 0 | } |
1622 | |
|
1623 | 0 | return ret; |
1624 | 0 | } |
1625 | | |
1626 | | static int bam_pseek(void *fp, int64_t offset, int whence) |
1627 | 0 | { |
1628 | 0 | BGZF *fd = (BGZF *)fp; |
1629 | |
|
1630 | 0 | return bgzf_seek(fd, offset, whence); |
1631 | 0 | } |
1632 | | |
1633 | | static int64_t bam_ptell(void *fp) |
1634 | 0 | { |
1635 | 0 | BGZF *fd = (BGZF *)fp; |
1636 | 0 | if (!fd) |
1637 | 0 | return -1L; |
1638 | | |
1639 | 0 | return bgzf_tell(fd); |
1640 | 0 | } |
1641 | | |
1642 | | |
1643 | | |
1644 | | static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1645 | 0 | { |
1646 | 0 | switch (fp->format.format) { |
1647 | 0 | case bam: |
1648 | 0 | case sam: |
1649 | 0 | return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags); |
1650 | | |
1651 | 0 | case cram: { |
1652 | 0 | if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL; |
1653 | | |
1654 | | // Cons up a fake "index" just pointing at the associated cram_fd: |
1655 | 0 | hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); |
1656 | 0 | if (idx == NULL) return NULL; |
1657 | 0 | idx->fmt = HTS_FMT_CRAI; |
1658 | 0 | idx->cram = fp->fp.cram; |
1659 | 0 | return (hts_idx_t *) idx; |
1660 | 0 | } |
1661 | | |
1662 | 0 | default: |
1663 | 0 | return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t |
1664 | 0 | } |
1665 | 0 | } |
1666 | | |
1667 | | hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1668 | 0 | { |
1669 | 0 | return index_load(fp, fn, fnidx, flags); |
1670 | 0 | } |
1671 | | |
1672 | 0 | hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) { |
1673 | 0 | return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); |
1674 | 0 | } |
1675 | | |
1676 | | hts_idx_t *sam_index_load(htsFile *fp, const char *fn) |
1677 | 0 | { |
1678 | 0 | return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); |
1679 | 0 | } |
1680 | | |
1681 | | static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) |
1682 | 0 | { |
1683 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1684 | 0 | hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); |
1685 | 0 | if (iter == NULL) return NULL; |
1686 | | |
1687 | | // Cons up a dummy iterator for which hts_itr_next() will simply invoke |
1688 | | // the readrec function: |
1689 | 0 | iter->is_cram = 1; |
1690 | 0 | iter->read_rest = 1; |
1691 | 0 | iter->off = NULL; |
1692 | 0 | iter->bins.a = NULL; |
1693 | 0 | iter->readrec = readrec; |
1694 | |
|
1695 | 0 | if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) { |
1696 | 0 | cram_range r = { tid, beg+1, end }; |
1697 | 0 | int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r); |
1698 | |
|
1699 | 0 | iter->curr_off = 0; |
1700 | | // The following fields are not required by hts_itr_next(), but are |
1701 | | // filled in in case user code wants to look at them. |
1702 | 0 | iter->tid = tid; |
1703 | 0 | iter->beg = beg; |
1704 | 0 | iter->end = end; |
1705 | |
|
1706 | 0 | switch (ret) { |
1707 | 0 | case 0: |
1708 | 0 | break; |
1709 | | |
1710 | 0 | case -2: |
1711 | | // No data vs this ref, so mark iterator as completed. |
1712 | | // Same as HTS_IDX_NONE. |
1713 | 0 | iter->finished = 1; |
1714 | 0 | break; |
1715 | | |
1716 | 0 | default: |
1717 | 0 | free(iter); |
1718 | 0 | return NULL; |
1719 | 0 | } |
1720 | 0 | } |
1721 | 0 | else switch (tid) { |
1722 | 0 | case HTS_IDX_REST: |
1723 | 0 | iter->curr_off = 0; |
1724 | 0 | break; |
1725 | 0 | case HTS_IDX_NONE: |
1726 | 0 | iter->curr_off = 0; |
1727 | 0 | iter->finished = 1; |
1728 | 0 | break; |
1729 | 0 | default: |
1730 | 0 | hts_log_error("Query with tid=%d not implemented for CRAM files", tid); |
1731 | 0 | abort(); |
1732 | 0 | break; |
1733 | 0 | } |
1734 | | |
1735 | 0 | return iter; |
1736 | 0 | } |
1737 | | |
1738 | | hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) |
1739 | 0 | { |
1740 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1741 | 0 | if (idx == NULL) |
1742 | 0 | return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest); |
1743 | 0 | else if (cidx->fmt == HTS_FMT_CRAI) |
1744 | 0 | return cram_itr_query(idx, tid, beg, end, sam_readrec); |
1745 | 0 | else |
1746 | 0 | return hts_itr_query(idx, tid, beg, end, sam_readrec); |
1747 | 0 | } |
1748 | | |
1749 | | static int cram_name2id(void *fdv, const char *ref) |
1750 | 0 | { |
1751 | 0 | cram_fd *fd = (cram_fd *) fdv; |
1752 | 0 | return sam_hdr_name2tid(fd->header, ref); |
1753 | 0 | } |
1754 | | |
1755 | | hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) |
1756 | 0 | { |
1757 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1758 | 0 | return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, |
1759 | 0 | cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query, |
1760 | 0 | sam_readrec); |
1761 | 0 | } |
1762 | | |
1763 | | hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) |
1764 | 0 | { |
1765 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1766 | 0 | hts_reglist_t *r_list = NULL; |
1767 | 0 | int r_count = 0; |
1768 | |
|
1769 | 0 | if (!cidx || !hdr) |
1770 | 0 | return NULL; |
1771 | | |
1772 | 0 | hts_itr_t *itr = NULL; |
1773 | 0 | if (cidx->fmt == HTS_FMT_CRAI) { |
1774 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id); |
1775 | 0 | if (!r_list) |
1776 | 0 | return NULL; |
1777 | 0 | itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram, |
1778 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1779 | 0 | } else { |
1780 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id)); |
1781 | 0 | if (!r_list) |
1782 | 0 | return NULL; |
1783 | 0 | itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr, |
1784 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1785 | 0 | } |
1786 | | |
1787 | 0 | if (!itr) |
1788 | 0 | hts_reglist_free(r_list, r_count); |
1789 | |
|
1790 | 0 | return itr; |
1791 | 0 | } |
1792 | | |
1793 | | hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) |
1794 | 0 | { |
1795 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1796 | |
|
1797 | 0 | if(!cidx || !hdr || !reglist) |
1798 | 0 | return NULL; |
1799 | | |
1800 | 0 | if (cidx->fmt == HTS_FMT_CRAI) |
1801 | 0 | return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, |
1802 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1803 | 0 | else |
1804 | 0 | return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, |
1805 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1806 | 0 | } |
1807 | | |
1808 | | /********************** |
1809 | | *** SAM header I/O *** |
1810 | | **********************/ |
1811 | | |
1812 | | #include "htslib/kseq.h" |
1813 | | #include "htslib/kstring.h" |
1814 | | |
1815 | | sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) |
1816 | 0 | { |
1817 | 0 | sam_hdr_t *bh = sam_hdr_init(); |
1818 | 0 | if (!bh) return NULL; |
1819 | | |
1820 | 0 | if (sam_hdr_add_lines(bh, text, l_text) != 0) { |
1821 | 0 | sam_hdr_destroy(bh); |
1822 | 0 | return NULL; |
1823 | 0 | } |
1824 | | |
1825 | 0 | return bh; |
1826 | 0 | } |
1827 | | |
// Check that a header line starts with a recognised record type.
//
// Accepts "@HD", "@SQ", "@RG" and "@PG" when immediately followed by a
// tab, and "@CO" regardless of what follows.  Returns 1 if the line is
// acceptable and 0 otherwise.  Characters are examined strictly left to
// right, so nothing past the terminating NUL of a short string is read.
static int valid_sam_header_type(const char *s) {
    char second;       // second letter required for this record type
    int need_tab = 1;  // whether the type must be followed by a tab

    if (s[0] != '@')
        return 0;

    switch (s[1]) {
    case 'H': second = 'D'; break;
    case 'S': second = 'Q'; break;
    case 'R':
    case 'P': second = 'G'; break;
    case 'C': second = 'O'; need_tab = 0; break;
    default:  return 0;
    }

    if (s[2] != second)
        return 0;

    return !need_tab || s[3] == '\t';
}
1843 | | |
1844 | | // Minimal sanitisation of a header to ensure. |
1845 | | // - null terminated string. |
1846 | | // - all lines start with @ (also implies no blank lines). |
1847 | | // |
1848 | | // Much more could be done, but currently is not, including: |
1849 | | // - checking header types are known (HD, SQ, etc). |
1850 | | // - syntax (eg checking tab separated fields). |
1851 | | // - validating n_targets matches @SQ records. |
1852 | | // - validating target lengths against @SQ records. |
1853 | 20.3k | static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { |
1854 | 20.3k | if (!h) |
1855 | 645 | return NULL; |
1856 | | |
1857 | | // Special case for empty headers. |
1858 | 19.7k | if (h->l_text == 0) |
1859 | 6.95k | return h; |
1860 | | |
1861 | 12.7k | size_t i; |
1862 | 12.7k | unsigned int lnum = 0; |
1863 | 12.7k | char *cp = h->text, last = '\n'; |
1864 | 28.5M | for (i = 0; i < h->l_text; i++) { |
1865 | | // NB: l_text excludes terminating nul. This finds early ones. |
1866 | 28.5M | if (cp[i] == 0) |
1867 | 3.87k | break; |
1868 | | |
1869 | | // Error on \n[^@], including duplicate newlines |
1870 | 28.5M | if (last == '\n') { |
1871 | 60.7k | lnum++; |
1872 | 60.7k | if (cp[i] != '@') { |
1873 | 24 | hts_log_error("Malformed SAM header at line %u", lnum); |
1874 | 24 | sam_hdr_destroy(h); |
1875 | 24 | return NULL; |
1876 | 24 | } |
1877 | 60.7k | } |
1878 | | |
1879 | 28.5M | last = cp[i]; |
1880 | 28.5M | } |
1881 | | |
1882 | 12.7k | if (i < h->l_text) { // Early nul found. Complain if not just padding. |
1883 | 3.87k | size_t j = i; |
1884 | 19.0k | while (j < h->l_text && cp[j] == '\0') j++; |
1885 | 3.87k | if (j < h->l_text) |
1886 | 3.71k | hts_log_warning("Unexpected NUL character in header. Possibly truncated"); |
1887 | 3.87k | } |
1888 | | |
1889 | | // Add trailing newline and/or trailing nul if required. |
1890 | 12.7k | if (last != '\n') { |
1891 | 3.74k | hts_log_warning("Missing trailing newline on SAM header. Possibly truncated"); |
1892 | | |
1893 | 3.74k | if (h->l_text < 2 || i >= h->l_text - 2) { |
1894 | 396 | if (h->l_text >= SIZE_MAX - 2) { |
1895 | 0 | hts_log_error("No room for extra newline"); |
1896 | 0 | sam_hdr_destroy(h); |
1897 | 0 | return NULL; |
1898 | 0 | } |
1899 | | |
1900 | 396 | cp = realloc(h->text, (size_t) h->l_text+2); |
1901 | 396 | if (!cp) { |
1902 | 0 | sam_hdr_destroy(h); |
1903 | 0 | return NULL; |
1904 | 0 | } |
1905 | 396 | h->text = cp; |
1906 | 396 | } |
1907 | 3.74k | cp[i++] = '\n'; |
1908 | | |
1909 | | // l_text may be larger already due to multiple nul padding |
1910 | 3.74k | if (h->l_text < i) |
1911 | 60 | h->l_text = i; |
1912 | 3.74k | cp[h->l_text] = '\0'; |
1913 | 3.74k | } |
1914 | | |
1915 | 12.7k | return h; |
1916 | 12.7k | } |
1917 | | |
// Emit the pair of warnings used when a SAM file appears to contain a
// tool's error/log output: first one naming the tool, then the advice
// string on how to avoid the problem.
static void known_stderr(const char *tool, const char *advice)
{
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
    hts_log_warning("%s", advice);
}
1922 | | |
// Look for log-message fragments from known tools in a header line and,
// for the first one found, warn via known_stderr().  The patterns are
// tried in the order listed; lines matching none of them produce no
// output.
static void warn_if_known_stderr(const char *line) {
    static const struct {
        const char *marker;   // substring to look for in the line
        const char *tool;     // tool name reported in the warning
        const char *advice;   // how to avoid the problem
    } known[] = {
        { "M::bwa_idx_load_from_disk", "bwa",
          "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`" },
        { "M::mem_pestat", "bwa",
          "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`" },
        { "loaded/built the index", "minimap2",
          "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`" },
    };
    size_t i;

    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
        if (strstr(line, known[i].marker) != NULL) {
            known_stderr(known[i].tool, known[i].advice);
            return;
        }
    }
}
1931 | | |
1932 | 12.5k | static sam_hdr_t *sam_hdr_create(htsFile* fp) { |
1933 | 12.5k | kstring_t str = { 0, 0, NULL }; |
1934 | 12.5k | khint_t k; |
1935 | 12.5k | sam_hdr_t* h = sam_hdr_init(); |
1936 | 12.5k | const char *q, *r; |
1937 | 12.5k | char* sn = NULL; |
1938 | 12.5k | khash_t(s2i) *d = kh_init(s2i); |
1939 | 12.5k | khash_t(s2i) *long_refs = NULL; |
1940 | 12.5k | if (!h || !d) |
1941 | 0 | goto error; |
1942 | | |
1943 | 12.5k | int ret, has_SQ = 0; |
1944 | 12.5k | int next_c = '@'; |
1945 | 160k | while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) { |
1946 | 148k | if (fp->line.s[0] != '@') |
1947 | 63 | break; |
1948 | | |
1949 | 148k | if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) { |
1950 | 52.8k | has_SQ = 1; |
1951 | 52.8k | hts_pos_t ln = -1; |
1952 | 173k | for (q = fp->line.s + 4;; ++q) { |
1953 | 173k | if (strncmp(q, "SN:", 3) == 0) { |
1954 | 52.5k | q += 3; |
1955 | 750M | for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r); |
1956 | | |
1957 | 52.5k | if (sn) { |
1958 | 13.5k | hts_log_warning("SQ header line has more than one SN: tag"); |
1959 | 13.5k | free(sn); |
1960 | 13.5k | } |
1961 | 52.5k | sn = (char*)calloc(r - q + 1, 1); |
1962 | 52.5k | if (!sn) |
1963 | 0 | goto error; |
1964 | | |
1965 | 52.5k | strncpy(sn, q, r - q); |
1966 | 52.5k | q = r; |
1967 | 121k | } else { |
1968 | 121k | if (strncmp(q, "LN:", 3) == 0) { |
1969 | 45.7k | hts_pos_t tmp = strtoll(q + 3, (char**)&q, 10); |
1970 | 45.7k | if (ln != -1 && ln != tmp) { //duplicate & different LN |
1971 | 285 | hts_log_error("Header includes @SQ line \"%s\" with" |
1972 | 285 | " multiple LN: tag with different values.", sn); |
1973 | 285 | goto error; |
1974 | 45.4k | } else { |
1975 | 45.4k | ln = tmp; |
1976 | 45.4k | } |
1977 | 45.7k | } |
1978 | 121k | } |
1979 | | |
1980 | 26.0M | while (*q != '\t' && *q != '\n' && *q != '\0') |
1981 | 25.9M | ++q; |
1982 | 173k | if (*q == '\0' || *q == '\n') |
1983 | 52.5k | break; |
1984 | 173k | } |
1985 | 52.5k | if (sn) { |
1986 | 38.9k | if (ln >= 0) { |
1987 | 33.9k | int absent; |
1988 | 33.9k | k = kh_put(s2i, d, sn, &absent); |
1989 | 33.9k | if (absent < 0) |
1990 | 0 | goto error; |
1991 | | |
1992 | 33.9k | if (!absent) { |
1993 | 14.5k | hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn); |
1994 | 14.5k | free(sn); |
1995 | 19.3k | } else { |
1996 | 19.3k | sn = NULL; |
1997 | 19.3k | if (ln >= UINT32_MAX) { |
1998 | | // Stash away ref length that |
1999 | | // doesn't fit in target_len array |
2000 | 4.41k | int k2; |
2001 | 4.41k | if (!long_refs) { |
2002 | 561 | long_refs = kh_init(s2i); |
2003 | 561 | if (!long_refs) |
2004 | 0 | goto error; |
2005 | 561 | } |
2006 | 4.41k | k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); |
2007 | 4.41k | if (absent < 0) |
2008 | 0 | goto error; |
2009 | 4.41k | kh_val(long_refs, k2) = ln; |
2010 | 4.41k | kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 |
2011 | 4.41k | | UINT32_MAX); |
2012 | 14.9k | } else { |
2013 | 14.9k | kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; |
2014 | 14.9k | } |
2015 | 19.3k | } |
2016 | 33.9k | } else { |
2017 | 4.93k | hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); |
2018 | 4.93k | warn_if_known_stderr(fp->line.s); |
2019 | 4.93k | free(sn); |
2020 | 4.93k | } |
2021 | 38.9k | } else { |
2022 | 13.6k | hts_log_warning("Ignored @SQ line with missing SN: tag"); |
2023 | 13.6k | warn_if_known_stderr(fp->line.s); |
2024 | 13.6k | } |
2025 | 52.5k | sn = NULL; |
2026 | 52.5k | } |
2027 | 95.8k | else if (!valid_sam_header_type(fp->line.s)) { |
2028 | 201 | hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO"); |
2029 | 201 | warn_if_known_stderr(fp->line.s); |
2030 | 201 | goto error; |
2031 | 201 | } |
2032 | | |
2033 | 148k | if (kputsn(fp->line.s, fp->line.l, &str) < 0) |
2034 | 0 | goto error; |
2035 | | |
2036 | 148k | if (kputc('\n', &str) < 0) |
2037 | 0 | goto error; |
2038 | | |
2039 | 148k | if (fp->is_bgzf) { |
2040 | 87.1k | next_c = bgzf_peek(fp->fp.bgzf); |
2041 | 87.1k | } else { |
2042 | 61.0k | unsigned char nc; |
2043 | 61.0k | ssize_t pret = hpeek(fp->fp.hfile, &nc, 1); |
2044 | 61.0k | next_c = pret > 0 ? nc : pret - 1; |
2045 | 61.0k | } |
2046 | 148k | if (next_c < -1) |
2047 | 3 | goto error; |
2048 | 148k | } |
2049 | 12.0k | if (next_c != '@') |
2050 | 11.9k | fp->line.l = 0; |
2051 | | |
2052 | 12.0k | if (ret < -1) |
2053 | 21 | goto error; |
2054 | | |
2055 | 12.0k | if (!has_SQ && fp->fn_aux) { |
2056 | 0 | kstring_t line = { 0, 0, NULL }; |
2057 | | |
2058 | | /* The reference index (.fai) is actually needed here */ |
2059 | 0 | char *fai_fn = fp->fn_aux; |
2060 | 0 | char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM); |
2061 | 0 | if (fn_delim) |
2062 | 0 | fai_fn = fn_delim + strlen(HTS_IDX_DELIM); |
2063 | |
|
2064 | 0 | hFILE* f = hopen(fai_fn, "r"); |
2065 | 0 | int e = 0, absent; |
2066 | 0 | if (f == NULL) |
2067 | 0 | goto error; |
2068 | | |
2069 | 0 | while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) { |
2070 | 0 | char* tab = strchr(line.s, '\t'); |
2071 | 0 | hts_pos_t ln; |
2072 | |
|
2073 | 0 | if (tab == NULL) |
2074 | 0 | continue; |
2075 | | |
2076 | 0 | sn = (char*)calloc(tab-line.s+1, 1); |
2077 | 0 | if (!sn) { |
2078 | 0 | e = 1; |
2079 | 0 | break; |
2080 | 0 | } |
2081 | 0 | memcpy(sn, line.s, tab-line.s); |
2082 | 0 | k = kh_put(s2i, d, sn, &absent); |
2083 | 0 | if (absent < 0) { |
2084 | 0 | e = 1; |
2085 | 0 | break; |
2086 | 0 | } |
2087 | | |
2088 | 0 | ln = strtoll(tab, NULL, 10); |
2089 | |
|
2090 | 0 | if (!absent) { |
2091 | 0 | hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn); |
2092 | 0 | free(sn); |
2093 | 0 | sn = NULL; |
2094 | 0 | } else { |
2095 | 0 | sn = NULL; |
2096 | 0 | if (ln >= UINT32_MAX) { |
2097 | | // Stash away ref length that |
2098 | | // doesn't fit in target_len array |
2099 | 0 | khint_t k2; |
2100 | 0 | int absent = -1; |
2101 | 0 | if (!long_refs) { |
2102 | 0 | long_refs = kh_init(s2i); |
2103 | 0 | if (!long_refs) { |
2104 | 0 | e = 1; |
2105 | 0 | break; |
2106 | 0 | } |
2107 | 0 | } |
2108 | 0 | k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); |
2109 | 0 | if (absent < 0) { |
2110 | 0 | e = 1; |
2111 | 0 | break; |
2112 | 0 | } |
2113 | 0 | kh_val(long_refs, k2) = ln; |
2114 | 0 | kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 |
2115 | 0 | | UINT32_MAX); |
2116 | 0 | } else { |
2117 | 0 | kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; |
2118 | 0 | } |
2119 | 0 | has_SQ = 1; |
2120 | 0 | } |
2121 | | |
2122 | 0 | e |= kputs("@SQ\tSN:", &str) < 0; |
2123 | 0 | e |= kputsn(line.s, tab - line.s, &str) < 0; |
2124 | 0 | e |= kputs("\tLN:", &str) < 0; |
2125 | 0 | e |= kputll(ln, &str) < 0; |
2126 | 0 | e |= kputc('\n', &str) < 0; |
2127 | 0 | if (e) |
2128 | 0 | break; |
2129 | 0 | } |
2130 | |
|
2131 | 0 | ks_free(&line); |
2132 | 0 | if (hclose(f) != 0) { |
2133 | 0 | hts_log_error("Error on closing %s", fai_fn); |
2134 | 0 | e = 1; |
2135 | 0 | } |
2136 | 0 | if (e) |
2137 | 0 | goto error; |
2138 | 0 | } |
2139 | | |
2140 | 12.0k | if (has_SQ) { |
2141 | | // Populate the targets array |
2142 | 7.45k | h->n_targets = kh_size(d); |
2143 | | |
2144 | 7.45k | h->target_name = (char**) malloc(sizeof(char*) * h->n_targets); |
2145 | 7.45k | if (!h->target_name) { |
2146 | 0 | h->n_targets = 0; |
2147 | 0 | goto error; |
2148 | 0 | } |
2149 | | |
2150 | 7.45k | h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets); |
2151 | 7.45k | if (!h->target_len) { |
2152 | 0 | h->n_targets = 0; |
2153 | 0 | goto error; |
2154 | 0 | } |
2155 | | |
2156 | 54.4k | for (k = kh_begin(d); k != kh_end(d); ++k) { |
2157 | 46.9k | if (!kh_exist(d, k)) |
2158 | 27.9k | continue; |
2159 | | |
2160 | 19.0k | h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k); |
2161 | 19.0k | h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL; |
2162 | 19.0k | kh_val(d, k) >>= 32; |
2163 | 19.0k | } |
2164 | 7.45k | } |
2165 | | |
2166 | | // Repurpose sdict to hold any references longer than UINT32_MAX |
2167 | 12.0k | h->sdict = long_refs; |
2168 | | |
2169 | 12.0k | kh_destroy(s2i, d); |
2170 | | |
2171 | 12.0k | if (str.l == 0) |
2172 | 63 | kputsn("", 0, &str); |
2173 | 12.0k | h->l_text = str.l; |
2174 | 12.0k | h->text = ks_release(&str); |
2175 | 12.0k | fp->bam_header = sam_hdr_sanitise(h); |
2176 | 12.0k | fp->bam_header->ref_count = 1; |
2177 | | |
2178 | 12.0k | return fp->bam_header; |
2179 | | |
2180 | 510 | error: |
2181 | 510 | if (h && d && (!h->target_name || !h->target_len)) { |
2182 | 1.27k | for (k = kh_begin(d); k != kh_end(d); ++k) |
2183 | 768 | if (kh_exist(d, k)) free((void *)kh_key(d, k)); |
2184 | 510 | } |
2185 | 510 | sam_hdr_destroy(h); |
2186 | 510 | ks_free(&str); |
2187 | 510 | kh_destroy(s2i, d); |
2188 | 510 | kh_destroy(s2i, long_refs); |
2189 | 510 | if (sn) free(sn); |
2190 | 510 | return NULL; |
2191 | 12.0k | } |
2192 | | |
2193 | | sam_hdr_t *sam_hdr_read(htsFile *fp) |
2194 | 28.1k | { |
2195 | 28.1k | if (!fp) { |
2196 | 0 | errno = EINVAL; |
2197 | 0 | return NULL; |
2198 | 0 | } |
2199 | | |
2200 | 28.1k | switch (fp->format.format) { |
2201 | 2.87k | case bam: |
2202 | 2.87k | return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); |
2203 | | |
2204 | 5.42k | case cram: |
2205 | 5.42k | return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header)); |
2206 | | |
2207 | 12.5k | case sam: |
2208 | 12.5k | return sam_hdr_create(fp); |
2209 | | |
2210 | 411 | case fastq_format: |
2211 | 7.27k | case fasta_format: |
2212 | 7.27k | return sam_hdr_init(); |
2213 | | |
2214 | 0 | case empty_format: |
2215 | 0 | errno = EPIPE; |
2216 | 0 | return NULL; |
2217 | | |
2218 | 0 | default: |
2219 | 0 | errno = EFTYPE; |
2220 | 0 | return NULL; |
2221 | 28.1k | } |
2222 | 28.1k | } |
2223 | | |
2224 | | int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) |
2225 | 26.9k | { |
2226 | 26.9k | if (!fp || !h) { |
2227 | 0 | errno = EINVAL; |
2228 | 0 | return -1; |
2229 | 0 | } |
2230 | | |
2231 | 26.9k | switch (fp->format.format) { |
2232 | 8.98k | case binary_format: |
2233 | 8.98k | fp->format.category = sequence_data; |
2234 | 8.98k | fp->format.format = bam; |
2235 | | /* fall-through */ |
2236 | 8.98k | case bam: |
2237 | 8.98k | if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1; |
2238 | 8.98k | break; |
2239 | | |
2240 | 8.98k | case cram: { |
2241 | 8.98k | cram_fd *fd = fp->fp.cram; |
2242 | 8.98k | if (cram_set_header2(fd, h) < 0) return -1; |
2243 | 7.93k | if (fp->fn_aux) |
2244 | 0 | cram_load_reference(fd, fp->fn_aux); |
2245 | 7.93k | if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; |
2246 | 7.93k | } |
2247 | 7.93k | break; |
2248 | | |
2249 | 8.98k | case text_format: |
2250 | 8.98k | fp->format.category = sequence_data; |
2251 | 8.98k | fp->format.format = sam; |
2252 | | /* fall-through */ |
2253 | 8.98k | case sam: { |
2254 | 8.98k | if (!h->hrecs && !h->text) |
2255 | 0 | return 0; |
2256 | 8.98k | char *text; |
2257 | 8.98k | kstring_t hdr_ks = { 0, 0, NULL }; |
2258 | 8.98k | size_t l_text; |
2259 | 8.98k | ssize_t bytes; |
2260 | 8.98k | int r = 0, no_sq = 0; |
2261 | | |
2262 | 8.98k | if (h->hrecs) { |
2263 | 8.04k | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) |
2264 | 0 | return -1; |
2265 | 8.04k | text = hdr_ks.s; |
2266 | 8.04k | l_text = hdr_ks.l; |
2267 | 8.04k | } else { |
2268 | 938 | const char *p = NULL; |
2269 | 1.16k | do { |
2270 | 1.16k | const char *q = p == NULL ? h->text : p + 4; |
2271 | 1.16k | p = strstr(q, "@SQ\t"); |
2272 | 1.16k | } while (!(p == NULL || p == h->text || *(p - 1) == '\n')); |
2273 | 938 | no_sq = p == NULL; |
2274 | 938 | text = h->text; |
2275 | 938 | l_text = h->l_text; |
2276 | 938 | } |
2277 | | |
2278 | 8.98k | if (fp->is_bgzf) { |
2279 | 0 | bytes = bgzf_write(fp->fp.bgzf, text, l_text); |
2280 | 8.98k | } else { |
2281 | 8.98k | bytes = hwrite(fp->fp.hfile, text, l_text); |
2282 | 8.98k | } |
2283 | 8.98k | free(hdr_ks.s); |
2284 | 8.98k | if (bytes != l_text) |
2285 | 0 | return -1; |
2286 | | |
2287 | 8.98k | if (no_sq) { |
2288 | 492 | int i; |
2289 | 1.28k | for (i = 0; i < h->n_targets; ++i) { |
2290 | 789 | fp->line.l = 0; |
2291 | 789 | r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0; |
2292 | 789 | r |= kputs(h->target_name[i], &fp->line) < 0; |
2293 | 789 | r |= kputsn("\tLN:", 4, &fp->line) < 0; |
2294 | 789 | r |= kputw(h->target_len[i], &fp->line) < 0; |
2295 | 789 | r |= kputc('\n', &fp->line) < 0; |
2296 | 789 | if (r != 0) |
2297 | 0 | return -1; |
2298 | | |
2299 | 789 | if (fp->is_bgzf) { |
2300 | 0 | bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); |
2301 | 789 | } else { |
2302 | 789 | bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); |
2303 | 789 | } |
2304 | 789 | if (bytes != fp->line.l) |
2305 | 0 | return -1; |
2306 | 789 | } |
2307 | 492 | } |
2308 | 8.98k | if (fp->is_bgzf) { |
2309 | 0 | if (bgzf_flush(fp->fp.bgzf) != 0) return -1; |
2310 | 8.98k | } else { |
2311 | 8.98k | if (hflush(fp->fp.hfile) != 0) return -1; |
2312 | 8.98k | } |
2313 | 8.98k | } |
2314 | 8.98k | break; |
2315 | | |
2316 | 8.98k | case fastq_format: |
2317 | 0 | case fasta_format: |
2318 | | // Nothing to output; FASTQ has no file headers. |
2319 | 0 | break; |
2320 | | |
2321 | 0 | default: |
2322 | 0 | errno = EBADF; |
2323 | 0 | return -1; |
2324 | 26.9k | } |
2325 | 25.9k | return 0; |
2326 | 26.9k | } |
2327 | | |
2328 | | static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2329 | 0 | { |
2330 | 0 | char *p, *q, *beg = NULL, *end = NULL, *newtext; |
2331 | 0 | size_t new_l_text; |
2332 | 0 | if (!h || !key) |
2333 | 0 | return -1; |
2334 | | |
2335 | 0 | if (h->l_text > 3) { |
2336 | 0 | if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists |
2337 | 0 | if ((p = strchr(h->text, '\n')) == 0) return -1; |
2338 | 0 | *p = '\0'; // for strstr call |
2339 | |
|
2340 | 0 | char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' }; |
2341 | |
|
2342 | 0 | if ((q = strstr(h->text, tmp)) != 0) { // key exists |
2343 | 0 | *p = '\n'; // change back |
2344 | | |
2345 | | // mark the key:val |
2346 | 0 | beg = q; |
2347 | 0 | for (q += 4; *q != '\n' && *q != '\t'; ++q); |
2348 | 0 | end = q; |
2349 | |
|
2350 | 0 | if (val && (strncmp(beg + 4, val, end - beg - 4) == 0) |
2351 | 0 | && strlen(val) == end - beg - 4) |
2352 | 0 | return 0; // val is the same, no need to change |
2353 | |
|
2354 | 0 | } else { |
2355 | 0 | beg = end = p; |
2356 | 0 | *p = '\n'; |
2357 | 0 | } |
2358 | 0 | } |
2359 | 0 | } |
2360 | 0 | if (beg == NULL) { // no @HD |
2361 | 0 | new_l_text = h->l_text; |
2362 | 0 | if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9) |
2363 | 0 | return -1; |
2364 | 0 | new_l_text += strlen(SAM_FORMAT_VERSION) + 8; |
2365 | 0 | if (val) { |
2366 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2367 | 0 | return -1; |
2368 | 0 | new_l_text += strlen(val) + 4; |
2369 | 0 | } |
2370 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2371 | 0 | if (!newtext) return -1; |
2372 | | |
2373 | 0 | if (val) |
2374 | 0 | snprintf(newtext, new_l_text + 1, |
2375 | 0 | "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); |
2376 | 0 | else |
2377 | 0 | snprintf(newtext, new_l_text + 1, |
2378 | 0 | "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); |
2379 | 0 | } else { // has @HD but different or no key |
2380 | 0 | new_l_text = (beg - h->text) + (h->text + h->l_text - end); |
2381 | 0 | if (val) { |
2382 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2383 | 0 | return -1; |
2384 | 0 | new_l_text += strlen(val) + 4; |
2385 | 0 | } |
2386 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2387 | 0 | if (!newtext) return -1; |
2388 | | |
2389 | 0 | if (val) { |
2390 | 0 | snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s", |
2391 | 0 | (int) (beg - h->text), h->text, key, val, end); |
2392 | 0 | } else { //delete key |
2393 | 0 | snprintf(newtext, new_l_text + 1, "%.*s%s", |
2394 | 0 | (int) (beg - h->text), h->text, end); |
2395 | 0 | } |
2396 | 0 | } |
2397 | 0 | free(h->text); |
2398 | 0 | h->text = newtext; |
2399 | 0 | h->l_text = new_l_text; |
2400 | 0 | return 0; |
2401 | 0 | } |
2402 | | |
2403 | | |
2404 | | int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2405 | 0 | { |
2406 | 0 | if (!h || !key) |
2407 | 0 | return -1; |
2408 | | |
2409 | 0 | if (!h->hrecs) |
2410 | 0 | return old_sam_hdr_change_HD(h, key, val); |
2411 | | |
2412 | 0 | if (val) { |
2413 | 0 | if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0) |
2414 | 0 | return -1; |
2415 | 0 | } else { |
2416 | 0 | if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0) |
2417 | 0 | return -1; |
2418 | 0 | } |
2419 | 0 | return sam_hdr_rebuild(h); |
2420 | 0 | } |
2421 | | /********************** |
2422 | | *** SAM record I/O *** |
2423 | | **********************/ |
2424 | | |
2425 | | // The speed of this code can vary considerably depending on minor code |
2426 | | // changes elsewhere as some of the tight loops are particularly prone to |
2427 | | // speed changes when the instruction blocks are split over a 32-byte |
2428 | | // boundary. To protect against this, we explicitly specify an alignment |
2429 | | // for this function. If this is insufficient, we may also wish to |
2430 | | // consider alignment of blocks within this function via |
2431 | | // __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents. |
2432 | | // However it's not very portable. |
2433 | | // Instead we break the loops out into separate functions so that we can |
2434 | | // explicitly apply __attribute__((aligned(32))) to each of them and force |
2435 | | // consistent loop alignment. |
2436 | 270k | static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) { |
2437 | | // Avoid overflow on 32-bit platforms, but it breaks BAM anyway |
2438 | 270k | if (*n > INT32_MAX*0.666) { |
2439 | 0 | errno = ENOMEM; |
2440 | 0 | return -1; |
2441 | 0 | } |
2442 | | |
2443 | 270k | size_t bytes = (size_t)size * (size_t)(*n>>1); |
2444 | 270k | if (possibly_expand_bam_data(b, bytes) < 0) { |
2445 | 0 | hts_log_error("Out of memory"); |
2446 | 0 | return -1; |
2447 | 0 | } |
2448 | | |
2449 | 270k | (*n)+=*n>>1; |
2450 | 270k | return 0; |
2451 | 270k | } |
2452 | | |
2453 | | |
// Advance q until it reaches a comma or any character that does not
// compare greater than tab (which includes the terminating NUL).
// This ensures that q always ends up at the next comma after reading a
// number, even if the number is followed by junk, and so prevents the
// possibility of trying to read more than n items.
#define skip_to_comma_(q)                                      \
    do {                                                       \
        for (; *(q) > '\t' && *(q) != ','; (q)++) { }          \
    } while (0)
2458 | | |
2459 | | HTS_ALIGN32 |
2460 | | static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused, |
2461 | 37.6k | uint32_t *nalloc, int *overflow) { |
2462 | 4.04M | while (*q == ',') { |
2463 | 4.00M | if ((*nused)++ >= (*nalloc)) { |
2464 | 255k | if (grow_B_array(b, nalloc, 1) < 0) |
2465 | 0 | return NULL; |
2466 | 255k | } |
2467 | 4.00M | *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow); |
2468 | 4.00M | b->l_data++; |
2469 | 4.00M | } |
2470 | 37.6k | return q; |
2471 | 37.6k | } |
2472 | | |
2473 | | HTS_ALIGN32 |
2474 | | static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused, |
2475 | 17.7k | uint32_t *nalloc, int *overflow) { |
2476 | 539k | while (*q == ',') { |
2477 | 521k | if ((*nused)++ >= (*nalloc)) { |
2478 | 1.30k | if (grow_B_array(b, nalloc, 1) < 0) |
2479 | 0 | return NULL; |
2480 | 1.30k | } |
2481 | 521k | if (q[1] != '-') { |
2482 | 509k | *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow); |
2483 | 509k | b->l_data++; |
2484 | 509k | } else { |
2485 | 11.7k | *overflow = 1; |
2486 | 11.7k | q++; |
2487 | 11.7k | skip_to_comma_(q); |
2488 | 11.7k | } |
2489 | 521k | } |
2490 | 17.7k | return q; |
2491 | 17.7k | } |
2492 | | |
2493 | | HTS_ALIGN32 |
2494 | | static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused, |
2495 | 8.32k | uint32_t *nalloc, int *overflow) { |
2496 | 241k | while (*q == ',') { |
2497 | 233k | if ((*nused)++ >= (*nalloc)) { |
2498 | 2.79k | if (grow_B_array(b, nalloc, 2) < 0) |
2499 | 0 | return NULL; |
2500 | 2.79k | } |
2501 | 233k | i16_to_le(hts_str2int(q + 1, &q, 16, overflow), |
2502 | 233k | b->data + b->l_data); |
2503 | 233k | b->l_data += 2; |
2504 | 233k | } |
2505 | 8.32k | return q; |
2506 | 8.32k | } |
2507 | | |
2508 | | HTS_ALIGN32 |
2509 | | static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused, |
2510 | 5.96k | uint32_t *nalloc, int *overflow) { |
2511 | 5.84M | while (*q == ',') { |
2512 | 5.83M | if ((*nused)++ >= (*nalloc)) { |
2513 | 7.25k | if (grow_B_array(b, nalloc, 2) < 0) |
2514 | 0 | return NULL; |
2515 | 7.25k | } |
2516 | 5.83M | if (q[1] != '-') { |
2517 | 5.77M | u16_to_le(hts_str2uint(q + 1, &q, 16, overflow), |
2518 | 5.77M | b->data + b->l_data); |
2519 | 5.77M | b->l_data += 2; |
2520 | 5.77M | } else { |
2521 | 60.8k | *overflow = 1; |
2522 | 60.8k | q++; |
2523 | 60.8k | skip_to_comma_(q); |
2524 | 60.8k | } |
2525 | 5.83M | } |
2526 | 5.96k | return q; |
2527 | 5.96k | } |
2528 | | |
2529 | | HTS_ALIGN32 |
2530 | | static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused, |
2531 | 15.8k | uint32_t *nalloc, int *overflow) { |
2532 | 6.56M | while (*q == ',') { |
2533 | 6.54M | if ((*nused)++ >= (*nalloc)) { |
2534 | 590 | if (grow_B_array(b, nalloc, 4) < 0) |
2535 | 0 | return NULL; |
2536 | 590 | } |
2537 | 6.54M | i32_to_le(hts_str2int(q + 1, &q, 32, overflow), |
2538 | 6.54M | b->data + b->l_data); |
2539 | 6.54M | b->l_data += 4; |
2540 | 6.54M | } |
2541 | 15.8k | return q; |
2542 | 15.8k | } |
2543 | | |
2544 | | HTS_ALIGN32 |
2545 | | static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused, |
2546 | 31.1k | uint32_t *nalloc, int *overflow) { |
2547 | 3.65M | while (*q == ',') { |
2548 | 3.62M | if ((*nused)++ >= (*nalloc)) { |
2549 | 2.15k | if (grow_B_array(b, nalloc, 4) < 0) |
2550 | 0 | return NULL; |
2551 | 2.15k | } |
2552 | 3.62M | if (q[1] != '-') { |
2553 | 3.62M | u32_to_le(hts_str2uint(q + 1, &q, 32, overflow), |
2554 | 3.62M | b->data + b->l_data); |
2555 | 3.62M | b->l_data += 4; |
2556 | 3.62M | } else { |
2557 | 1.62k | *overflow = 1; |
2558 | 1.62k | q++; |
2559 | 1.62k | skip_to_comma_(q); |
2560 | 1.62k | } |
2561 | 3.62M | } |
2562 | 31.1k | return q; |
2563 | 31.1k | } |
2564 | | |
2565 | | HTS_ALIGN32 |
2566 | | static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused, |
2567 | 4.90k | uint32_t *nalloc, int *overflow) { |
2568 | 13.6k | while (*q == ',') { |
2569 | 8.78k | if ((*nused)++ >= (*nalloc)) { |
2570 | 796 | if (grow_B_array(b, nalloc, 4) < 0) |
2571 | 0 | return NULL; |
2572 | 796 | } |
2573 | 8.78k | float_to_le(strtod(q + 1, &q), b->data + b->l_data); |
2574 | 8.78k | b->l_data += 4; |
2575 | 8.78k | } |
2576 | 4.90k | return q; |
2577 | 4.90k | } |
2578 | | |
2579 | | HTS_ALIGN32 |
2580 | | static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in, |
2581 | | char **end, bam1_t *b, |
2582 | 121k | int *ctr) { |
2583 | | // Protect against infinite recursion when dealing with invalid input. |
2584 | | // An example string is "XX:B:C,-". The lack of a number means min=0, |
2585 | | // but it overflowed due to "-" and so we repeat ad-infinitum. |
2586 | | // |
2587 | | // Loop detection is the safest solution incase there are other |
2588 | | // strange corner cases with malformed inputs. |
2589 | 121k | if (++(*ctr) > 2) { |
2590 | 69 | hts_log_error("Malformed data in B:%c array", type); |
2591 | 69 | return -1; |
2592 | 69 | } |
2593 | | |
2594 | 121k | int orig_l = b->l_data; |
2595 | 121k | char *q = in; |
2596 | 121k | int32_t size; |
2597 | 121k | size_t bytes; |
2598 | 121k | int overflow = 0; |
2599 | | |
2600 | 121k | size = aux_type2size(type); |
2601 | 121k | if (size <= 0 || size > 4) { |
2602 | 23 | hts_log_error("Unrecognized type B:%c", type); |
2603 | 23 | return -1; |
2604 | 23 | } |
2605 | | |
2606 | | // Ensure space for type + values. |
2607 | | // The first pass through here we don't know the number of entries and |
2608 | | // nalloc == 0. We start with a small working set and then parse the |
2609 | | // data, growing as needed. |
2610 | | // |
2611 | | // If we have a second pass through we do know the number of entries |
2612 | | // and nalloc is already known. We have no need to expand the bam data. |
2613 | 121k | if (!nalloc) |
2614 | 82.6k | nalloc=7; |
2615 | | |
2616 | | // Ensure allocated memory is big enough (for current nalloc estimate) |
2617 | 121k | bytes = (size_t) nalloc * (size_t) size; |
2618 | 121k | if (bytes / size != nalloc |
2619 | 121k | || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) { |
2620 | 0 | hts_log_error("Out of memory"); |
2621 | 0 | return -1; |
2622 | 0 | } |
2623 | | |
2624 | 121k | uint32_t nused = 0; |
2625 | | |
2626 | 121k | b->data[b->l_data++] = 'B'; |
2627 | 121k | b->data[b->l_data++] = type; |
2628 | | // 32-bit B-array length is inserted later once we know it. |
2629 | 121k | int b_len_idx = b->l_data; |
2630 | 121k | b->l_data += sizeof(uint32_t); |
2631 | | |
2632 | 121k | if (type == 'c') { |
2633 | 37.6k | if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow))) |
2634 | 0 | return -1; |
2635 | 83.9k | } else if (type == 'C') { |
2636 | 17.7k | if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow))) |
2637 | 0 | return -1; |
2638 | 66.2k | } else if (type == 's') { |
2639 | 8.32k | if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow))) |
2640 | 0 | return -1; |
2641 | 57.9k | } else if (type == 'S') { |
2642 | 5.96k | if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow))) |
2643 | 0 | return -1; |
2644 | 51.9k | } else if (type == 'i') { |
2645 | 15.8k | if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow))) |
2646 | 0 | return -1; |
2647 | 36.0k | } else if (type == 'I') { |
2648 | 31.1k | if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow))) |
2649 | 0 | return -1; |
2650 | 31.1k | } else if (type == 'f') { |
2651 | 4.90k | if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow))) |
2652 | 0 | return -1; |
2653 | 4.90k | } |
2654 | 121k | if (*q != '\t' && *q != '\0') { |
2655 | | // Unknown B array type or junk in the numbers |
2656 | 219 | hts_log_error("Malformed B:%c", type); |
2657 | 219 | return -1; |
2658 | 219 | } |
2659 | 121k | i32_to_le(nused, b->data + b_len_idx); |
2660 | | |
2661 | 121k | if (!overflow) { |
2662 | 82.0k | *end = q; |
2663 | 82.0k | return 0; |
2664 | 82.0k | } else { |
2665 | 39.3k | int64_t max = 0, min = 0, val; |
2666 | | // Given type was incorrect. Try to rescue the situation. |
2667 | 39.3k | char *r = q; |
2668 | 39.3k | q = in; |
2669 | 39.3k | overflow = 0; |
2670 | 39.3k | b->l_data = orig_l; |
2671 | | // Find out what range of values is present |
2672 | 10.2M | while (q < r) { |
2673 | 10.2M | val = hts_str2int(q + 1, &q, 64, &overflow); |
2674 | 10.2M | if (max < val) max = val; |
2675 | 10.2M | if (min > val) min = val; |
2676 | 10.2M | skip_to_comma_(q); |
2677 | 10.2M | } |
2678 | | // Retry with appropriate type |
2679 | 39.3k | if (!overflow) { |
2680 | 39.2k | if (min < 0) { |
2681 | 17.4k | if (min >= INT8_MIN && max <= INT8_MAX) { |
2682 | 900 | return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr); |
2683 | 16.5k | } else if (min >= INT16_MIN && max <= INT16_MAX) { |
2684 | 1.28k | return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr); |
2685 | 15.2k | } else if (min >= INT32_MIN && max <= INT32_MAX) { |
2686 | 15.0k | return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr); |
2687 | 15.0k | } |
2688 | 21.8k | } else { |
2689 | 21.8k | if (max < UINT8_MAX) { |
2690 | 942 | return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr); |
2691 | 20.8k | } else if (max <= UINT16_MAX) { |
2692 | 931 | return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr); |
2693 | 19.9k | } else if (max <= UINT32_MAX) { |
2694 | 19.8k | return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr); |
2695 | 19.8k | } |
2696 | 21.8k | } |
2697 | 39.2k | } |
2698 | | // If here then at least one of the values is too big to store |
2699 | 337 | hts_log_error("Numeric value in B array out of allowed range"); |
2700 | 337 | return -1; |
2701 | 39.3k | } |
2702 | 121k | #undef skip_to_comma_ |
2703 | 121k | } |
2704 | | |
2705 | | HTS_ALIGN32 |
2706 | | static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b) |
2707 | 82.6k | { |
2708 | 82.6k | int ctr = 0; |
2709 | 82.6k | uint32_t nalloc = 0; |
2710 | 82.6k | return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr); |
2711 | 82.6k | } |
2712 | | |
// Parses the FLAG field starting at v.
//
//   v        - start of the field
//   rv       - set to the first character after the parsed text; set to v
//              itself when the field does not start with a digit
//   overflow - set to 1 when a value given with a leading '0' exceeds
//              65535; also passed to hts_str2uint() for decimal values
//
// Returns the flag value; 65535 when a leading-'0' value is out of range,
// and 0 when the field does not start with a digit.
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char first = *v;

    if (first == '0') {
        // handle single-digit "0" directly; otherwise it's hex or octal
        if (v[1] == '\t') {
            *rv = v + 1;
            return 0;
        }

        unsigned long parsed = strtoul(v, rv, 0);
        if (parsed <= 65535)
            return parsed;

        *overflow = 1;
        return 65535;
    }

    if (first >= '1' && first <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    // TODO implement symbolic flag letters
    *rv = v;
    return 0;
}
2732 | | |
2733 | | // Parse tag line and append to bam object b. |
2734 | | // Shared by both SAM and FASTQ parsers. |
2735 | | // |
2736 | | // The difference between the two is how lenient we are to recognising |
2737 | | // non-compliant strings. The FASTQ parser glosses over arbitrary |
2738 | | // non-SAM looking strings. |
//
//   start, end    - bounds of the text to parse; fields are consumed from
//                   start while the read position is below end.
//   b             - record whose data buffer receives the encoded fields.
//   lenient       - non-zero: a field that fails a check is skipped and
//                   anything written for it is discarded; zero: the error
//                   is logged and parsing fails.
//   tag_whitelist - if non-NULL, a field whose two-character id (looked up
//                   as q[0]*256 + q[1]) is absent from this hash is skipped.
//
// Returns 0 on success, -2 on failure.
2739 | | static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, |
2740 | 233k | khash_t(tag) *tag_whitelist) { |
2741 | 233k | int overflow = 0; |
2742 | 233k | int checkpoint; |
2743 | 233k | char logbuf[40]; |
2744 | 233k | char *q = start, *p = end; |
2745 | | |
// _parse_err(cond, fmt...): acts only when cond is true.
//  - lenient: advance q past the current run of non-space characters and
//    the whitespace after it, rewind b->l_data to checkpoint (dropping what
//    was written for this field) and jump to the "loop" label.  The label
//    sits on the loop body, so the "q < p" loop condition is not re-tested;
//    the "p - q < 5" check at the top of the body ends the loop instead.
//  - strict: log the message and leave through err_ret.
2746 | 233k | #define _parse_err(cond, ...) \ |
2747 | 5.26M | do { \ |
2748 | 11.2M | if (cond) { \ |
2749 | 479 | if (lenient) { \ |
2750 | 0 | while (q < p && !isspace_c(*q)) \ |
2751 | 0 | q++; \ |
2752 | 0 | while (q < p && isspace_c(*q)) \ |
2753 | 0 | q++; \ |
2754 | 0 | b->l_data = checkpoint; \ |
2755 | 0 | goto loop; \ |
2756 | 479 | } else { \ |
2757 | 479 | hts_log_error(__VA_ARGS__); \ |
2758 | 479 | goto err_ret; \ |
2759 | 479 | } \ |
2760 | 479 | } \ |
2761 | 5.26M | } while (0) |
2762 | | |
2763 | 4.95M | while (q < p) loop: { |
2764 | 4.95M | char type; |
// Remember where this field starts in b->data so it can be rolled back.
2765 | 4.95M | checkpoint = b->l_data; |
// The checks below read q[0]..q[4], so at least five characters are needed.
2766 | 4.95M | if (p - q < 5) { |
2767 | 107 | if (lenient) { |
2768 | 0 | break; |
2769 | 107 | } else { |
2770 | 107 | hts_log_error("Incomplete aux field"); |
2771 | 107 | goto err_ret; |
2772 | 107 | } |
2773 | 107 | } |
2774 | 2.47M | _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id"); |
2775 | | |
// Lenient mode only: skip the token unless the bitwise OR of q[2] and q[4]
// equals ':' (which is the case when both are ':').
2776 | 2.47M | if (lenient && (q[2] | q[4]) != ':') { |
2777 | 0 | while (q < p && !isspace_c(*q)) |
2778 | 0 | q++; |
2779 | 0 | while (q < p && isspace_c(*q)) |
2780 | 0 | q++; |
2781 | 0 | continue; |
2782 | 0 | } |
2783 | | |
2784 | 2.47M | if (tag_whitelist) { |
2785 | 0 | int tt = q[0]*256 + q[1]; |
2786 | 0 | if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) { |
2787 | 0 | while (q < p && *q != '\t') |
2788 | 0 | q++; |
2789 | 0 | continue; |
2790 | 0 | } |
2791 | 0 | } |
2792 | | |
2793 | | // Copy over id |
2794 | 2.47M | if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; |
2795 | 2.47M | memcpy(b->data + b->l_data, q, 2); b->l_data += 2; |
2796 | 2.47M | q += 3; type = *q++; ++q; // q points to value |
2797 | 2.47M | if (type != 'Z' && type != 'H') // the only zero length acceptable fields |
2798 | 2.20M | _parse_err(*q <= '\t', "incomplete aux field"); |
2799 | | |
2800 | | // Ensure enough space for a double + type allocated. |
2801 | 2.47M | if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; |
2802 | | |
// All four of these type letters are written as type 'A' followed by the
// single character at q.
2803 | 2.47M | if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { |
2804 | 844k | b->data[b->l_data++] = 'A'; |
2805 | 844k | b->data[b->l_data++] = *q++; |
2806 | 1.63M | } else if (type == 'i' || type == 'I') { |
// Integers are stored with the narrowest code that holds the value:
// c / s / i for text starting with '-', C / S / I otherwise.
2807 | 1.21M | if (*q == '-') { |
2808 | 970k | int32_t x = hts_str2int(q, &q, 32, &overflow); |
2809 | 970k | if (x >= INT8_MIN) { |
2810 | 477k | b->data[b->l_data++] = 'c'; |
2811 | 477k | b->data[b->l_data++] = x; |
2812 | 492k | } else if (x >= INT16_MIN) { |
2813 | 146k | b->data[b->l_data++] = 's'; |
2814 | 146k | i16_to_le(x, b->data + b->l_data); |
2815 | 146k | b->l_data += 2; |
2816 | 346k | } else { |
2817 | 346k | b->data[b->l_data++] = 'i'; |
2818 | 346k | i32_to_le(x, b->data + b->l_data); |
2819 | 346k | b->l_data += 4; |
2820 | 346k | } |
2821 | 970k | } else { |
2822 | 242k | uint32_t x = hts_str2uint(q, &q, 32, &overflow); |
2823 | 242k | if (x <= UINT8_MAX) { |
2824 | 153k | b->data[b->l_data++] = 'C'; |
2825 | 153k | b->data[b->l_data++] = x; |
2826 | 153k | } else if (x <= UINT16_MAX) { |
2827 | 74.4k | b->data[b->l_data++] = 'S'; |
2828 | 74.4k | u16_to_le(x, b->data + b->l_data); |
2829 | 74.4k | b->l_data += 2; |
2830 | 74.4k | } else { |
2831 | 14.0k | b->data[b->l_data++] = 'I'; |
2832 | 14.0k | u32_to_le(x, b->data + b->l_data); |
2833 | 14.0k | b->l_data += 4; |
2834 | 14.0k | } |
2835 | 242k | } |
2836 | 1.21M | } else if (type == 'f') { |
2837 | 25.4k | b->data[b->l_data++] = 'f'; |
2838 | 25.4k | float_to_le(strtod(q, &q), b->data + b->l_data); |
2839 | 25.4k | b->l_data += sizeof(float); |
2840 | 392k | } else if (type == 'd') { |
2841 | 36.4k | b->data[b->l_data++] = 'd'; |
2842 | 36.4k | double_to_le(strtod(q, &q), b->data + b->l_data); |
2843 | 36.4k | b->l_data += sizeof(double); |
2844 | 356k | } else if (type == 'Z' || type == 'H') { |
// The value runs to the next tab, or to the end of the string if there is
// none; it is copied and followed by a NUL byte.
2845 | 273k | char *end = strchr(q, '\t'); |
2846 | 273k | if (!end) end = q + strlen(q); |
2847 | 273k | _parse_err(type == 'H' && ((end-q)&1) != 0, |
2848 | 273k | "hex field does not have an even number of digits"); |
2849 | 273k | b->data[b->l_data++] = type; |
2850 | 273k | if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; |
2851 | 273k | memcpy(b->data + b->l_data, q, end - q); |
2852 | 273k | b->l_data += end - q; |
2853 | 273k | b->data[b->l_data++] = '\0'; |
2854 | 273k | q = end; |
2855 | 273k | } else if (type == 'B') { |
2856 | 82.6k | type = *q++; // q points to the first ',' following the typing byte |
2857 | 82.6k | _parse_err(*q && *q != ',' && *q != '\t', |
2858 | 82.6k | "B aux field type not followed by ','"); |
2859 | | |
2860 | 82.6k | if (sam_parse_B_vals(type, q, &q, b) < 0) |
2861 | 648 | goto err_ret; |
2862 | 82.6k | } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); |
2863 | | |
2864 | 13.9M | while (*q > '\t') { q++; } // Skip any junk to next tab |
2865 | 2.47M | q++; |
2866 | 2.47M | } |
2867 | | |
// Range errors from the integer parsers accumulate in overflow and are
// reported once, here; the condition can only hold in strict mode.
2868 | 231k | _parse_err(!lenient && overflow != 0, "numeric value out of allowed range"); |
2869 | 231k | #undef _parse_err |
2870 | | |
2871 | 231k | return 0; |
2872 | | |
2873 | 1.23k | err_ret: |
2874 | 1.23k | return -2; |
2875 | 231k | } |
2876 | | |
// Parses one line of SAM text held in s into the BAM record b, using h to
// translate reference names into target ids.
//
// The text in s is modified while parsing: _read_token() overwrites the tab
// that ends a field with NUL.  Any previous content of b is overwritten
// (b->l_data is reset to 0 before parsing starts).
//
// Returns 0 on success, -2 on failure.
2877 | | int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) |
2878 | 236k | { |
// _read_token(_p): evaluates to the current value of _p (the token start),
// then NUL-terminates the token at the next tab and advances _p past it;
// jumps to err_ret when no tab is found.
2879 | 955k | #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) |
2880 | | |
2881 | 236k | #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff |
2882 | | |
// COPY_MINUS_N(to, from, n, l, failed): copies l bytes from "from" to "to",
// subtracting n from every byte, and sets failed to non-zero if any result
// byte has its top bit set.
2883 | | // Macro that operates on 64-bits at a time. |
2884 | 236k | #define COPY_MINUS_N(to,from,n,l,failed) \ |
2885 | 236k | do { \ |
2886 | 223k | uint64_u *from8 = (uint64_u *)(from); \ |
2887 | 223k | uint64_u *to8 = (uint64_u *)(to); \ |
2888 | 223k | uint64_t uflow = 0; \ |
2889 | 223k | size_t l8 = (l)>>3, i; \ |
2890 | 224k | for (i = 0; i < l8; i++) { \ |
2891 | 203 | to8[i] = from8[i] - (n)*0x0101010101010101UL; \ |
2892 | 203 | uflow |= to8[i]; \ |
2893 | 203 | } \ |
2894 | 225k | for (i<<=3; i < (l); ++i) { \ |
2895 | 1.34k | to[i] = from[i] - (n); \ |
2896 | 1.34k | uflow |= to[i]; \ |
2897 | 1.34k | } \ |
2898 | 223k | failed = (uflow & 0x8080808080808080UL) > 0; \ |
2899 | 223k | } while (0) |
2900 | | |
2901 | | #else |
2902 | | |
2903 | | // Basic version which operates a byte at a time |
2904 | | #define COPY_MINUS_N(to,from,n,l,failed) do { \ |
2905 | | uint8_t uflow = 0; \ |
2906 | | for (i = 0; i < (l); ++i) { \ |
2907 | | (to)[i] = (from)[i] - (n); \ |
2908 | | uflow |= (uint8_t) (to)[i]; \ |
2909 | | } \ |
2910 | | failed = (uflow & 0x80) > 0; \ |
2911 | | } while (0) |
2912 | | |
2913 | | #endif |
2914 | | |
// _get_mem: make room for l more bytes in b, point *x at them and advance
// b->l_data.  _parse_err: log and fail when cond holds.  _parse_warn: log
// a warning when cond holds and carry on.
2915 | 446k | #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l) |
2916 | 3.35M | #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0) |
2917 | 793k | #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0) |
2918 | | |
2919 | 236k | uint8_t *t; |
2920 | | |
2921 | 236k | char *p = s->s, *q; |
2922 | 236k | int i, overflow = 0; |
2923 | 236k | char logbuf[40]; |
2924 | 236k | hts_pos_t cigreflen; |
2925 | 236k | bam1_core_t *c = &b->core; |
2926 | | |
2927 | 236k | b->l_data = 0; |
// NOTE(review): only the first 32 bytes of the core struct are cleared;
// presumably that covers the fields assigned below - confirm against the
// declaration of bam1_core_t.
2928 | 236k | memset(c, 0, 32); |
2929 | | |
2930 | | // qname |
2931 | 236k | q = _read_token(p); |
2932 | | |
// p now points just past the NUL written by _read_token(), so p - q is the
// name length including its terminator.
2933 | 235k | _parse_warn(p - q <= 1, "empty query name"); |
2934 | 235k | _parse_err(p - q > 255, "query name too long"); |
2935 | | // resize large enough for name + extranul |
2936 | 235k | if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret; |
2937 | 235k | memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q; |
2938 | | |
// Pad with extra NUL bytes so that l_data becomes a multiple of four.
2939 | 235k | c->l_extranul = (4 - (b->l_data & 3)) & 3; |
2940 | 235k | memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul); |
2941 | 235k | b->l_data += c->l_extranul; |
2942 | | |
2943 | 235k | c->l_qname = p - q + c->l_extranul; |
2944 | | |
2945 | | // flag |
2946 | 235k | c->flag = parse_sam_flag(p, &p, &overflow); |
2947 | 235k | if (*p++ != '\t') goto err_ret; // malformated flag |
2948 | | |
2949 | | // chr |
2950 | 235k | q = _read_token(p); |
2951 | 235k | if (strcmp(q, "*")) { |
2952 | 223k | _parse_err(h->n_targets == 0, "no SQ lines present in the header"); |
2953 | 223k | c->tid = bam_name2id(h, q); |
2954 | 223k | _parse_err(c->tid < -1, "failed to parse header"); |
2955 | 223k | _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2956 | 223k | } else c->tid = -1; |
2957 | | |
// The position is stored as the parsed number minus one, so a text value
// of 0 becomes -1.
2958 | | // pos |
2959 | 235k | c->pos = hts_str2uint(p, &p, 62, &overflow) - 1; |
2960 | 235k | if (*p++ != '\t') goto err_ret; |
2961 | 234k | if (c->pos < 0 && c->tid >= 0) { |
2962 | 6.02k | _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); |
2963 | 6.02k | c->tid = -1; |
2964 | 6.02k | } |
2965 | 234k | if (c->tid < 0) c->flag |= BAM_FUNMAP; |
2966 | | |
2967 | | // mapq |
2968 | 234k | c->qual = hts_str2uint(p, &p, 8, &overflow); |
2969 | 234k | if (*p++ != '\t') goto err_ret; |
2970 | | // cigar |
2971 | 234k | if (*p != '*') { |
2972 | 217k | uint32_t *cigar = NULL; |
2973 | 217k | int old_l_data = b->l_data; |
2974 | 217k | int n_cigar = bam_parse_cigar(p, &p, b); |
2975 | 217k | if (n_cigar < 1 || *p++ != '\t') goto err_ret; |
2976 | 217k | cigar = (uint32_t *)(b->data + old_l_data); |
2977 | | |
2978 | | // can't use bam_endpos() directly as some fields not yet set up |
2979 | 217k | cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; |
2980 | 217k | if (cigreflen == 0) cigreflen = 1; |
2981 | 217k | } else { |
2982 | 16.8k | _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); |
2983 | 16.8k | c->flag |= BAM_FUNMAP; |
2984 | 16.8k | q = _read_token(p); |
2985 | 16.8k | cigreflen = 1; |
2986 | 16.8k | } |
2987 | 233k | _parse_err(HTS_POS_MAX - cigreflen <= c->pos, |
2988 | 233k | "read ends beyond highest supported position"); |
2989 | 233k | c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5); |
2990 | | // mate chr |
2991 | 233k | q = _read_token(p); |
2992 | 233k | if (strcmp(q, "=") == 0) { |
2993 | 419 | c->mtid = c->tid; |
2994 | 233k | } else if (strcmp(q, "*") == 0) { |
2995 | 461 | c->mtid = -1; |
2996 | 232k | } else { |
2997 | 232k | c->mtid = bam_name2id(h, q); |
2998 | 232k | _parse_err(c->mtid < -1, "failed to parse header"); |
2999 | 232k | _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
3000 | 232k | } |
3001 | | // mpos |
3002 | 233k | c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1; |
3003 | 233k | if (*p++ != '\t') goto err_ret; |
3004 | 233k | if (c->mpos < 0 && c->mtid >= 0) { |
3005 | 78.0k | _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); |
3006 | 78.0k | c->mtid = -1; |
3007 | 78.0k | } |
3008 | | // tlen |
3009 | 233k | c->isize = hts_str2int(p, &p, 63, &overflow); |
3010 | 233k | if (*p++ != '\t') goto err_ret; |
// overflow has been shared by every numeric field parsed so far; it is
// checked once here.
3011 | 233k | _parse_err(overflow, "number outside allowed range"); |
3012 | | // seq |
3013 | 233k | q = _read_token(p); |
3014 | 233k | if (strcmp(q, "*")) { |
3015 | 213k | _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long"); |
3016 | 213k | c->l_qseq = p - q - 1; |
3017 | 213k | hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname)); |
3018 | 213k | _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length"); |
3019 | 213k | i = (c->l_qseq + 1) >> 1; |
3020 | 213k | _get_mem(uint8_t, &t, b, i); |
3021 | | |
// Bases are packed two per byte via seq_nt16_table, the first of each pair
// in the high four bits.  Note that the "i" declared on the next line
// shadows the function-level "int i" for the rest of this block.
3022 | 213k | unsigned int lqs2 = c->l_qseq&~1, i; |
3023 | 332k | for (i = 0; i < lqs2; i+=2) |
3024 | 119k | t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]]; |
3025 | 222k | for (; i < c->l_qseq; ++i) |
3026 | 9.62k | t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2); |
3027 | 213k | } else c->l_qseq = 0; |
// A lone "*" fills the quality bytes with 0xff; otherwise each character
// is stored minus 33 and must not produce a byte with the top bit set.
3028 | | // qual |
3029 | 466k | _get_mem(uint8_t, &t, b, c->l_qseq); |
3030 | 466k | if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) { |
3031 | 9.22k | memset(t, 0xff, c->l_qseq); |
3032 | 9.22k | p += 2; |
3033 | 224k | } else { |
3034 | 224k | int failed = 0; |
3035 | 224k | _parse_err(s->l - (p - s->s) < c->l_qseq |
3036 | 224k | || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'), |
3037 | 224k | "SEQ and QUAL are of different length"); |
3038 | 223k | COPY_MINUS_N(t, p, 33, c->l_qseq, failed); |
3039 | 223k | _parse_err(failed, "invalid QUAL character"); |
3040 | 223k | p += c->l_qseq + 1; |
3041 | 223k | } |
3042 | | |
// Strict parsing (lenient = 0) and no tag whitelist.
3043 | | // aux |
3044 | 233k | if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0) |
3045 | 1.23k | goto err_ret; |
3046 | | |
3047 | 231k | if (bam_tag2cigar(b, 1, 1) < 0) |
3048 | 0 | return -2; |
3049 | 231k | return 0; |
3050 | | |
3051 | 0 | #undef _parse_warn |
3052 | 0 | #undef _parse_err |
3053 | 0 | #undef _get_mem |
3054 | 0 | #undef _read_token |
3055 | 4.52k | err_ret: |
3056 | 4.52k | return -2; |
3057 | 231k | } |
3058 | | |
// Counts the CIGAR operators in the field starting at q.
//
// The field ends at the first tab or NUL.  Every character in it that is
// not a decimal digit is counted as one operator.
//
// Returns the count, or 0 (after logging an error) when no operator was
// found or when the count is 2147483647 or more.
static uint32_t read_ncigar(const char *q) {
    uint32_t n_ops = 0;
    const char *cp = q;

    while (*cp != '\0' && *cp != '\t') {
        n_ops += isdigit_c(*cp) ? 0 : 1;
        cp++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }
    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
3074 | | |
3075 | | /*! @function |
3076 | | @abstract Parse a CIGAR string into preallocated a uint32_t array |
3077 | | @param in [in] pointer to the source string |
3078 | | @param a_cigar [out] address of the destination uint32_t buffer |
3079 | | @return number of processed input characters; 0 on error |
3080 | | */ |
3081 | 217k | static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { |
3082 | 217k | int i, overflow = 0; |
3083 | 217k | const char *p = in; |
3084 | 536k | for (i = 0; i < n_cigar; i++) { |
3085 | 319k | uint32_t len; |
3086 | 319k | int op; |
3087 | 319k | char *q; |
3088 | 319k | len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT; |
3089 | 319k | if (q == p) { |
3090 | 174 | hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p); |
3091 | 174 | return 0; |
3092 | 174 | } |
3093 | 318k | if (overflow) { |
3094 | 58 | hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p); |
3095 | 58 | return 0; |
3096 | 58 | } |
3097 | 318k | p = q; |
3098 | 318k | op = bam_cigar_table[(unsigned char)*p++]; |
3099 | 318k | if (op < 0) { |
3100 | 261 | hts_log_error("Unrecognized CIGAR operator"); |
3101 | 261 | return 0; |
3102 | 261 | } |
3103 | 318k | a_cigar[i] = len; |
3104 | 318k | a_cigar[i] |= op; |
3105 | 318k | } |
3106 | | |
3107 | 217k | return p-in; |
3108 | 217k | } |
3109 | | |
// Parses the CIGAR string at in into the caller-owned array *a_cigar.
//
//   in      - CIGAR text
//   end     - if non-NULL, receives the position after the parsed text
//             (left at in when nothing was consumed)
//   a_cigar - address of the destination array; reallocated when it has
//             fewer than the required number of elements
//   a_mem   - address of the element capacity of *a_cigar; updated when the
//             array is reallocated
//
// Returns the number of CIGAR operations stored; 0 for "*" or when
// read_ncigar() reports no operations; -1 on NULL arguments, allocation
// failure or a parse error.
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (in == NULL || a_cigar == NULL || a_mem == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }

    if (end) *end = (char *)in;

    if (*in == '*') {
        if (end) *end = (char *)in + 1;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0) return 0;

    if (*a_mem < n_ops) {
        // Keep the old pointer untouched unless the reallocation worked.
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (grown == NULL) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int consumed = parse_cigar(in, *a_cigar, n_ops);
    if (consumed == 0) return -1;

    if (end) *end = (char *)in + consumed;
    return n_ops;
}
3142 | | |
3143 | 217k | ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { |
3144 | 217k | size_t n_cigar = 0; |
3145 | 217k | int diff; |
3146 | | |
3147 | 217k | if (!in || !b) { |
3148 | 0 | hts_log_error("NULL pointer arguments"); |
3149 | 0 | return -1; |
3150 | 0 | } |
3151 | 217k | if (end) *end = (char *)in; |
3152 | | |
3153 | 217k | n_cigar = (*in == '*') ? 0 : read_ncigar(in); |
3154 | 217k | if (!n_cigar && b->core.n_cigar == 0) { |
3155 | 107 | if (end) *end = (char *)in+1; |
3156 | 107 | return 0; |
3157 | 107 | } |
3158 | | |
3159 | 217k | ssize_t cig_diff = n_cigar - b->core.n_cigar; |
3160 | 217k | if (cig_diff > 0 && |
3161 | 217k | possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) { |
3162 | 0 | hts_log_error("Memory allocation error"); |
3163 | 0 | return -1; |
3164 | 0 | } |
3165 | | |
3166 | 217k | uint32_t *cig = bam_get_cigar(b); |
3167 | 217k | if ((uint8_t *)cig != b->data + b->l_data) { |
3168 | | // Modifying an BAM existing BAM record |
3169 | 0 | uint8_t *seq = bam_get_seq(b); |
3170 | 0 | memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq); |
3171 | 0 | } |
3172 | | |
3173 | 217k | if (n_cigar) { |
3174 | 217k | if (!(diff = parse_cigar(in, cig, n_cigar))) |
3175 | 493 | return -1; |
3176 | 217k | } else { |
3177 | 0 | diff = 1; // handle "*" |
3178 | 0 | } |
3179 | | |
3180 | 217k | b->l_data += cig_diff * sizeof(uint32_t); |
3181 | 217k | b->core.n_cigar = n_cigar; |
3182 | 217k | if (end) *end = (char *)in + diff; |
3183 | | |
3184 | 217k | return n_cigar; |
3185 | 217k | } |
3186 | | |
3187 | | /* |
3188 | | * ----------------------------------------------------------------------------- |
3189 | | * SAM threading |
3190 | | */ |
3191 | | // Size of SAM text block (reading) |
3192 | 0 | #define SAM_NBYTES 240000 |
3193 | | |
3194 | | // Number of BAM records (writing, up to NB_mem in size) |
3195 | 0 | #define SAM_NBAM 1000 |
3196 | | |
3197 | | struct SAM_state; |
3198 | | |
3199 | | // Output job - a block of BAM records |
3200 | | typedef struct sp_bams { |
// Next block in a singly linked list (sam_state_destroy() follows this
// pointer when freeing SAM_state.bams).
3201 | | struct sp_bams *next; |
3202 | | int serial; |
3203 | | |
// Array of records.  sam_free_sp_bams() frees the data of all abams
// entries, not just the first nbams.
3204 | | bam1_t *bams; |
3205 | | int nbams, abams; // used and alloc for bams[] array |
3206 | | size_t bam_mem; // very approximate total size |
3207 | | |
// Back-pointer to the state this block belongs to.
3208 | | struct SAM_state *fd; |
3209 | | } sp_bams; |
3210 | | |
3211 | | // Input job - a block of SAM text |
3212 | | typedef struct sp_lines { |
// Next block in a singly linked list (sam_state_destroy() follows this
// pointer when freeing SAM_state.lines).
3213 | | struct sp_lines *next; |
3214 | | int serial; |
3215 | | |
// Text buffer, freed together with the block.  Presumably data_size is the
// number of bytes in use and alloc the allocated size - confirm at the
// point where these are filled in.
3216 | | char *data; |
3217 | | int data_size; |
3218 | | int alloc; |
3219 | | |
// Back-pointer to the owning state, and a block of BAM records associated
// with this text block.
3220 | | struct SAM_state *fd; |
3221 | | sp_bams *bams; |
3222 | | } sp_lines; |
3223 | | |
// Values of SAM_state.command, which is accessed under command_m.
// sam_state_destroy() sets SAM_CLOSE to announce that the file is closing
// (unless the value is already SAM_CLOSE_DONE) and, when reading, polls
// until it sees SAM_CLOSE_DONE - presumably set by the dispatcher thread
// once it has stopped; confirm in the dispatcher code.
3224 | | enum sam_cmd { |
3225 | | SAM_NONE = 0, |
3226 | | SAM_CLOSE, |
3227 | | SAM_CLOSE_DONE, |
3228 | | SAM_AT_EOF, |
3229 | | }; |
3230 | | |
// Per-file state attached to htsFile.state by sam_state_create() and torn
// down by sam_state_destroy().
3231 | | typedef struct SAM_state { |
// Header; released with bam_hdr_destroy() when the state is destroyed.
3232 | | sam_hdr_t *h; |
3233 | | |
// Thread pool, with own_pool recording whether this state may destroy it,
// the process queue used with that pool, and the dispatcher thread
// (dispatcher_set is non-zero once there is a thread to join).
3234 | | hts_tpool *p; |
3235 | | int own_pool; |
3236 | | pthread_mutex_t lines_m; |
3237 | | hts_tpool_process *q; |
3238 | | pthread_t dispatcher; |
3239 | | int dispatcher_set; |
3240 | | |
// Linked lists of text blocks and record blocks owned by this state.
3241 | | sp_lines *lines; |
3242 | | sp_bams *bams; |
3243 | | |
// Block currently being filled (when writing it is dispatched as the last
// partial block on close).
3244 | | sp_bams *curr_bam; |
3245 | | int curr_idx; |
3246 | | int serial; |
3247 | | |
3248 | | // Be warned: moving these mutexes around in this struct can reduce |
3249 | | // threading performance by up to 70%! |
3250 | | pthread_mutex_t command_m; |
3251 | | pthread_cond_t command_c; |
3252 | | enum sam_cmd command; |
3253 | | |
// First error recorded through sam_state_err(); 0 means no error.
3254 | | // One of the E* errno codes |
3255 | | int errcode; |
3256 | | |
// The file this state is attached to.
3257 | | htsFile *fp; |
3258 | | } SAM_state; |
3259 | | |
3260 | | // Returns a SAM_state struct from a generic hFILE. |
3261 | | // |
3262 | | // Returns NULL on failure. |
3263 | 0 | static SAM_state *sam_state_create(htsFile *fp) { |
3264 | | // Ideally sam_open wouldn't be a #define to hts_open but instead would |
3265 | | // be a redirect call with an additional 'S' mode. This in turn would |
3266 | | // correctly set the designed format to sam instead of a generic |
3267 | | // text_format. |
3268 | 0 | if (fp->format.format != sam && fp->format.format != text_format) |
3269 | 0 | return NULL; |
3270 | | |
3271 | 0 | SAM_state *fd = calloc(1, sizeof(*fd)); |
3272 | 0 | if (!fd) |
3273 | 0 | return NULL; |
3274 | | |
3275 | 0 | fp->state = fd; |
3276 | 0 | fd->fp = fp; |
3277 | |
|
3278 | 0 | return fd; |
3279 | 0 | } |
3280 | | |
3281 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); |
3282 | | static void *sam_format_worker(void *arg); |
3283 | | |
3284 | 0 | static void sam_state_err(SAM_state *fd, int errcode) { |
3285 | 0 | pthread_mutex_lock(&fd->command_m); |
3286 | 0 | if (!fd->errcode) |
3287 | 0 | fd->errcode = errcode; |
3288 | 0 | pthread_mutex_unlock(&fd->command_m); |
3289 | 0 | } |
3290 | | |
3291 | 0 | static void sam_free_sp_bams(sp_bams *b) { |
3292 | 0 | if (!b) |
3293 | 0 | return; |
3294 | | |
3295 | 0 | if (b->bams) { |
3296 | 0 | int i; |
3297 | 0 | for (i = 0; i < b->abams; i++) { |
3298 | 0 | if (b->bams[i].data) |
3299 | 0 | free(b->bams[i].data); |
3300 | 0 | } |
3301 | 0 | free(b->bams); |
3302 | 0 | } |
3303 | 0 | free(b); |
3304 | 0 | } |
3305 | | |
3306 | | // Destroys the state produced by sam_state_create(). |
//
// If a thread pool is in use, this tells the dispatcher the file is
// closing, waits for it (reading) or flushes and drains the pending output
// (writing), joins the dispatcher thread and then frees the queue, the
// queued text / record blocks and the header reference.  fp->state is
// freed and set to NULL in every case where it was set.
//
// Returns 0 when fp has no state or no error was recorded; otherwise a
// non-zero value taken from the negated errcode or a failed dispatch.
3307 | 25.7k | int sam_state_destroy(htsFile *fp) { |
3308 | 25.7k | int ret = 0; |
3309 | | |
3310 | 25.7k | if (!fp->state) |
3311 | 25.7k | return 0; |
3312 | | |
3313 | 0 | SAM_state *fd = fp->state; |
3314 | 0 | if (fd->p) { |
3315 | 0 | if (fd->h) { |
3316 | | // Notify sam_dispatcher we're closing |
3317 | 0 | pthread_mutex_lock(&fd->command_m); |
3318 | 0 | if (fd->command != SAM_CLOSE_DONE) |
3319 | 0 | fd->command = SAM_CLOSE; |
3320 | 0 | pthread_cond_signal(&fd->command_c); |
3321 | 0 | ret = -fd->errcode; |
3322 | 0 | if (fd->q) |
3323 | 0 | hts_tpool_wake_dispatch(fd->q); // unstick the reader |
3324 | 

// Reading: poll every 10000 microseconds, releasing command_m while
// sleeping, until the command becomes SAM_CLOSE_DONE.
3325 | 0 | if (!fp->is_write && fd->q && fd->dispatcher_set) { |
3326 | 0 | for (;;) { |
3327 | | // Avoid deadlocks with dispatcher |
3328 | 0 | if (fd->command == SAM_CLOSE_DONE) |
3329 | 0 | break; |
3330 | 0 | hts_tpool_wake_dispatch(fd->q); |
3331 | 0 | pthread_mutex_unlock(&fd->command_m); |
3332 | 0 | hts_usleep(10000); |
3333 | 0 | pthread_mutex_lock(&fd->command_m); |
3334 | 0 | } |
3335 | 0 | } |
3336 | 0 | pthread_mutex_unlock(&fd->command_m); |
3337 | 

3338 | 0 | if (fp->is_write) { |
3339 | | // Dispatch the last partial block. |
3340 | 0 | sp_bams *gb = fd->curr_bam; |
3341 | 0 | if (!ret && gb && gb->nbams > 0 && fd->q) |
3342 | 0 | ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb); |
3343 | | |
3344 | | // Flush and drain output |
3345 | 0 | if (fd->q) |
3346 | 0 | hts_tpool_process_flush(fd->q); |
3347 | 0 | pthread_mutex_lock(&fd->command_m); |
3348 | 0 | if (!ret) ret = -fd->errcode; |
3349 | 0 | pthread_mutex_unlock(&fd->command_m); |
3350 | 

// NOTE(review): the EIO assigned in this loop is positive, whereas every
// other assignment to ret in this function uses the negated errcode.
// Confirm whether callers only test for non-zero.
3351 | 0 | while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) { |
3352 | 0 | hts_usleep(10000); |
3353 | 0 | pthread_mutex_lock(&fd->command_m); |
3354 | 0 | ret = -fd->errcode; |
3355 | | // not empty but shutdown implies error |
3356 | 0 | if (hts_tpool_process_is_shutdown(fd->q) && !ret) |
3357 | 0 | ret = EIO; |
3358 | 0 | pthread_mutex_unlock(&fd->command_m); |
3359 | 0 | } |
3360 | 0 | if (fd->q) |
3361 | 0 | hts_tpool_process_shutdown(fd->q); |
3362 | 0 | } |
3363 | | |
3364 | | // Wait for it to acknowledge |
3365 | 0 | if (fd->dispatcher_set) |
3366 | 0 | pthread_join(fd->dispatcher, NULL); |
3367 | 0 | if (!ret) ret = -fd->errcode; |
3368 | 0 | } |
3369 | | |
3370 | | // Tidy up memory |
3371 | 0 | if (fd->q) |
3372 | 0 | hts_tpool_process_destroy(fd->q); |
3373 | 

// The pool is destroyed here only when this state owns it and the file is
// uncompressed; presumably a compressed stream releases the pool
// elsewhere - confirm.
3374 | 0 | if (fd->own_pool && fp->format.compression == no_compression) { |
3375 | 0 | hts_tpool_destroy(fd->p); |
3376 | 0 | fd->p = NULL; |
3377 | 0 | } |
3378 | 0 | pthread_mutex_destroy(&fd->lines_m); |
3379 | 0 | pthread_mutex_destroy(&fd->command_m); |
3380 | 0 | pthread_cond_destroy(&fd->command_c); |
3381 | 

3382 | 0 | sp_lines *l = fd->lines; |
3383 | 0 | while (l) { |
3384 | 0 | sp_lines *n = l->next; |
3385 | 0 | free(l->data); |
3386 | 0 | free(l); |
3387 | 0 | l = n; |
3388 | 0 | } |
3389 | 

// curr_bam may also be on the bams list; clear it when it is freed from
// the list so that it is not freed a second time below.
3390 | 0 | sp_bams *b = fd->bams; |
3391 | 0 | while (b) { |
3392 | 0 | if (fd->curr_bam == b) |
3393 | 0 | fd->curr_bam = NULL; |
3394 | 0 | sp_bams *n = b->next; |
3395 | 0 | sam_free_sp_bams(b); |
3396 | 0 | b = n; |
3397 | 0 | } |
3398 | 

3399 | 0 | if (fd->curr_bam) |
3400 | 0 | sam_free_sp_bams(fd->curr_bam); |
3401 | | |
3402 | | // Decrement counter by one, maybe destroying too. |
3403 | | // This is to permit the caller using bam_hdr_destroy |
3404 | | // before sam_close without triggering decode errors |
3405 | | // in the background threads. |
3406 | 0 | bam_hdr_destroy(fd->h); |
3407 | 0 | } |
3408 | 

3409 | 0 | free(fp->state); |
3410 | 0 | fp->state = NULL; |
3411 | 0 | return ret; |
3412 | 25.7k | } |
3413 | | |
3414 | | // Cleanup function - job for sam_parse_worker; result for sam_format_worker |
3415 | 0 | static void cleanup_sp_lines(void *arg) { |
3416 | 0 | sp_lines *gl = (sp_lines *)arg; |
3417 | 0 | if (!gl) return; |
3418 | | |
3419 | | // Should always be true for lines passed to / from thread workers. |
3420 | 0 | assert(gl->next == NULL); |
3421 | | |
3422 | 0 | free(gl->data); |
3423 | 0 | sam_free_sp_bams(gl->bams); |
3424 | 0 | free(gl); |
3425 | 0 | } |
3426 | | |
3427 | | // Run from one of the worker threads. |
3428 | | // Convert a passed in array of lines to array of BAMs, returning |
3429 | | // the result back to the thread queue. |
3430 | 0 | static void *sam_parse_worker(void *arg) { |
3431 | 0 | sp_lines *gl = (sp_lines *)arg; |
3432 | 0 | sp_bams *gb = NULL; |
3433 | 0 | char *lines = gl->data; |
3434 | 0 | int i; |
3435 | 0 | bam1_t *b; |
3436 | 0 | SAM_state *fd = gl->fd; |
3437 | | |
3438 | | // Use a block of BAM structs we had earlier if available. |
3439 | 0 | pthread_mutex_lock(&fd->lines_m); |
3440 | 0 | if (fd->bams) { |
3441 | 0 | gb = fd->bams; |
3442 | 0 | fd->bams = gb->next; |
3443 | 0 | } |
3444 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3445 | |
|
3446 | 0 | if (gb == NULL) { |
3447 | 0 | gb = calloc(1, sizeof(*gb)); |
3448 | 0 | if (!gb) { |
3449 | 0 | return NULL; |
3450 | 0 | } |
3451 | 0 | gb->abams = 100; |
3452 | 0 | gb->bams = b = calloc(gb->abams, sizeof(*b)); |
3453 | 0 | if (!gb->bams) { |
3454 | 0 | sam_state_err(fd, ENOMEM); |
3455 | 0 | goto err; |
3456 | 0 | } |
3457 | 0 | gb->nbams = 0; |
3458 | 0 | gb->bam_mem = 0; |
3459 | 0 | } |
3460 | 0 | gb->serial = gl->serial; |
3461 | 0 | gb->next = NULL; |
3462 | |
|
3463 | 0 | b = (bam1_t *)gb->bams; |
3464 | 0 | if (!b) { |
3465 | 0 | sam_state_err(fd, ENOMEM); |
3466 | 0 | goto err; |
3467 | 0 | } |
3468 | | |
3469 | 0 | i = 0; |
3470 | 0 | char *cp = lines, *cp_end = lines + gl->data_size; |
3471 | 0 | while (cp < cp_end) { |
3472 | 0 | if (i >= gb->abams) { |
3473 | 0 | int old_abams = gb->abams; |
3474 | 0 | gb->abams *= 2; |
3475 | 0 | b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t)); |
3476 | 0 | if (!b) { |
3477 | 0 | gb->abams /= 2; |
3478 | 0 | sam_state_err(fd, ENOMEM); |
3479 | 0 | goto err; |
3480 | 0 | } |
3481 | 0 | memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b)); |
3482 | 0 | gb->bams = b; |
3483 | 0 | } |
3484 | | |
3485 | | // Ideally we'd get sam_parse1 to return the number of |
3486 | | // bytes decoded and to be able to stop on newline as |
3487 | | // well as \0. |
3488 | | // |
3489 | | // We can then avoid the additional strchr loop. |
3490 | | // It's around 6% of our CPU cost, albeit threadable. |
3491 | | // |
3492 | | // However this is an API change so for now we copy. |
3493 | | |
3494 | 0 | char *nl = strchr(cp, '\n'); |
3495 | 0 | char *line_end; |
3496 | 0 | if (nl) { |
3497 | 0 | line_end = nl; |
3498 | 0 | if (line_end > cp && *(line_end - 1) == '\r') |
3499 | 0 | line_end--; |
3500 | 0 | nl++; |
3501 | 0 | } else { |
3502 | 0 | nl = line_end = cp_end; |
3503 | 0 | } |
3504 | 0 | *line_end = '\0'; |
3505 | 0 | kstring_t ks = { line_end - cp, gl->alloc, cp }; |
3506 | 0 | if (sam_parse1(&ks, fd->h, &b[i]) < 0) { |
3507 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3508 | 0 | cleanup_sp_lines(gl); |
3509 | 0 | goto err; |
3510 | 0 | } |
3511 | | |
3512 | 0 | cp = nl; |
3513 | 0 | i++; |
3514 | 0 | } |
3515 | 0 | gb->nbams = i; |
3516 | |
|
3517 | 0 | pthread_mutex_lock(&fd->lines_m); |
3518 | 0 | gl->next = fd->lines; |
3519 | 0 | fd->lines = gl; |
3520 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3521 | 0 | return gb; |
3522 | | |
3523 | 0 | err: |
3524 | 0 | sam_free_sp_bams(gb); |
3525 | 0 | return NULL; |
3526 | 0 | } |
3527 | | |
// Placeholder job used as an end-of-file marker: it ignores its argument
// and always yields a NULL result.
static void *sam_parse_eof(void *arg) {
    (void) arg;
    void *eof_marker = NULL;
    return eof_marker;
}
3531 | | |
3532 | | // Cleanup function - result for sam_parse_worker; job for sam_format_worker |
3533 | 0 | static void cleanup_sp_bams(void *arg) { |
3534 | 0 | sam_free_sp_bams((sp_bams *) arg); |
3535 | 0 | } |
3536 | | |
3537 | | // Runs in its own thread. |
3538 | | // Reads a block of text (SAM) and sends a new job to the thread queue to |
3539 | | // translate this to BAM. |
3540 | 0 | static void *sam_dispatcher_read(void *vp) { |
3541 | 0 | htsFile *fp = vp; |
3542 | 0 | kstring_t line = {0}; |
3543 | 0 | int line_frag = 0; |
3544 | 0 | SAM_state *fd = fp->state; |
3545 | 0 | sp_lines *l = NULL; |
3546 | | |
3547 | | // Pre-allocate buffer for left-over bits of line (exact size doesn't |
3548 | | // matter as it will grow if necessary). |
3549 | 0 | if (ks_resize(&line, 1000) < 0) |
3550 | 0 | goto err; |
3551 | | |
3552 | 0 | for (;;) { |
3553 | | // Check for command |
3554 | 0 | pthread_mutex_lock(&fd->command_m); |
3555 | 0 | switch (fd->command) { |
3556 | | |
3557 | 0 | case SAM_CLOSE: |
3558 | 0 | pthread_cond_signal(&fd->command_c); |
3559 | 0 | pthread_mutex_unlock(&fd->command_m); |
3560 | 0 | hts_tpool_process_shutdown(fd->q); |
3561 | 0 | goto tidyup; |
3562 | | |
3563 | 0 | default: |
3564 | 0 | break; |
3565 | 0 | } |
3566 | 0 | pthread_mutex_unlock(&fd->command_m); |
3567 | |
|
3568 | 0 | pthread_mutex_lock(&fd->lines_m); |
3569 | 0 | if (fd->lines) { |
3570 | | // reuse existing line buffer |
3571 | 0 | l = fd->lines; |
3572 | 0 | fd->lines = l->next; |
3573 | 0 | } |
3574 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3575 | |
|
3576 | 0 | if (l == NULL) { |
3577 | | // none to reuse, to create a new one |
3578 | 0 | l = calloc(1, sizeof(*l)); |
3579 | 0 | if (!l) |
3580 | 0 | goto err; |
3581 | 0 | l->alloc = SAM_NBYTES; |
3582 | 0 | l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 |
3583 | 0 | if (!l->data) { |
3584 | 0 | free(l); |
3585 | 0 | l = NULL; |
3586 | 0 | goto err; |
3587 | 0 | } |
3588 | 0 | l->fd = fd; |
3589 | 0 | } |
3590 | 0 | l->next = NULL; |
3591 | |
|
3592 | 0 | if (l->alloc < line_frag+SAM_NBYTES/2) { |
3593 | 0 | char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); |
3594 | 0 | if (!rp) |
3595 | 0 | goto err; |
3596 | 0 | l->alloc = line_frag+SAM_NBYTES/2; |
3597 | 0 | l->data = rp; |
3598 | 0 | } |
3599 | 0 | memcpy(l->data, line.s, line_frag); |
3600 | |
|
3601 | 0 | l->data_size = line_frag; |
3602 | 0 | ssize_t nbytes; |
3603 | 0 | longer_line: |
3604 | 0 | if (fp->is_bgzf) |
3605 | 0 | nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag); |
3606 | 0 | else |
3607 | 0 | nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag); |
3608 | 0 | if (nbytes < 0) { |
3609 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3610 | 0 | goto err; |
3611 | 0 | } else if (nbytes == 0) |
3612 | 0 | break; // EOF |
3613 | 0 | l->data_size += nbytes; |
3614 | | |
3615 | | // trim to last \n. Maybe \r\n, but that's still fine |
3616 | 0 | if (nbytes == l->alloc - line_frag) { |
3617 | 0 | char *cp_end = l->data + l->data_size; |
3618 | 0 | char *cp = cp_end-1; |
3619 | |
|
3620 | 0 | while (cp > (char *)l->data && *cp != '\n') |
3621 | 0 | cp--; |
3622 | | |
3623 | | // entire buffer is part of a single line |
3624 | 0 | if (cp == l->data) { |
3625 | 0 | line_frag = l->data_size; |
3626 | 0 | char *rp = realloc(l->data, l->alloc * 2 + 8); |
3627 | 0 | if (!rp) |
3628 | 0 | goto err; |
3629 | 0 | l->alloc *= 2; |
3630 | 0 | l->data = rp; |
3631 | 0 | assert(l->alloc >= l->data_size); |
3632 | 0 | assert(l->alloc >= line_frag); |
3633 | 0 | assert(l->alloc >= l->alloc - line_frag); |
3634 | 0 | goto longer_line; |
3635 | 0 | } |
3636 | 0 | cp++; |
3637 | | |
3638 | | // line holds the remainder of our line. |
3639 | 0 | if (ks_resize(&line, cp_end - cp) < 0) |
3640 | 0 | goto err; |
3641 | 0 | memcpy(line.s, cp, cp_end - cp); |
3642 | 0 | line_frag = cp_end - cp; |
3643 | 0 | l->data_size = l->alloc - line_frag; |
3644 | 0 | } else { |
3645 | | // out of buffer |
3646 | 0 | line_frag = 0; |
3647 | 0 | } |
3648 | | |
3649 | 0 | l->serial = fd->serial++; |
3650 | | //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial); |
3651 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l, |
3652 | 0 | cleanup_sp_lines, cleanup_sp_bams, 0) < 0) |
3653 | 0 | goto err; |
3654 | 0 | pthread_mutex_lock(&fd->command_m); |
3655 | 0 | if (fd->command == SAM_CLOSE) { |
3656 | 0 | pthread_mutex_unlock(&fd->command_m); |
3657 | 0 | l = NULL; |
3658 | 0 | goto tidyup; |
3659 | 0 | } |
3660 | 0 | l = NULL; // Now "owned" by sam_parse_worker() |
3661 | 0 | pthread_mutex_unlock(&fd->command_m); |
3662 | 0 | } |
3663 | | |
3664 | | // Submit a NULL sp_bams entry to act as an EOF marker |
3665 | 0 | if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0) |
3666 | 0 | goto err; |
3667 | | |
3668 | | // At EOF, wait for close request. |
3669 | | // (In future if we add support for seek, this is where we need to catch it.) |
3670 | 0 | for (;;) { |
3671 | 0 | pthread_mutex_lock(&fd->command_m); |
3672 | 0 | if (fd->command == SAM_NONE) |
3673 | 0 | pthread_cond_wait(&fd->command_c, &fd->command_m); |
3674 | 0 | switch (fd->command) { |
3675 | 0 | case SAM_CLOSE: |
3676 | 0 | pthread_cond_signal(&fd->command_c); |
3677 | 0 | pthread_mutex_unlock(&fd->command_m); |
3678 | 0 | hts_tpool_process_shutdown(fd->q); |
3679 | 0 | goto tidyup; |
3680 | | |
3681 | 0 | default: |
3682 | 0 | pthread_mutex_unlock(&fd->command_m); |
3683 | 0 | break; |
3684 | 0 | } |
3685 | 0 | } |
3686 | | |
3687 | 0 | tidyup: |
3688 | 0 | pthread_mutex_lock(&fd->command_m); |
3689 | 0 | fd->command = SAM_CLOSE_DONE; |
3690 | 0 | pthread_cond_signal(&fd->command_c); |
3691 | 0 | pthread_mutex_unlock(&fd->command_m); |
3692 | |
|
3693 | 0 | if (l) { |
3694 | 0 | pthread_mutex_lock(&fd->lines_m); |
3695 | 0 | l->next = fd->lines; |
3696 | 0 | fd->lines = l; |
3697 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3698 | 0 | } |
3699 | 0 | free(line.s); |
3700 | |
|
3701 | 0 | return NULL; |
3702 | | |
3703 | 0 | err: |
3704 | 0 | sam_state_err(fd, errno ? errno : ENOMEM); |
3705 | 0 | hts_tpool_process_shutdown(fd->q); |
3706 | 0 | goto tidyup; |
3707 | 0 | } |
3708 | | |
// Runs in its own thread.
// Takes encoded blocks of SAM off the thread results queue and writes them
// to our output stream.
//
// When fp->idx is set, the text is written one line at a time and each
// record is pushed to the index using the matching entry in gl->bams.
// Otherwise the whole block is written, packed into BGZF blocks on line
// boundaries when the output is BGZF.
//
// Returns NULL once the result queue has been shut down, or (void *)-1
// after recording an error with sam_state_err().
// NOTE(review): the error exits do not delete the current result or
// shut the queue down here -- confirm that the closing code copes.
static void *sam_dispatcher_write(void *vp) {
    htsFile *fp = vp;
    SAM_state *fd = fp->state;
    hts_tpool_result *r;

    // Iterates until result queue is shutdown, where it returns NULL.
    while ((r = hts_tpool_next_result_wait(fd->q))) {
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
        if (!gl) {
            // A NULL result is what sam_format_worker returns on failure.
            sam_state_err(fd, ENOMEM);
            goto err;
        }

        if (fp->idx) {
            sp_bams *gb = gl->bams;
            int i = 0, count = 0;
            while (i < gl->data_size) {
                // [j, i) spans one text line including its newline.
                int j = i;
                while (i < gl->data_size && gl->data[i] != '\n')
                    i++;
                if (i < gl->data_size)
                    i++;

                if (fp->is_bgzf) {
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
                        goto err;
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
                        goto err;
                } else {
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
                        goto err;
                }

                // The count'th line of text corresponds to the count'th
                // record of the BAM block it was formatted from.
                bam1_t *b = &gb->bams[count++];
                if (fp->format.compression == bgzf) {
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
                                      b->core.tid, b->core.pos, bam_endpos(b),
                                      bgzf_tell(fp->fp.bgzf),
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                      bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                } else {
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                      bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                }
            }

            // Every record must have produced exactly one line.
            assert(count == gb->nbams);

            // Add bam array to free-list
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gl->bams;
            gl->bams = NULL;
            pthread_mutex_unlock(&fd->lines_m);
        } else {
            if (fp->is_bgzf) {
                // We keep track of how much in the current block we have
                // remaining => R. We look for the last newline in input
                // [i] to [i+R], backwards => position N.
                //
                // If we find a newline, we write out bytes i to N.
                // We know we cannot fit the next record in this bgzf block,
                // so we flush what we have and copy input N to i+R into
                // the start of a new block, and recompute a new R for that.
                //
                // If we don't find a newline (i==N) then we cannot extend
                // the current block at all, so flush whatever is in it now
                // if it ends on a newline.
                // We still copy i(==N) to i+R to the next block and
                // continue as before with a new R.
                //
                // The only exception on the flush is when we run out of
                // data in the input.  In that case we skip it as we don't
                // yet know if the next record will fit.
                //
                // Both conditions share the same code here:
                // - Look for newline (pos N)
                // - Write i to N (which maybe 0)
                // - Flush if block ends on newline and not end of input
                // - write N to i+R

                int i = 0;
                BGZF *fb = fp->fp.bgzf;
                while (i < gl->data_size) {
                    // remaining space in block
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
                    int eod = 0; // set when the input runs out first
                    if (R > gl->data_size-i)
                        R = gl->data_size-i, eod = 1;

                    // Find last newline in input data
                    int N = i + R;
                    while (--N > i) {
                        if (gl->data[N] == '\n')
                            break;
                    }

                    if (N != i) {
                        // Found a newline
                        N++;
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
                            goto err;
                    }

                    // Flush bgzf block
                    int b_off = fb->block_offset;
                    if (!eod && b_off &&
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
                            goto err;

                    // Copy from N onwards into next block
                    if (i+R > N)
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
                            != i+R - N)
                            goto err;

                    i = i+R;
                }
            } else {
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
                    goto err;
            }
        }

        // The second argument is 0, so gl itself stays valid for reuse
        // below -- presumably 0 means "do not free the data"; confirm
        // against the thread pool API.
        hts_tpool_delete_result(r, 0);

        // Also updated by main thread
        pthread_mutex_lock(&fd->lines_m);
        gl->next = fd->lines;
        fd->lines = gl;
        pthread_mutex_unlock(&fd->lines_m);
    }

    sam_state_err(fd, 0); // success
    hts_tpool_process_shutdown(fd->q);
    return NULL;

 err:
    sam_state_err(fd, errno ? errno : EIO);
    return (void *)-1;
}
3863 | | |
3864 | | // Run from one of the worker threads. |
3865 | | // Convert a passed in array of BAMs (sp_bams) and converts to a block |
3866 | | // of text SAM records (sp_lines). |
3867 | 0 | static void *sam_format_worker(void *arg) { |
3868 | 0 | sp_bams *gb = (sp_bams *)arg; |
3869 | 0 | sp_lines *gl = NULL; |
3870 | 0 | int i; |
3871 | 0 | SAM_state *fd = gb->fd; |
3872 | 0 | htsFile *fp = fd->fp; |
3873 | | |
3874 | | // Use a block of SAM strings we had earlier if available. |
3875 | 0 | pthread_mutex_lock(&fd->lines_m); |
3876 | 0 | if (fd->lines) { |
3877 | 0 | gl = fd->lines; |
3878 | 0 | fd->lines = gl->next; |
3879 | 0 | } |
3880 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3881 | |
|
3882 | 0 | if (gl == NULL) { |
3883 | 0 | gl = calloc(1, sizeof(*gl)); |
3884 | 0 | if (!gl) { |
3885 | 0 | sam_state_err(fd, ENOMEM); |
3886 | 0 | return NULL; |
3887 | 0 | } |
3888 | 0 | gl->alloc = gl->data_size = 0; |
3889 | 0 | gl->data = NULL; |
3890 | 0 | } |
3891 | 0 | gl->serial = gb->serial; |
3892 | 0 | gl->next = NULL; |
3893 | |
|
3894 | 0 | kstring_t ks = {0, gl->alloc, gl->data}; |
3895 | |
|
3896 | 0 | for (i = 0; i < gb->nbams; i++) { |
3897 | 0 | if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) { |
3898 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3899 | 0 | goto err; |
3900 | 0 | } |
3901 | 0 | kputc('\n', &ks); |
3902 | 0 | } |
3903 | | |
3904 | 0 | pthread_mutex_lock(&fd->lines_m); |
3905 | 0 | gl->data_size = ks.l; |
3906 | 0 | gl->alloc = ks.m; |
3907 | 0 | gl->data = ks.s; |
3908 | |
|
3909 | 0 | if (fp->idx) { |
3910 | | // Keep hold of the bam array a little longer as |
3911 | | // sam_dispatcher_write needs to use them for building the index. |
3912 | 0 | gl->bams = gb; |
3913 | 0 | } else { |
3914 | | // Add bam array to free-list |
3915 | 0 | gb->next = fd->bams; |
3916 | 0 | fd->bams = gb; |
3917 | 0 | } |
3918 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3919 | |
|
3920 | 0 | return gl; |
3921 | | |
3922 | 0 | err: |
3923 | | // Possible race between this and fd->curr_bam. |
3924 | | // Easier to not free and leave it on the input list so it |
3925 | | // gets freed there instead? |
3926 | | // sam_free_sp_bams(gb); |
3927 | 0 | if (gl) { |
3928 | 0 | free(gl->data); |
3929 | 0 | free(gl); |
3930 | 0 | } |
3931 | 0 | return NULL; |
3932 | 0 | } |
3933 | | |
3934 | 0 | int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { |
3935 | 0 | if (fp->state) |
3936 | 0 | return 0; |
3937 | | |
3938 | 0 | if (!(fp->state = sam_state_create(fp))) |
3939 | 0 | return -1; |
3940 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3941 | |
|
3942 | 0 | pthread_mutex_init(&fd->lines_m, NULL); |
3943 | 0 | pthread_mutex_init(&fd->command_m, NULL); |
3944 | 0 | pthread_cond_init(&fd->command_c, NULL); |
3945 | 0 | fd->p = p->pool; |
3946 | 0 | int qsize = p->qsize; |
3947 | 0 | if (!qsize) |
3948 | 0 | qsize = 2*hts_tpool_size(fd->p); |
3949 | 0 | fd->q = hts_tpool_process_init(fd->p, qsize, 0); |
3950 | 0 | if (!fd->q) { |
3951 | 0 | sam_state_destroy(fp); |
3952 | 0 | return -1; |
3953 | 0 | } |
3954 | | |
3955 | 0 | if (fp->format.compression == bgzf) |
3956 | 0 | return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); |
3957 | | |
3958 | 0 | return 0; |
3959 | 0 | } |
3960 | | |
3961 | 0 | int sam_set_threads(htsFile *fp, int nthreads) { |
3962 | 0 | if (nthreads <= 0) |
3963 | 0 | return 0; |
3964 | | |
3965 | 0 | htsThreadPool p; |
3966 | 0 | p.pool = hts_tpool_init(nthreads); |
3967 | 0 | p.qsize = nthreads*2; |
3968 | |
|
3969 | 0 | int ret = sam_set_thread_pool(fp, &p); |
3970 | 0 | if (ret < 0) |
3971 | 0 | return ret; |
3972 | | |
3973 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3974 | 0 | fd->own_pool = 1; |
3975 | |
|
3976 | 0 | return 0; |
3977 | 0 | } |
3978 | | |
// Per-file state for FASTA / FASTQ handling, stored in htsFile::state.
typedef struct {
    kstring_t name;      // current record's name line
    kstring_t comment;   // NB: pointer into name, do not free
    kstring_t seq;       // sequence, accumulated over one or more lines
    kstring_t qual;      // quality, accumulated over one or more lines
    int casava;          // set by FASTQ_OPT_CASAVA; parse CASAVA comments
    int aux;             // set by FASTQ_OPT_AUX; parse aux tags in comments
    int rnum;            // set by FASTQ_OPT_RNUM
    char BC[3];          // aux tag ID for barcode
    khash_t(tag) *tags;  // which aux tags to use (if empty, use all).
    char nprefix;        // name line prefix: '@' (fastq) or '>' (fasta)
    int sra_names;       // set by FASTQ_OPT_NAME2; SRA style name lines
} fastq_state;
3992 | | |
3993 | | // Initialise fastq state. |
3994 | | // Name char of '@' or '>' distinguishes fastq vs fasta variant |
3995 | 7.27k | static fastq_state *fastq_state_init(int name_char) { |
3996 | 7.27k | fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); |
3997 | 7.27k | if (!x) |
3998 | 0 | return NULL; |
3999 | 7.27k | strcpy(x->BC, "BC"); |
4000 | 7.27k | x->nprefix = name_char; |
4001 | | |
4002 | 7.27k | return x; |
4003 | 7.27k | } |
4004 | | |
4005 | 9.69k | void fastq_state_destroy(htsFile *fp) { |
4006 | 9.69k | if (fp->state) { |
4007 | 7.27k | fastq_state *x = (fastq_state *)fp->state; |
4008 | 7.27k | if (x->tags) |
4009 | 7.27k | kh_destroy(tag, x->tags); |
4010 | 7.27k | ks_free(&x->name); |
4011 | 7.27k | ks_free(&x->seq); |
4012 | 7.27k | ks_free(&x->qual); |
4013 | 7.27k | free(fp->state); |
4014 | 7.27k | } |
4015 | 9.69k | } |
4016 | | |
4017 | 0 | int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { |
4018 | 0 | va_list args; |
4019 | |
|
4020 | 0 | if (!fp) |
4021 | 0 | return -1; |
4022 | 0 | if (!fp->state) |
4023 | 0 | if (!(fp->state = fastq_state_init(fp->format.format == fastq_format |
4024 | 0 | ? '@' : '>'))) |
4025 | 0 | return -1; |
4026 | | |
4027 | 0 | fastq_state *x = (fastq_state *)fp->state; |
4028 | |
|
4029 | 0 | switch (opt) { |
4030 | 0 | case FASTQ_OPT_CASAVA: |
4031 | 0 | x->casava = 1; |
4032 | 0 | break; |
4033 | | |
4034 | 0 | case FASTQ_OPT_NAME2: |
4035 | 0 | x->sra_names = 1; |
4036 | 0 | break; |
4037 | | |
4038 | 0 | case FASTQ_OPT_AUX: { |
4039 | 0 | va_start(args, opt); |
4040 | 0 | x->aux = 1; |
4041 | 0 | char *tag = va_arg(args, char *); |
4042 | 0 | va_end(args); |
4043 | 0 | if (tag && strcmp(tag, "1") != 0) { |
4044 | 0 | if (!x->tags) |
4045 | 0 | if (!(x->tags = kh_init(tag))) |
4046 | 0 | return -1; |
4047 | | |
4048 | 0 | size_t i, tlen = strlen(tag); |
4049 | 0 | for (i = 0; i+3 <= tlen+1; i += 3) { |
4050 | 0 | if (tag[i+0] == ',' || tag[i+1] == ',' || |
4051 | 0 | !(tag[i+2] == ',' || tag[i+2] == '\0')) { |
4052 | 0 | hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); |
4053 | 0 | break; |
4054 | 0 | } |
4055 | 0 | int ret, tcode = tag[i+0]*256 + tag[i+1]; |
4056 | 0 | kh_put(tag, x->tags, tcode, &ret); |
4057 | 0 | if (ret < 0) |
4058 | 0 | return -1; |
4059 | 0 | } |
4060 | 0 | } |
4061 | 0 | break; |
4062 | 0 | } |
4063 | | |
4064 | 0 | case FASTQ_OPT_BARCODE: { |
4065 | 0 | va_start(args, opt); |
4066 | 0 | char *bc = va_arg(args, char *); |
4067 | 0 | va_end(args); |
4068 | 0 | strncpy(x->BC, bc, 2); |
4069 | 0 | x->BC[2] = 0; |
4070 | 0 | break; |
4071 | 0 | } |
4072 | | |
4073 | 0 | case FASTQ_OPT_RNUM: |
4074 | 0 | x->rnum = 1; |
4075 | 0 | break; |
4076 | | |
4077 | 0 | default: |
4078 | 0 | break; |
4079 | 0 | } |
4080 | 0 | return 0; |
4081 | 0 | } |
4082 | | |
4083 | 4.32M | static int fastq_parse1(htsFile *fp, bam1_t *b) { |
4084 | 4.32M | fastq_state *x = (fastq_state *)fp->state; |
4085 | 4.32M | size_t i, l; |
4086 | 4.32M | int ret = 0; |
4087 | | |
4088 | 4.32M | if (fp->format.format == fasta_format && fp->line.s) { |
4089 | | // For FASTA we've already read the >name line; steal it |
4090 | | // Not the most efficient, but we don't optimise for fasta reading. |
4091 | 4.32M | if (fp->line.l == 0) |
4092 | 6.18k | return -1; // EOF |
4093 | | |
4094 | 4.31M | free(x->name.s); |
4095 | 4.31M | x->name = fp->line; |
4096 | 4.31M | fp->line.l = fp->line.m = 0; |
4097 | 4.31M | fp->line.s = NULL; |
4098 | 4.31M | } else { |
4099 | | // Read a FASTQ format entry. |
4100 | 8.01k | ret = hts_getline(fp, KS_SEP_LINE, &x->name); |
4101 | 8.01k | if (ret == -1) |
4102 | 120 | return -1; // EOF |
4103 | 7.89k | else if (ret < -1) |
4104 | 84 | return ret; // ERR |
4105 | 8.01k | } |
4106 | | |
4107 | | // Name |
4108 | 4.32M | if (*x->name.s != x->nprefix) |
4109 | 102 | return -2; |
4110 | | |
4111 | | // Reverse the SRA strangeness of putting the run_name.number before |
4112 | | // the read name. |
4113 | 4.32M | i = 0; |
4114 | 4.32M | char *name = x->name.s+1; |
4115 | 4.32M | if (x->sra_names) { |
4116 | 0 | char *cp = strpbrk(x->name.s, " \t"); |
4117 | 0 | if (cp) { |
4118 | 0 | while (*cp == ' ' || *cp == '\t') |
4119 | 0 | cp++; |
4120 | 0 | *--cp = '@'; |
4121 | 0 | i = cp - x->name.s; |
4122 | 0 | name = cp+1; |
4123 | 0 | } |
4124 | 0 | } |
4125 | | |
4126 | 4.32M | l = x->name.l; |
4127 | 4.32M | char *s = x->name.s; |
4128 | 36.1M | while (i < l && !isspace_c(s[i])) |
4129 | 31.8M | i++; |
4130 | 4.32M | if (i < l) { |
4131 | 70.9k | s[i] = 0; |
4132 | 70.9k | x->name.l = i++; |
4133 | 70.9k | } |
4134 | | |
4135 | | // Comment; a kstring struct, but pointer into name line. (Do not free) |
4136 | 4.51M | while (i < l && isspace_c(s[i])) |
4137 | 192k | i++; |
4138 | 4.32M | x->comment.s = s+i; |
4139 | 4.32M | x->comment.l = l - i; |
4140 | | |
4141 | | // Seq |
4142 | 4.32M | x->seq.l = 0; |
4143 | 28.8M | for (;;) { |
4144 | 28.8M | if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) |
4145 | 6.85k | if (fp->format.format == fastq_format || ret < -1) |
4146 | 627 | return -2; |
4147 | 28.8M | if (ret == -1 || |
4148 | 28.8M | *fp->line.s == (fp->format.format == fastq_format ? '+' : '>')) |
4149 | 4.32M | break; |
4150 | 24.5M | if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) |
4151 | 0 | return -2; |
4152 | 24.5M | } |
4153 | | |
4154 | | // Qual |
4155 | 4.32M | if (fp->format.format == fastq_format) { |
4156 | 798 | size_t remainder = x->seq.l; |
4157 | 798 | x->qual.l = 0; |
4158 | 45.6k | do { |
4159 | 45.6k | if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) |
4160 | 30 | return -2; |
4161 | 45.6k | if (fp->line.l > remainder) |
4162 | 45 | return -2; |
4163 | 45.6k | if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0) |
4164 | 0 | return -2; |
4165 | 45.6k | remainder -= fp->line.l; |
4166 | 45.6k | } while (remainder > 0); |
4167 | | |
4168 | | // Decr qual |
4169 | 530k | for (i = 0; i < x->qual.l; i++) |
4170 | 530k | x->qual.s[i] -= '!'; |
4171 | 723 | } |
4172 | | |
4173 | 4.32M | int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; |
4174 | 4.32M | if (x->name.l > 2 && |
4175 | 4.32M | x->name.s[x->name.l-2] == '/' && |
4176 | 4.32M | isdigit_c(x->name.s[x->name.l-1])) { |
4177 | 105k | switch(x->name.s[x->name.l-1]) { |
4178 | 6.44k | case '1': flag |= BAM_FREAD1 | pflag; break; |
4179 | 2.52k | case '2': flag |= BAM_FREAD2 | pflag; break; |
4180 | 96.0k | default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; |
4181 | 105k | } |
4182 | 105k | x->name.s[x->name.l-=2] = 0; |
4183 | 105k | } |
4184 | | |
4185 | | // Convert to BAM |
4186 | 4.32M | ret = bam_set1(b, |
4187 | 4.32M | x->name.s + x->name.l - name, name, |
4188 | 4.32M | flag, |
4189 | 4.32M | -1, -1, 0, // ref '*', pos, mapq, |
4190 | 4.32M | 0, NULL, // no cigar, |
4191 | 4.32M | -1, -1, 0, // mate |
4192 | 4.32M | x->seq.l, x->seq.s, x->qual.s, |
4193 | 4.32M | 0); |
4194 | 4.32M | if (ret < 0) return -2; |
4195 | | |
4196 | | // Identify Illumina CASAVA strings. |
4197 | | // <read>:<is_filtered>:<control_bits>:<barcode_sequence> |
4198 | 4.32M | char *barcode = NULL; |
4199 | 4.32M | int barcode_len = 0; |
4200 | 4.32M | kstring_t *kc = &x->comment; |
4201 | 4.32M | char *endptr; |
4202 | 4.32M | if (x->casava && |
4203 | | // \d:[YN]:\d+:[ACGTN]+ |
4204 | 4.32M | kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) && |
4205 | 4.32M | strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4 |
4206 | 4.32M | && *endptr == ':') { |
4207 | | |
4208 | | // read num |
4209 | 0 | switch(kc->s[0]) { |
4210 | 0 | case '1': b->core.flag |= BAM_FREAD1 | pflag; break; |
4211 | 0 | case '2': b->core.flag |= BAM_FREAD2 | pflag; break; |
4212 | 0 | default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; |
4213 | 0 | } |
4214 | | |
4215 | 0 | if (kc->s[2] == 'Y') |
4216 | 0 | b->core.flag |= BAM_FQCFAIL; |
4217 | | |
4218 | | // Barcode, maybe numeric in which case we skip it |
4219 | 0 | if (!isdigit_c(endptr[1])) { |
4220 | 0 | barcode = endptr+1; |
4221 | 0 | for (i = barcode - kc->s; i < kc->l; i++) |
4222 | 0 | if (isspace_c(kc->s[i])) |
4223 | 0 | break; |
4224 | |
|
4225 | 0 | kc->s[i] = 0; |
4226 | 0 | barcode_len = i+1-(barcode - kc->s); |
4227 | 0 | } |
4228 | 0 | } |
4229 | | |
4230 | 4.32M | if (ret >= 0 && barcode_len) |
4231 | 0 | if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0) |
4232 | 0 | ret = -2; |
4233 | | |
4234 | 4.32M | if (!x->aux) |
4235 | 4.32M | return ret; |
4236 | | |
4237 | | // Identify any SAM style aux tags in comments too. |
4238 | 0 | if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0) |
4239 | 0 | ret = -2; |
4240 | |
|
4241 | 0 | return ret; |
4242 | 4.32M | } |
4243 | | |
4244 | | // Internal component of sam_read1 below |
4245 | 3.47k | static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
4246 | 3.47k | int ret = bam_read1(fp->fp.bgzf, b); |
4247 | 3.47k | if (h && ret >= 0) { |
4248 | 1.91k | if (b->core.tid >= h->n_targets || b->core.tid < -1 || |
4249 | 1.91k | b->core.mtid >= h->n_targets || b->core.mtid < -1) { |
4250 | 228 | errno = ERANGE; |
4251 | 228 | return -3; |
4252 | 228 | } |
4253 | 1.91k | } |
4254 | 3.24k | return ret; |
4255 | 3.47k | } |
4256 | | |
4257 | | // Internal component of sam_read1 below |
4258 | 5.33k | static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { |
4259 | 5.33k | int ret = cram_get_bam_seq(fp->fp.cram, b); |
4260 | 5.33k | if (ret < 0) |
4261 | 5.33k | return cram_eof(fp->fp.cram) ? -1 : -2; |
4262 | | |
4263 | 0 | if (bam_tag2cigar(*b, 1, 1) < 0) |
4264 | 0 | return -2; |
4265 | | |
4266 | 0 | return ret; |
4267 | 0 | } |
4268 | | |
4269 | | // Internal component of sam_read1 below |
4270 | 242k | static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
4271 | 242k | int ret; |
4272 | | |
4273 | | // Consume 1st line after header parsing as it wasn't using peek |
4274 | 242k | if (fp->line.l != 0) { |
4275 | 63 | ret = sam_parse1(&fp->line, h, b); |
4276 | 63 | fp->line.l = 0; |
4277 | 63 | return ret; |
4278 | 63 | } |
4279 | | |
4280 | 242k | if (fp->state) { |
4281 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4282 | |
|
4283 | 0 | if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { |
4284 | | // We don't support multi-threaded SAM parsing with seeks yet. |
4285 | 0 | int ret; |
4286 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
4287 | 0 | errno = -ret; |
4288 | 0 | return -2; |
4289 | 0 | } |
4290 | 0 | if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) |
4291 | 0 | return -2; |
4292 | 0 | fp->fp.bgzf->seeked = 0; |
4293 | 0 | goto err_recover; |
4294 | 0 | } |
4295 | | |
4296 | 0 | if (!fd->h) { |
4297 | 0 | fd->h = h; |
4298 | 0 | fd->h->ref_count++; |
4299 | | // Ensure hrecs is initialised now as we don't want multiple |
4300 | | // threads trying to do this simultaneously. |
4301 | 0 | if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) |
4302 | 0 | return -2; |
4303 | | |
4304 | | // We can only do this once we've got a header |
4305 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, |
4306 | 0 | fp) != 0) |
4307 | 0 | return -2; |
4308 | 0 | fd->dispatcher_set = 1; |
4309 | 0 | } |
4310 | | |
4311 | 0 | if (fd->h != h) { |
4312 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4313 | 0 | return -2; |
4314 | 0 | } |
4315 | | |
4316 | 0 | sp_bams *gb = fd->curr_bam; |
4317 | 0 | if (!gb) { |
4318 | 0 | if (fd->errcode) { |
4319 | | // In case reader failed |
4320 | 0 | errno = fd->errcode; |
4321 | 0 | return -2; |
4322 | 0 | } |
4323 | | |
4324 | 0 | pthread_mutex_lock(&fd->command_m); |
4325 | 0 | int cmd = fd->command; |
4326 | 0 | pthread_mutex_unlock(&fd->command_m); |
4327 | 0 | if (cmd == SAM_AT_EOF) |
4328 | 0 | return -1; |
4329 | | |
4330 | 0 | hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); |
4331 | 0 | if (!r) |
4332 | 0 | return -2; |
4333 | 0 | fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); |
4334 | 0 | hts_tpool_delete_result(r, 0); |
4335 | 0 | } |
4336 | 0 | if (!gb) { |
4337 | 0 | pthread_mutex_lock(&fd->command_m); |
4338 | 0 | fd->command = SAM_AT_EOF; |
4339 | 0 | pthread_mutex_unlock(&fd->command_m); |
4340 | 0 | return fd->errcode ? -2 : -1; |
4341 | 0 | } |
4342 | 0 | bam1_t *b_array = (bam1_t *)gb->bams; |
4343 | 0 | if (fd->curr_idx < gb->nbams) |
4344 | 0 | if (!bam_copy1(b, &b_array[fd->curr_idx++])) |
4345 | 0 | return -2; |
4346 | 0 | if (fd->curr_idx == gb->nbams) { |
4347 | 0 | pthread_mutex_lock(&fd->lines_m); |
4348 | 0 | gb->next = fd->bams; |
4349 | 0 | fd->bams = gb; |
4350 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4351 | |
|
4352 | 0 | fd->curr_bam = NULL; |
4353 | 0 | fd->curr_idx = 0; |
4354 | | // Consider prefetching next record? I.e. |
4355 | | // } else { |
4356 | | // __builtin_prefetch(&b_array[fd->curr_idx], 0, 3); |
4357 | 0 | } |
4358 | |
|
4359 | 0 | ret = 0; |
4360 | |
|
4361 | 242k | } else { |
4362 | 242k | err_recover: |
4363 | 242k | ret = hts_getline(fp, KS_SEP_LINE, &fp->line); |
4364 | 242k | if (ret < 0) return ret; |
4365 | | |
4366 | 236k | ret = sam_parse1(&fp->line, h, b); |
4367 | 236k | fp->line.l = 0; |
4368 | 236k | if (ret < 0) { |
4369 | 4.49k | hts_log_warning("Parse error at line %lld", (long long)fp->lineno); |
4370 | 4.49k | if (h && h->ignore_sam_err) goto err_recover; |
4371 | 4.49k | } |
4372 | 236k | } |
4373 | | |
4374 | 236k | return ret; |
4375 | 242k | } |
4376 | | |
4377 | | // Returns 0 on success, |
4378 | | // -1 on EOF, |
4379 | | // <-1 on error |
4380 | | int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) |
4381 | 4.58M | { |
4382 | 4.58M | int ret, pass_filter; |
4383 | | |
4384 | 4.58M | do { |
4385 | 4.58M | switch (fp->format.format) { |
4386 | 3.47k | case bam: |
4387 | 3.47k | ret = sam_read1_bam(fp, h, b); |
4388 | 3.47k | break; |
4389 | | |
4390 | 5.33k | case cram: |
4391 | 5.33k | ret = sam_read1_cram(fp, h, &b); |
4392 | 5.33k | break; |
4393 | | |
4394 | 242k | case sam: |
4395 | 242k | ret = sam_read1_sam(fp, h, b); |
4396 | 242k | break; |
4397 | | |
4398 | 4.32M | case fasta_format: |
4399 | 4.32M | case fastq_format: { |
4400 | 4.32M | fastq_state *x = (fastq_state *)fp->state; |
4401 | 4.32M | if (!x) { |
4402 | 7.27k | if (!(fp->state = fastq_state_init(fp->format.format |
4403 | 7.27k | == fastq_format ? '@' : '>'))) |
4404 | 0 | return -2; |
4405 | 7.27k | } |
4406 | | |
4407 | 4.32M | return fastq_parse1(fp, b); |
4408 | 4.32M | } |
4409 | | |
4410 | 0 | case empty_format: |
4411 | 0 | errno = EPIPE; |
4412 | 0 | return -3; |
4413 | | |
4414 | 0 | default: |
4415 | 0 | errno = EFTYPE; |
4416 | 0 | return -3; |
4417 | 4.58M | } |
4418 | | |
4419 | 251k | pass_filter = (ret >= 0 && fp->filter) |
4420 | 251k | ? sam_passes_filter(h, b, fp->filter) |
4421 | 251k | : 1; |
4422 | 251k | } while (pass_filter == 0); |
4423 | | |
4424 | 251k | return pass_filter < 0 ? -2 : ret; |
4425 | 4.58M | } |
4426 | | |
4427 | | // With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise |
4428 | | // this code isn't vectorised and runs far slower than is necessary (even |
4429 | | // with the restrict keyword being used). |
4430 | | static inline void HTS_OPT3 |
4431 | 525 | add33(uint8_t *a, const uint8_t * b, int32_t len) { |
4432 | 525 | uint32_t i; |
4433 | 179k | for (i = 0; i < len; i++) |
4434 | 178k | a[i] = b[i]+33; |
4435 | 525 | } |
4436 | | |
4437 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4438 | 1.51M | { |
4439 | 1.51M | int i, r = 0; |
4440 | 1.51M | uint8_t *s, *end; |
4441 | 1.51M | const bam1_core_t *c = &b->core; |
4442 | | |
4443 | 1.51M | if (c->l_qname == 0) |
4444 | 0 | return -1; |
4445 | 1.51M | r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); |
4446 | 1.51M | r |= kputc_('\t', str); // query name |
4447 | 1.51M | r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag |
4448 | 1.51M | if (c->tid >= 0) { // chr |
4449 | 56.4k | r |= kputs(h->target_name[c->tid] , str); |
4450 | 56.4k | r |= kputc_('\t', str); |
4451 | 1.46M | } else r |= kputsn_("*\t", 2, str); |
4452 | 1.51M | r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos |
4453 | 1.51M | r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual |
4454 | 1.51M | if (c->n_cigar) { // cigar |
4455 | 73.0k | uint32_t *cigar = bam_get_cigar(b); |
4456 | 2.04M | for (i = 0; i < c->n_cigar; ++i) { |
4457 | 1.97M | r |= kputw(bam_cigar_oplen(cigar[i]), str); |
4458 | 1.97M | r |= kputc_(bam_cigar_opchr(cigar[i]), str); |
4459 | 1.97M | } |
4460 | 1.44M | } else r |= kputc_('*', str); |
4461 | 1.51M | r |= kputc_('\t', str); |
4462 | 1.51M | if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr |
4463 | 2.00k | else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str); |
4464 | 1.50k | else { |
4465 | 1.50k | r |= kputs(h->target_name[c->mtid], str); |
4466 | 1.50k | r |= kputc_('\t', str); |
4467 | 1.50k | } |
4468 | 1.51M | r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos |
4469 | 1.51M | r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len |
4470 | 1.51M | if (c->l_qseq) { // seq and qual |
4471 | 185k | uint8_t *s = bam_get_seq(b); |
4472 | 185k | if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; |
4473 | 185k | char *cp = str->s + str->l; |
4474 | | |
4475 | | // Sequence, 2 bases at a time |
4476 | 185k | nibble2base(s, cp, c->l_qseq); |
4477 | 185k | cp[c->l_qseq] = '\t'; |
4478 | 185k | cp += c->l_qseq+1; |
4479 | | |
4480 | | // Quality |
4481 | 185k | s = bam_get_qual(b); |
4482 | 185k | i = 0; |
4483 | 185k | if (s[0] == 0xff) { |
4484 | 184k | cp[i++] = '*'; |
4485 | 184k | } else { |
4486 | 525 | add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33; |
4487 | 525 | i = c->l_qseq; |
4488 | 525 | } |
4489 | 185k | cp[i] = 0; |
4490 | 185k | cp += i; |
4491 | 185k | str->l = cp - str->s; |
4492 | 1.33M | } else r |= kputsn_("*\t*", 3, str); |
4493 | | |
4494 | 1.51M | s = bam_get_aux(b); // aux |
4495 | 1.51M | end = b->data + b->l_data; |
4496 | | |
4497 | 2.34M | while (end - s >= 4) { |
4498 | 822k | r |= kputc_('\t', str); |
4499 | 822k | if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL) |
4500 | 322 | goto bad_aux; |
4501 | 822k | } |
4502 | 1.51M | r |= kputsn("", 0, str); // nul terminate |
4503 | 1.51M | if (r < 0) goto mem_err; |
4504 | | |
4505 | 1.51M | return str->l; |
4506 | | |
4507 | 322 | bad_aux: |
4508 | 322 | hts_log_error("Corrupted aux data for read %.*s flag %d", |
4509 | 322 | b->core.l_qname, bam_get_qname(b), b->core.flag); |
4510 | 322 | errno = EINVAL; |
4511 | 322 | return -1; |
4512 | | |
4513 | 0 | mem_err: |
4514 | 0 | hts_log_error("Out of memory"); |
4515 | 0 | errno = ENOMEM; |
4516 | 0 | return -1; |
4517 | 1.51M | } |
4518 | | |
4519 | | int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4520 | 1.51M | { |
4521 | 1.51M | str->l = 0; |
4522 | 1.51M | return sam_format1_append(h, b, str); |
4523 | 1.51M | } |
4524 | | |
4525 | | static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); |
4526 | | int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) |
4527 | 0 | { |
4528 | 0 | unsigned flag = b->core.flag; |
4529 | 0 | int i, e = 0, len = b->core.l_qseq; |
4530 | 0 | uint8_t *seq, *qual; |
4531 | |
|
4532 | 0 | str->l = 0; |
4533 | | |
4534 | | // Name |
4535 | 0 | if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) |
4536 | 0 | return -1; |
4537 | | |
4538 | | // /1 or /2 suffix |
4539 | 0 | if (x && x->rnum && (flag & BAM_FPAIRED)) { |
4540 | 0 | int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); |
4541 | 0 | if (r12 == BAM_FREAD1) { |
4542 | 0 | if (kputs("/1", str) == EOF) |
4543 | 0 | return -1; |
4544 | 0 | } else if (r12 == BAM_FREAD2) { |
4545 | 0 | if (kputs("/2", str) == EOF) |
4546 | 0 | return -1; |
4547 | 0 | } |
4548 | 0 | } |
4549 | | |
4550 | | // Illumina CASAVA tag. |
4551 | | // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero> |
4552 | 0 | if (x && x->casava) { |
4553 | 0 | int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; |
4554 | 0 | char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; |
4555 | 0 | uint8_t *bc = bam_aux_get(b, x->BC); |
4556 | 0 | if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, |
4557 | 0 | bc ? (char *)bc+1 : "0") < 0) |
4558 | 0 | return -1; |
4559 | | |
4560 | 0 | if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { |
4561 | 0 | hts_log_warning("BC tag starts with non-sequence base; using '0'"); |
4562 | 0 | str->l -= strlen((char *)bc)-2; // limit to 1 char |
4563 | 0 | str->s[str->l-1] = '0'; |
4564 | 0 | str->s[str->l] = 0; |
4565 | 0 | bc = NULL; |
4566 | 0 | } |
4567 | | |
4568 | | // Replace any non-alpha with '+'. Ie seq-seq to seq+seq |
4569 | 0 | if (bc) { |
4570 | 0 | int l = strlen((char *)bc+1); |
4571 | 0 | char *c = (char *)str->s + str->l - l; |
4572 | 0 | for (i = 0; i < l; i++) { |
4573 | 0 | if (!isalpha_c(c[i])) |
4574 | 0 | c[i] = '+'; |
4575 | 0 | else if (islower_c(c[i])) |
4576 | 0 | c[i] = toupper_c(c[i]); |
4577 | 0 | } |
4578 | 0 | } |
4579 | 0 | } |
4580 | | |
4581 | | // Aux tags |
4582 | 0 | if (x && x->aux) { |
4583 | 0 | uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; |
4584 | 0 | while (s && end - s >= 4) { |
4585 | 0 | int tt = s[0]*256 + s[1]; |
4586 | 0 | if (x->tags == NULL || |
4587 | 0 | kh_get(tag, x->tags, tt) != kh_end(x->tags)) { |
4588 | 0 | e |= kputc_('\t', str) < 0; |
4589 | 0 | if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) |
4590 | 0 | return -1; |
4591 | 0 | } else { |
4592 | 0 | s = skip_aux(s+2, end); |
4593 | 0 | } |
4594 | 0 | } |
4595 | 0 | e |= kputsn("", 0, str) < 0; // nul terminate |
4596 | 0 | } |
4597 | | |
4598 | 0 | if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; |
4599 | 0 | e |= kputc_('\n', str) < 0; |
4600 | | |
4601 | | // Seq line |
4602 | 0 | seq = bam_get_seq(b); |
4603 | 0 | if (flag & BAM_FREVERSE) |
4604 | 0 | for (i = len-1; i >= 0; i--) |
4605 | 0 | e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; |
4606 | 0 | else |
4607 | 0 | for (i = 0; i < len; i++) |
4608 | 0 | e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; |
4609 | | |
4610 | | |
4611 | | // Qual line |
4612 | 0 | if (x->nprefix == '@') { |
4613 | 0 | kputsn("\n+\n", 3, str); |
4614 | 0 | qual = bam_get_qual(b); |
4615 | 0 | if (qual[0] == 0xff) |
4616 | 0 | for (i = 0; i < len; i++) |
4617 | 0 | e |= kputc_('B', str) < 0; |
4618 | 0 | else if (flag & BAM_FREVERSE) |
4619 | 0 | for (i = len-1; i >= 0; i--) |
4620 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4621 | 0 | else |
4622 | 0 | for (i = 0; i < len; i++) |
4623 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4624 | |
|
4625 | 0 | } |
4626 | 0 | e |= kputc('\n', str) < 0; |
4627 | |
|
4628 | 0 | return e ? -1 : str->l; |
4629 | 0 | } |
4630 | | |
4631 | | // Sadly we need to be able to modify the bam_hdr here so we can |
4632 | | // reference count the structure. |
4633 | | int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) |
4634 | 4.55M | { |
4635 | 4.55M | switch (fp->format.format) { |
4636 | 0 | case binary_format: |
4637 | 0 | fp->format.category = sequence_data; |
4638 | 0 | fp->format.format = bam; |
4639 | | /* fall-through */ |
4640 | 1.51M | case bam: |
4641 | 1.51M | return bam_write_idx1(fp, h, b); |
4642 | | |
4643 | 1.51M | case cram: |
4644 | 1.51M | return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); |
4645 | | |
4646 | 0 | case text_format: |
4647 | 0 | fp->format.category = sequence_data; |
4648 | 0 | fp->format.format = sam; |
4649 | | /* fall-through */ |
4650 | 1.51M | case sam: |
4651 | 1.51M | if (fp->state) { |
4652 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4653 | | |
4654 | | // Threaded output |
4655 | 0 | if (!fd->h) { |
4656 | | // NB: discard const. We don't actually modify sam_hdr_t here, |
4657 | | // just data pointed to by it (which is a bit weasely still), |
4658 | | // but out cached pointer must be non-const as we want to |
4659 | | // destroy it later on and sam_hdr_destroy takes non-const. |
4660 | | // |
4661 | | // We do this because some tools do sam_hdr_destroy; sam_close |
4662 | | // while others do sam_close; sam_hdr_destroy. The former is |
4663 | | // an issue as we need the header still when flushing. |
4664 | 0 | fd->h = (sam_hdr_t *)h; |
4665 | 0 | fd->h->ref_count++; |
4666 | |
|
4667 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, |
4668 | 0 | fp) != 0) |
4669 | 0 | return -2; |
4670 | 0 | fd->dispatcher_set = 1; |
4671 | 0 | } |
4672 | | |
4673 | 0 | if (fd->h != h) { |
4674 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4675 | 0 | return -2; |
4676 | 0 | } |
4677 | | |
4678 | | // Find a suitable BAM array to copy to |
4679 | 0 | sp_bams *gb = fd->curr_bam; |
4680 | 0 | if (!gb) { |
4681 | 0 | pthread_mutex_lock(&fd->lines_m); |
4682 | 0 | if (fd->bams) { |
4683 | 0 | fd->curr_bam = gb = fd->bams; |
4684 | 0 | fd->bams = gb->next; |
4685 | 0 | gb->next = NULL; |
4686 | 0 | gb->nbams = 0; |
4687 | 0 | gb->bam_mem = 0; |
4688 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4689 | 0 | } else { |
4690 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4691 | 0 | if (!(gb = calloc(1, sizeof(*gb)))) return -1; |
4692 | 0 | if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { |
4693 | 0 | free(gb); |
4694 | 0 | return -1; |
4695 | 0 | } |
4696 | 0 | gb->nbams = 0; |
4697 | 0 | gb->abams = SAM_NBAM; |
4698 | 0 | gb->bam_mem = 0; |
4699 | 0 | gb->fd = fd; |
4700 | 0 | fd->curr_idx = 0; |
4701 | 0 | fd->curr_bam = gb; |
4702 | 0 | } |
4703 | 0 | } |
4704 | | |
4705 | 0 | if (!bam_copy1(&gb->bams[gb->nbams++], b)) |
4706 | 0 | return -2; |
4707 | 0 | gb->bam_mem += b->l_data + sizeof(*b); |
4708 | | |
4709 | | // Dispatch if full |
4710 | 0 | if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { |
4711 | 0 | gb->serial = fd->serial++; |
4712 | 0 | pthread_mutex_lock(&fd->command_m); |
4713 | 0 | if (fd->errcode != 0) { |
4714 | 0 | pthread_mutex_unlock(&fd->command_m); |
4715 | 0 | return -fd->errcode; |
4716 | 0 | } |
4717 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb, |
4718 | 0 | cleanup_sp_bams, |
4719 | 0 | cleanup_sp_lines, 0) < 0) { |
4720 | 0 | pthread_mutex_unlock(&fd->command_m); |
4721 | 0 | return -1; |
4722 | 0 | } |
4723 | 0 | pthread_mutex_unlock(&fd->command_m); |
4724 | 0 | fd->curr_bam = NULL; |
4725 | 0 | } |
4726 | | |
4727 | | // Dummy value as we don't know how long it really is. |
4728 | | // We could track file sizes via a SAM_state field, but I don't think |
4729 | | // it is necessary. |
4730 | 0 | return 1; |
4731 | 1.51M | } else { |
4732 | 1.51M | if (sam_format1(h, b, &fp->line) < 0) return -1; |
4733 | 1.51M | kputc('\n', &fp->line); |
4734 | 1.51M | if (fp->is_bgzf) { |
4735 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4736 | 0 | return -1; |
4737 | 0 | if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4738 | 1.51M | } else { |
4739 | 1.51M | if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4740 | 1.51M | } |
4741 | | |
4742 | 1.51M | if (fp->idx) { |
4743 | 0 | if (fp->format.compression == bgzf) { |
4744 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4745 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4746 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4747 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4748 | 0 | return -1; |
4749 | 0 | } |
4750 | 0 | } else { |
4751 | 0 | if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4752 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4753 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4754 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4755 | 0 | return -1; |
4756 | 0 | } |
4757 | 0 | } |
4758 | 0 | } |
4759 | | |
4760 | 1.51M | return fp->line.l; |
4761 | 1.51M | } |
4762 | | |
4763 | | |
4764 | 0 | case fasta_format: |
4765 | 0 | case fastq_format: { |
4766 | 0 | fastq_state *x = (fastq_state *)fp->state; |
4767 | 0 | if (!x) { |
4768 | 0 | if (!(fp->state = fastq_state_init(fp->format.format |
4769 | 0 | == fastq_format ? '@' : '>'))) |
4770 | 0 | return -2; |
4771 | 0 | } |
4772 | | |
4773 | 0 | if (fastq_format1(fp->state, b, &fp->line) < 0) |
4774 | 0 | return -1; |
4775 | 0 | if (fp->is_bgzf) { |
4776 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4777 | 0 | return -1; |
4778 | 0 | if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) |
4779 | 0 | return -1; |
4780 | 0 | } else { |
4781 | 0 | if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) |
4782 | 0 | return -1; |
4783 | 0 | } |
4784 | 0 | return fp->line.l; |
4785 | 0 | } |
4786 | | |
4787 | 0 | default: |
4788 | 0 | errno = EBADF; |
4789 | 0 | return -1; |
4790 | 4.55M | } |
4791 | 4.55M | } |
4792 | | |
4793 | | /************************ |
4794 | | *** Auxiliary fields *** |
4795 | | ************************/ |
4796 | | #ifndef HTS_LITTLE_ENDIAN |
// Copies len bytes of aux value data of the given type from in to out,
// storing multi-byte elements with the *_to_le helpers.  Only compiled
// when HTS_LITTLE_ENDIAN is not defined.
//
// For 'B' arrays, in points at the element type byte followed by a 32-bit
// count; the header is converted and the elements handled recursively.
//
// Returns 0 on success, -1 if the type is unknown, if len is not a whole
// number of elements, or if a 'B' value is shorter than its 5 byte header.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    const int elem_sz = aux_type2size(type);

    // Data for 2..8 byte elements must be a multiple of the element size
    if (elem_sz >= 2 && elem_sz <= 8 && (len & (elem_sz - 1)) != 0)
        return -1;

#define aux_val_to_le(type_t, store_le) do {                            \
        type_t v;                                                       \
        size_t i;                                                       \
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
            memcpy(&v, in + i, sizeof(type_t));                         \
            store_le(v, out);                                           \
        }                                                               \
    } while (0)

    switch (elem_sz) {
    case 1: case 'Z': case 'H':
        // Byte-oriented data needs no conversion
        memcpy(out, in, len);
        break;

    case 2:
        aux_val_to_le(uint16_t, u16_to_le);
        break;

    case 4:
        aux_val_to_le(uint32_t, u32_to_le);
        break;

    case 8:
        aux_val_to_le(uint64_t, u64_to_le);
        break;

    case 'B': { // Recurse!
        uint32_t count;
        if (len < 5)
            return -1;
        memcpy(&count, in + 1, 4);
        out[0] = in[0];
        u32_to_le(count, out + 1);
        return aux_to_le(in[0], out + 5, in + 5, len - 5);
    }

    default: // Unknown type code
        return -1;
    }

#undef aux_val_to_le

    return 0;
}
4839 | | #endif |
4840 | | |
4841 | | int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data) |
4842 | 0 | { |
4843 | 0 | uint32_t new_len; |
4844 | |
|
4845 | 0 | assert(b->l_data >= 0); |
4846 | 0 | new_len = b->l_data + 3 + len; |
4847 | 0 | if (new_len > INT32_MAX || new_len < b->l_data) goto nomem; |
4848 | | |
4849 | 0 | if (realloc_bam_data(b, new_len) < 0) return -1; |
4850 | | |
4851 | 0 | b->data[b->l_data] = tag[0]; |
4852 | 0 | b->data[b->l_data + 1] = tag[1]; |
4853 | 0 | b->data[b->l_data + 2] = type; |
4854 | |
|
4855 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4856 | 0 | memcpy(b->data + b->l_data + 3, data, len); |
4857 | | #else |
4858 | | if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) { |
4859 | | errno = EINVAL; |
4860 | | return -1; |
4861 | | } |
4862 | | #endif |
4863 | |
|
4864 | 0 | b->l_data = new_len; |
4865 | |
|
4866 | 0 | return 0; |
4867 | | |
4868 | 0 | nomem: |
4869 | 0 | errno = ENOMEM; |
4870 | 0 | return -1; |
4871 | 0 | } |
4872 | | |
// Steps over one aux value.  s points at the value's type byte and end is
// one past the last byte of the record's data.
//
// Returns a pointer to the byte following the value; end if s is already
// at or beyond end, or if a 'Z' / 'H' string has no terminating NUL; and
// NULL if the type is unknown or the value does not fit before end.
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        s = memchr(s, 0, end-s);
        return s ? s+1 : end;
    case 'B':
        // Element type (1 byte) and element count (4 bytes) must be present
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        // Bounds check by division: size * n is a 32-bit product that can
        // wrap for a corrupt count and so slip past a direct comparison.
        if (size == 0 || (size_t)(end - s) / (size_t)size < n) return NULL;
        return s + (size_t)size * n;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4898 | | |
4899 | | uint8_t *bam_aux_first(const bam1_t *b) |
4900 | 1.65M | { |
4901 | 1.65M | uint8_t *s = bam_get_aux(b); |
4902 | 1.65M | uint8_t *end = b->data + b->l_data; |
4903 | 1.65M | if (end - s <= 2) { errno = ENOENT; return NULL; } |
4904 | 196k | return s+2; |
4905 | 1.65M | } |
4906 | | |
4907 | | uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) |
4908 | 1.59M | { |
4909 | 1.59M | uint8_t *end = b->data + b->l_data; |
4910 | 1.59M | uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; |
4911 | 1.59M | if (next == NULL) goto bad_aux; |
4912 | 1.59M | if (end - next <= 2) { errno = ENOENT; return NULL; } |
4913 | 1.48M | return next+2; |
4914 | | |
4915 | 630 | bad_aux: |
4916 | 630 | hts_log_error("Corrupted aux data for read %s flag %d", |
4917 | 630 | bam_get_qname(b), b->core.flag); |
4918 | 630 | errno = EINVAL; |
4919 | 630 | return NULL; |
4920 | 1.59M | } |
4921 | | |
4922 | | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) |
4923 | 1.65M | { |
4924 | 1.65M | uint8_t *s; |
4925 | 3.25M | for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) |
4926 | 1.67M | if (s[-2] == tag[0] && s[-1] == tag[1]) { |
4927 | | // Check the tag value is valid and complete |
4928 | 79.6k | uint8_t *e = skip_aux(s, b->data + b->l_data); |
4929 | 79.6k | if (e == NULL) goto bad_aux; |
4930 | 79.6k | if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; |
4931 | | |
4932 | 79.6k | return s; |
4933 | 79.6k | } |
4934 | | |
4935 | | // errno now as set by bam_aux_first()/bam_aux_next() |
4936 | 1.57M | return NULL; |
4937 | | |
4938 | 7 | bad_aux: |
4939 | 7 | hts_log_error("Corrupted aux data for read %s flag %d", |
4940 | 7 | bam_get_qname(b), b->core.flag); |
4941 | 7 | errno = EINVAL; |
4942 | 7 | return NULL; |
4943 | 1.65M | } |
4944 | | |
4945 | | int bam_aux_del(bam1_t *b, uint8_t *s) |
4946 | 0 | { |
4947 | 0 | s = bam_aux_remove(b, s); |
4948 | 0 | return (s || errno == ENOENT)? 0 : -1; |
4949 | 0 | } |
4950 | | |
4951 | | uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) |
4952 | 0 | { |
4953 | 0 | uint8_t *end = b->data + b->l_data; |
4954 | 0 | uint8_t *next = skip_aux(s, end); |
4955 | 0 | if (next == NULL) goto bad_aux; |
4956 | | |
4957 | 0 | b->l_data -= next - (s-2); |
4958 | 0 | if (next >= end) { errno = ENOENT; return NULL; } |
4959 | | |
4960 | 0 | memmove(s-2, next, end - next); |
4961 | 0 | return s; |
4962 | | |
4963 | 0 | bad_aux: |
4964 | 0 | hts_log_error("Corrupted aux data for read %s flag %d", |
4965 | 0 | bam_get_qname(b), b->core.flag); |
4966 | 0 | errno = EINVAL; |
4967 | 0 | return NULL; |
4968 | 0 | } |
4969 | | |
4970 | | int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) |
4971 | 0 | { |
4972 | | // FIXME: This is not at all efficient! |
4973 | 0 | size_t ln = len >= 0 ? len : strlen(data) + 1; |
4974 | 0 | size_t old_ln = 0; |
4975 | 0 | int need_nul = ln == 0 || data[ln - 1] != '\0'; |
4976 | 0 | int save_errno = errno; |
4977 | 0 | int new_tag = 0; |
4978 | 0 | uint8_t *s = bam_aux_get(b,tag), *e; |
4979 | |
|
4980 | 0 | if (s) { // Replacing existing tag |
4981 | 0 | char type = *s; |
4982 | 0 | if (type != 'Z') { |
4983 | 0 | hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type); |
4984 | 0 | errno = EINVAL; |
4985 | 0 | return -1; |
4986 | 0 | } |
4987 | 0 | s++; |
4988 | 0 | e = memchr(s, '\0', b->data + b->l_data - s); |
4989 | 0 | old_ln = (e ? e - s : b->data + b->l_data - s) + 1; |
4990 | 0 | s -= 3; |
4991 | 0 | } else { |
4992 | 0 | if (errno != ENOENT) { // Invalid aux data, give up |
4993 | 0 | return -1; |
4994 | 0 | } else { // Tag doesn't exist - put it on the end |
4995 | 0 | errno = save_errno; |
4996 | 0 | s = b->data + b->l_data; |
4997 | 0 | new_tag = 3; |
4998 | 0 | } |
4999 | 0 | } |
5000 | | |
5001 | 0 | if (old_ln < ln + need_nul + new_tag) { |
5002 | 0 | ptrdiff_t s_offset = s - b->data; |
5003 | 0 | if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0) |
5004 | 0 | return -1; |
5005 | 0 | s = b->data + s_offset; |
5006 | 0 | } |
5007 | 0 | if (!new_tag) { |
5008 | 0 | memmove(s + 3 + ln + need_nul, |
5009 | 0 | s + 3 + old_ln, |
5010 | 0 | b->l_data - (s + 3 - b->data) - old_ln); |
5011 | 0 | } |
5012 | 0 | b->l_data += new_tag + ln + need_nul - old_ln; |
5013 | |
|
5014 | 0 | s[0] = tag[0]; |
5015 | 0 | s[1] = tag[1]; |
5016 | 0 | s[2] = 'Z'; |
5017 | 0 | memmove(s+3,data,ln); |
5018 | 0 | if (need_nul) s[3 + ln] = '\0'; |
5019 | 0 | return 0; |
5020 | 0 | } |
5021 | | |
5022 | | int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val) |
5023 | 0 | { |
5024 | 0 | uint32_t sz, old_sz = 0, new = 0; |
5025 | 0 | uint8_t *s, type; |
5026 | |
|
5027 | 0 | if (val < INT32_MIN || val > UINT32_MAX) { |
5028 | 0 | errno = EOVERFLOW; |
5029 | 0 | return -1; |
5030 | 0 | } |
5031 | 0 | if (val < INT16_MIN) { type = 'i'; sz = 4; } |
5032 | 0 | else if (val < INT8_MIN) { type = 's'; sz = 2; } |
5033 | 0 | else if (val < 0) { type = 'c'; sz = 1; } |
5034 | 0 | else if (val < UINT8_MAX) { type = 'C'; sz = 1; } |
5035 | 0 | else if (val < UINT16_MAX) { type = 'S'; sz = 2; } |
5036 | 0 | else { type = 'I'; sz = 4; } |
5037 | |
|
5038 | 0 | s = bam_aux_get(b, tag); |
5039 | 0 | if (s) { // Tag present - how big was the old one? |
5040 | 0 | switch (*s) { |
5041 | 0 | case 'c': case 'C': old_sz = 1; break; |
5042 | 0 | case 's': case 'S': old_sz = 2; break; |
5043 | 0 | case 'i': case 'I': old_sz = 4; break; |
5044 | 0 | default: errno = EINVAL; return -1; // Not an integer |
5045 | 0 | } |
5046 | 0 | } else { |
5047 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5048 | 0 | s = b->data + b->l_data; |
5049 | 0 | new = 1; |
5050 | 0 | } else { // Invalid aux data, give up. |
5051 | 0 | return -1; |
5052 | 0 | } |
5053 | 0 | } |
5054 | | |
5055 | 0 | if (new || old_sz < sz) { |
5056 | | // Make room for new tag |
5057 | 0 | ptrdiff_t s_offset = s - b->data; |
5058 | 0 | if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0) |
5059 | 0 | return -1; |
5060 | 0 | s = b->data + s_offset; |
5061 | 0 | if (new) { // Add tag id |
5062 | 0 | *s++ = tag[0]; |
5063 | 0 | *s++ = tag[1]; |
5064 | 0 | } else { // Shift following data so we have space |
5065 | 0 | memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz); |
5066 | 0 | } |
5067 | 0 | } else { |
5068 | | // Reuse old space. Data value may be bigger than necessary but |
5069 | | // we avoid having to move everything else |
5070 | 0 | sz = old_sz; |
5071 | 0 | type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz]; |
5072 | 0 | assert(type > 0); |
5073 | 0 | } |
5074 | 0 | *s++ = type; |
5075 | 0 | #ifdef HTS_LITTLE_ENDIAN |
5076 | 0 | memcpy(s, &val, sz); |
5077 | | #else |
5078 | | switch (sz) { |
5079 | | case 4: u32_to_le(val, s); break; |
5080 | | case 2: u16_to_le(val, s); break; |
5081 | | default: *s = val; break; |
5082 | | } |
5083 | | #endif |
5084 | 0 | b->l_data += (new ? 3 : 0) + sz - old_sz; |
5085 | 0 | return 0; |
5086 | 0 | } |
5087 | | |
5088 | | int bam_aux_update_float(bam1_t *b, const char tag[2], float val) |
5089 | 0 | { |
5090 | 0 | uint8_t *s = bam_aux_get(b, tag); |
5091 | 0 | int shrink = 0, new = 0; |
5092 | |
|
5093 | 0 | if (s) { // Tag present - what was it? |
5094 | 0 | switch (*s) { |
5095 | 0 | case 'f': break; |
5096 | 0 | case 'd': shrink = 1; break; |
5097 | 0 | default: errno = EINVAL; return -1; // Not a float |
5098 | 0 | } |
5099 | 0 | } else { |
5100 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5101 | 0 | new = 1; |
5102 | 0 | } else { // Invalid aux data, give up. |
5103 | 0 | return -1; |
5104 | 0 | } |
5105 | 0 | } |
5106 | | |
5107 | 0 | if (new) { // Ensure there's room |
5108 | 0 | if (possibly_expand_bam_data(b, 3 + 4) < 0) |
5109 | 0 | return -1; |
5110 | 0 | s = b->data + b->l_data; |
5111 | 0 | *s++ = tag[0]; |
5112 | 0 | *s++ = tag[1]; |
5113 | 0 | } else if (shrink) { // Convert non-standard double tag to float |
5114 | 0 | memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data)); |
5115 | 0 | b->l_data -= 4; |
5116 | 0 | } |
5117 | 0 | *s++ = 'f'; |
5118 | 0 | float_to_le(val, s); |
5119 | 0 | if (new) b->l_data += 7; |
5120 | |
|
5121 | 0 | return 0; |
5122 | 0 | } |
5123 | | |
5124 | | int bam_aux_update_array(bam1_t *b, const char tag[2], |
5125 | | uint8_t type, uint32_t items, void *data) |
5126 | 0 | { |
5127 | 0 | uint8_t *s = bam_aux_get(b, tag); |
5128 | 0 | size_t old_sz = 0, new_sz; |
5129 | 0 | int new = 0; |
5130 | |
|
5131 | 0 | if (s) { // Tag present |
5132 | 0 | if (*s != 'B') { errno = EINVAL; return -1; } |
5133 | 0 | old_sz = aux_type2size(s[1]); |
5134 | 0 | if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; } |
5135 | 0 | old_sz *= le_to_u32(s + 2); |
5136 | 0 | } else { |
5137 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5138 | 0 | s = b->data + b->l_data; |
5139 | 0 | new = 1; |
5140 | 0 | } else { // Invalid aux data, give up. |
5141 | 0 | return -1; |
5142 | 0 | } |
5143 | 0 | } |
5144 | | |
5145 | 0 | new_sz = aux_type2size(type); |
5146 | 0 | if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; } |
5147 | 0 | if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; } |
5148 | 0 | new_sz *= items; |
5149 | |
|
5150 | 0 | if (new || old_sz < new_sz) { |
5151 | | // Make room for new tag |
5152 | 0 | ptrdiff_t s_offset = s - b->data; |
5153 | 0 | if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0) |
5154 | 0 | return -1; |
5155 | 0 | s = b->data + s_offset; |
5156 | 0 | } |
5157 | 0 | if (new) { // Add tag id and type |
5158 | 0 | *s++ = tag[0]; |
5159 | 0 | *s++ = tag[1]; |
5160 | 0 | *s = 'B'; |
5161 | 0 | b->l_data += 8 + new_sz; |
5162 | 0 | } else if (old_sz != new_sz) { // shift following data if necessary |
5163 | 0 | memmove(s + 6 + new_sz, s + 6 + old_sz, |
5164 | 0 | b->l_data - ((s + 6 + old_sz) - b->data)); |
5165 | 0 | b->l_data -= old_sz; |
5166 | 0 | b->l_data += new_sz; |
5167 | 0 | } |
5168 | |
|
5169 | 0 | s[1] = type; |
5170 | 0 | u32_to_le(items, s + 2); |
5171 | 0 | if (new_sz > 0) { |
5172 | 0 | #ifdef HTS_LITTLE_ENDIAN |
5173 | 0 | memcpy(s + 6, data, new_sz); |
5174 | | #else |
5175 | | return aux_to_le(type, s + 6, data, new_sz); |
5176 | | #endif |
5177 | 0 | } |
5178 | 0 | return 0; |
5179 | 0 | } |
5180 | | |
/*
 * Decode element 'idx' of a packed little-endian integer sequence at 's'
 * whose element type is 'type' (one of c, C, s, S, i, I).
 *
 * Returns the value widened to int64_t; for any other type code sets
 * errno to EINVAL and returns 0.
 */
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    // Not an integer type
    errno = EINVAL;
    return 0;
}
5196 | | |
/*
 * Read an integer aux value.  's' points at the type byte, with the
 * value stored immediately after it.
 *
 * Returns the value, or 0 with errno set to EINVAL when the type is not
 * one of the integer codes.
 */
int64_t bam_aux2i(const uint8_t *s)
{
    return get_int_aux_val(s[0], s + 1, 0);
}
5203 | | |
/*
 * Read a numeric aux value as a double.  's' points at the type byte.
 *
 * 'd' and 'f' values are decoded as floating point; anything else is
 * handed to the integer decoder, which sets errno to EINVAL and yields 0
 * for non-integer types.
 */
double bam_aux2f(const uint8_t *s)
{
    const uint8_t *payload = s + 1;

    switch (s[0]) {
    case 'd':
        return le_to_double(payload);
    case 'f':
        return le_to_float(payload);
    default:
        return get_int_aux_val(s[0], payload, 0);
    }
}
5212 | | |
/*
 * Read a single-character ('A') aux value.  's' points at the type byte.
 *
 * Returns the character, or 0 with errno set to EINVAL if the tag is not
 * of type 'A'.
 */
char bam_aux2A(const uint8_t *s)
{
    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }
    return *(const char *)(s + 1);
}
5221 | | |
/*
 * Get the string stored in a 'Z' or 'H' aux value.  's' points at the
 * type byte.
 *
 * Returns a pointer to the first byte after the type byte (no copy is
 * made), or NULL with errno set to EINVAL for any other type.
 */
char *bam_aux2Z(const uint8_t *s)
{
    switch (s[0]) {
    case 'Z':
    case 'H':
        return (char *)(s + 1);
    default:
        errno = EINVAL;
        return 0;
    }
}
5230 | | |
/*
 * Number of elements in a 'B' (array) aux value.  's' points at the type
 * byte; the little-endian 32-bit count is read from s + 2.
 *
 * Returns the count, or 0 with errno set to EINVAL if the tag is not an
 * array.
 */
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);

    errno = EINVAL;
    return 0;
}
5239 | | |
/*
 * Fetch element 'idx' of an integer 'B' array.  's' points at the type
 * byte.
 *
 * Returns the element, or 0 on error: errno is ERANGE when idx is not
 * below bam_auxB_len(s) (which is 0 for a non-array tag), and EINVAL when
 * the element type is not an integer code.
 */
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    uint32_t n_items = bam_auxB_len(s);

    if (idx < n_items)
        return get_int_aux_val(s[1], s + 6, idx);

    errno = ERANGE;
    return 0;
}
5249 | | |
/*
 * Fetch element 'idx' of a numeric 'B' array as a double.  's' points at
 * the type byte.
 *
 * Float ('f') arrays are decoded directly; every other element type goes
 * through the integer decoder.  Returns 0.0 with errno set to ERANGE when
 * idx is not below bam_auxB_len(s).
 */
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    const uint8_t *elems = s + 6;   // first element, after the 6-byte header

    if (idx >= bam_auxB_len(s)) {
        errno = ERANGE;
        return 0.0;
    }

    // Separate return statements so each value converts straight to double
    if (s[1] == 'f') {
        return le_to_float(elems + 4 * idx);
    } else {
        return get_int_aux_val(s[1], elems, idx);
    }
}
5260 | | |
5261 | | int sam_open_mode(char *mode, const char *fn, const char *format) |
5262 | 0 | { |
5263 | | // TODO Parse "bam5" etc for compression level |
5264 | 0 | if (format == NULL) { |
5265 | | // Try to pick a format based on the filename extension |
5266 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5267 | 0 | if (find_file_extension(fn, extension) < 0) return -1; |
5268 | 0 | return sam_open_mode(mode, fn, extension); |
5269 | 0 | } |
5270 | 0 | else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b"); |
5271 | 0 | else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c"); |
5272 | 0 | else if (strcasecmp(format, "sam") == 0) strcpy(mode, ""); |
5273 | 0 | else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); |
5274 | 0 | else if (strcasecmp(format, "fastq") == 0 || |
5275 | 0 | strcasecmp(format, "fq") == 0) strcpy(mode, "f"); |
5276 | 0 | else if (strcasecmp(format, "fastq.gz") == 0 || |
5277 | 0 | strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz"); |
5278 | 0 | else if (strcasecmp(format, "fasta") == 0 || |
5279 | 0 | strcasecmp(format, "fa") == 0) strcpy(mode, "F"); |
5280 | 0 | else if (strcasecmp(format, "fasta.gz") == 0 || |
5281 | 0 | strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz"); |
5282 | 0 | else return -1; |
5283 | | |
5284 | 0 | return 0; |
5285 | 0 | } |
5286 | | |
5287 | | // A version of sam_open_mode that can handle ,key=value options. |
5288 | | // The format string is allocated and returned, to be freed by the caller. |
5289 | | // Prefix should be "r" or "w", |
5290 | | char *sam_open_mode_opts(const char *fn, |
5291 | | const char *mode, |
5292 | | const char *format) |
5293 | 0 | { |
5294 | 0 | char *mode_opts = malloc((format ? strlen(format) : 1) + |
5295 | 0 | (mode ? strlen(mode) : 1) + 12); |
5296 | 0 | char *opts, *cp; |
5297 | 0 | int format_len; |
5298 | |
|
5299 | 0 | if (!mode_opts) |
5300 | 0 | return NULL; |
5301 | | |
5302 | 0 | strcpy(mode_opts, mode ? mode : "r"); |
5303 | 0 | cp = mode_opts + strlen(mode_opts); |
5304 | |
|
5305 | 0 | if (format == NULL) { |
5306 | | // Try to pick a format based on the filename extension |
5307 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5308 | 0 | if (find_file_extension(fn, extension) < 0) { |
5309 | 0 | free(mode_opts); |
5310 | 0 | return NULL; |
5311 | 0 | } |
5312 | 0 | if (sam_open_mode(cp, fn, extension) == 0) { |
5313 | 0 | return mode_opts; |
5314 | 0 | } else { |
5315 | 0 | free(mode_opts); |
5316 | 0 | return NULL; |
5317 | 0 | } |
5318 | 0 | } |
5319 | | |
5320 | 0 | if ((opts = strchr(format, ','))) { |
5321 | 0 | format_len = opts-format; |
5322 | 0 | } else { |
5323 | 0 | opts=""; |
5324 | 0 | format_len = strlen(format); |
5325 | 0 | } |
5326 | |
|
5327 | 0 | if (strncmp(format, "bam", format_len) == 0) { |
5328 | 0 | *cp++ = 'b'; |
5329 | 0 | } else if (strncmp(format, "cram", format_len) == 0) { |
5330 | 0 | *cp++ = 'c'; |
5331 | 0 | } else if (strncmp(format, "cram2", format_len) == 0) { |
5332 | 0 | *cp++ = 'c'; |
5333 | 0 | strcpy(cp, ",VERSION=2.1"); |
5334 | 0 | cp += 12; |
5335 | 0 | } else if (strncmp(format, "cram3", format_len) == 0) { |
5336 | 0 | *cp++ = 'c'; |
5337 | 0 | strcpy(cp, ",VERSION=3.0"); |
5338 | 0 | cp += 12; |
5339 | 0 | } else if (strncmp(format, "sam", format_len) == 0) { |
5340 | 0 | ; // format mode="" |
5341 | 0 | } else if (strncmp(format, "sam.gz", format_len) == 0) { |
5342 | 0 | *cp++ = 'z'; |
5343 | 0 | } else if (strncmp(format, "fastq", format_len) == 0 || |
5344 | 0 | strncmp(format, "fq", format_len) == 0) { |
5345 | 0 | *cp++ = 'f'; |
5346 | 0 | } else if (strncmp(format, "fastq.gz", format_len) == 0 || |
5347 | 0 | strncmp(format, "fq.gz", format_len) == 0) { |
5348 | 0 | *cp++ = 'f'; |
5349 | 0 | *cp++ = 'z'; |
5350 | 0 | } else if (strncmp(format, "fasta", format_len) == 0 || |
5351 | 0 | strncmp(format, "fa", format_len) == 0) { |
5352 | 0 | *cp++ = 'F'; |
5353 | 0 | } else if (strncmp(format, "fasta.gz", format_len) == 0 || |
5354 | 0 | strncmp(format, "fa", format_len) == 0) { |
5355 | 0 | *cp++ = 'F'; |
5356 | 0 | *cp++ = 'z'; |
5357 | 0 | } else { |
5358 | 0 | free(mode_opts); |
5359 | 0 | return NULL; |
5360 | 0 | } |
5361 | | |
5362 | 0 | strcpy(cp, opts); |
5363 | |
|
5364 | 0 | return mode_opts; |
5365 | 0 | } |
5366 | | |
5367 | 0 | #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) |
5368 | | int bam_str2flag(const char *str) |
5369 | 0 | { |
5370 | 0 | char *end, *beg = (char*) str; |
5371 | 0 | long int flag = strtol(str, &end, 0); |
5372 | 0 | if ( end!=str ) return flag; // the conversion was successful |
5373 | 0 | flag = 0; |
5374 | 0 | while ( *str ) |
5375 | 0 | { |
5376 | 0 | end = beg; |
5377 | 0 | while ( *end && *end!=',' ) end++; |
5378 | 0 | if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED; |
5379 | 0 | else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR; |
5380 | 0 | else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP; |
5381 | 0 | else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP; |
5382 | 0 | else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE; |
5383 | 0 | else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE; |
5384 | 0 | else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1; |
5385 | 0 | else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2; |
5386 | 0 | else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY; |
5387 | 0 | else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL; |
5388 | 0 | else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP; |
5389 | 0 | else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY; |
5390 | 0 | else return -1; |
5391 | 0 | if ( !*end ) break; |
5392 | 0 | beg = end + 1; |
5393 | 0 | } |
5394 | 0 | return flag; |
5395 | 0 | } |
5396 | | |
5397 | | char *bam_flag2str(int flag) |
5398 | 0 | { |
5399 | 0 | kstring_t str = {0,0,0}; |
5400 | 0 | if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED"); |
5401 | 0 | if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR"); |
5402 | 0 | if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP"); |
5403 | 0 | if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP"); |
5404 | 0 | if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE"); |
5405 | 0 | if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE"); |
5406 | 0 | if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1"); |
5407 | 0 | if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2"); |
5408 | 0 | if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY"); |
5409 | 0 | if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL"); |
5410 | 0 | if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP"); |
5411 | 0 | if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY"); |
5412 | 0 | if ( str.l == 0 ) kputsn("", 0, &str); |
5413 | 0 | return str.s; |
5414 | 0 | } |
5415 | | |
5416 | | |
5417 | | /************************** |
5418 | | *** Pileup and Mpileup *** |
5419 | | **************************/ |
5420 | | |
5421 | | #if !defined(BAM_NO_PILEUP) |
5422 | | |
5423 | | #include <assert.h> |
5424 | | |
5425 | | /******************* |
5426 | | *** Memory pool *** |
5427 | | *******************/ |
5428 | | |
5429 | | typedef struct { |
5430 | | int k, y; |
5431 | | hts_pos_t x, end; |
5432 | | } cstate_t; |
5433 | | |
5434 | | static cstate_t g_cstate_null = { -1, 0, 0, 0 }; |
5435 | | |
5436 | | typedef struct __linkbuf_t { |
5437 | | bam1_t b; |
5438 | | hts_pos_t beg, end; |
5439 | | cstate_t s; |
5440 | | struct __linkbuf_t *next; |
5441 | | bam_pileup_cd cd; |
5442 | | } lbnode_t; |
5443 | | |
5444 | | typedef struct { |
5445 | | int cnt, n, max; |
5446 | | lbnode_t **buf; |
5447 | | } mempool_t; |
5448 | | |
5449 | | static mempool_t *mp_init(void) |
5450 | 0 | { |
5451 | 0 | mempool_t *mp; |
5452 | 0 | mp = (mempool_t*)calloc(1, sizeof(mempool_t)); |
5453 | 0 | return mp; |
5454 | 0 | } |
5455 | | static void mp_destroy(mempool_t *mp) |
5456 | 0 | { |
5457 | 0 | int k; |
5458 | 0 | for (k = 0; k < mp->n; ++k) { |
5459 | 0 | free(mp->buf[k]->b.data); |
5460 | 0 | free(mp->buf[k]); |
5461 | 0 | } |
5462 | 0 | free(mp->buf); |
5463 | 0 | free(mp); |
5464 | 0 | } |
5465 | | static inline lbnode_t *mp_alloc(mempool_t *mp) |
5466 | 0 | { |
5467 | 0 | ++mp->cnt; |
5468 | 0 | if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); |
5469 | 0 | else return mp->buf[--mp->n]; |
5470 | 0 | } |
5471 | | static inline void mp_free(mempool_t *mp, lbnode_t *p) |
5472 | 0 | { |
5473 | 0 | --mp->cnt; p->next = 0; // clear lbnode_t::next here |
5474 | 0 | if (mp->n == mp->max) { |
5475 | 0 | mp->max = mp->max? mp->max<<1 : 256; |
5476 | 0 | mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); |
5477 | 0 | } |
5478 | 0 | mp->buf[mp->n++] = p; |
5479 | 0 | } |
5480 | | |
5481 | | /********************** |
5482 | | *** CIGAR resolver *** |
5483 | | **********************/ |
5484 | | |
5485 | | /* s->k: the index of the CIGAR operator that has just been processed. |
5486 | | s->x: the reference coordinate of the start of s->k |
5487 | | s->y: the query coordinate of the start of s->k |
5488 | | */ |
5489 | | static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) |
5490 | 0 | { |
5491 | 0 | #define _cop(c) ((c)&BAM_CIGAR_MASK) |
5492 | 0 | #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) |
5493 | |
|
5494 | 0 | bam1_t *b = p->b; |
5495 | 0 | bam1_core_t *c = &b->core; |
5496 | 0 | uint32_t *cigar = bam_get_cigar(b); |
5497 | 0 | int k; |
5498 | | // determine the current CIGAR operation |
5499 | | //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); |
5500 | 0 | if (s->k == -1) { // never processed |
5501 | 0 | p->qpos = 0; |
5502 | 0 | if (c->n_cigar == 1) { // just one operation, save a loop |
5503 | 0 | if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; |
5504 | 0 | } else { // find the first match or deletion |
5505 | 0 | for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { |
5506 | 0 | int op = _cop(cigar[k]); |
5507 | 0 | int l = _cln(cigar[k]); |
5508 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || |
5509 | 0 | op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5510 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5511 | 0 | } |
5512 | 0 | assert(k < c->n_cigar); |
5513 | 0 | s->k = k; |
5514 | 0 | } |
5515 | 0 | } else { // the read has been processed before |
5516 | 0 | int op, l = _cln(cigar[s->k]); |
5517 | 0 | if (pos - s->x >= l) { // jump to the next operation |
5518 | 0 | assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case |
5519 | 0 | op = _cop(cigar[s->k+1]); |
5520 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop |
5521 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5522 | 0 | s->x += l; |
5523 | 0 | ++s->k; |
5524 | 0 | } else { // find the next M/D/N/=/X |
5525 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5526 | 0 | s->x += l; |
5527 | 0 | for (k = s->k + 1; k < c->n_cigar; ++k) { |
5528 | 0 | op = _cop(cigar[k]), l = _cln(cigar[k]); |
5529 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5530 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5531 | 0 | } |
5532 | 0 | s->k = k; |
5533 | 0 | } |
5534 | 0 | assert(s->k < c->n_cigar); // otherwise a bug |
5535 | 0 | } // else, do nothing |
5536 | 0 | } |
5537 | 0 | { // collect pileup information |
5538 | 0 | int op, l; |
5539 | 0 | op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); |
5540 | 0 | p->is_del = p->indel = p->is_refskip = 0; |
5541 | 0 | if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation |
5542 | 0 | int op2 = _cop(cigar[s->k+1]); |
5543 | 0 | int l2 = _cln(cigar[s->k+1]); |
5544 | 0 | if (op2 == BAM_CDEL && op != BAM_CDEL) { |
5545 | | // At start of a new deletion, merge e.g. 1D2D to 3D. |
5546 | | // Within a deletion (the 2D in 1D2D) we keep p->indel=0 |
5547 | | // and rely on is_del=1 as we would for 3D. |
5548 | 0 | p->indel = -(int)l2; |
5549 | 0 | for (k = s->k+2; k < c->n_cigar; ++k) { |
5550 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5551 | 0 | if (op2 == BAM_CDEL) p->indel -= l2; |
5552 | 0 | else break; |
5553 | 0 | } |
5554 | 0 | } else if (op2 == BAM_CINS) { |
5555 | 0 | p->indel = l2; |
5556 | 0 | for (k = s->k+2; k < c->n_cigar; ++k) { |
5557 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5558 | 0 | if (op2 == BAM_CINS) p->indel += l2; |
5559 | 0 | else if (op2 != BAM_CPAD) break; |
5560 | 0 | } |
5561 | 0 | } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { |
5562 | 0 | int l3 = 0; |
5563 | 0 | for (k = s->k + 2; k < c->n_cigar; ++k) { |
5564 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5565 | 0 | if (op2 == BAM_CINS) l3 += l2; |
5566 | 0 | else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; |
5567 | 0 | } |
5568 | 0 | if (l3 > 0) p->indel = l3; |
5569 | 0 | } |
5570 | 0 | } |
5571 | 0 | if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { |
5572 | 0 | p->qpos = s->y + (pos - s->x); |
5573 | 0 | } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { |
5574 | 0 | p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! |
5575 | 0 | p->is_refskip = (op == BAM_CREF_SKIP); |
5576 | 0 | } // cannot be other operations; otherwise a bug |
5577 | 0 | p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); |
5578 | 0 | } |
5579 | 0 | p->cigar_ind = s->k; |
5580 | 0 | return 1; |
5581 | 0 | } |
5582 | | |
5583 | | /******************************* |
5584 | | *** Expansion of insertions *** |
5585 | | *******************************/ |
5586 | | |
5587 | | /* |
5588 | | * Fills out the kstring with the padded insertion sequence for the current |
5589 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5590 | | * |
5591 | | * This variant handles base modifications, but only when "m" is non-NULL. |
5592 | | * |
5593 | | * Returns the number of inserted base on success, with string length being |
5594 | | * accessable via ins->l; |
5595 | | * -1 on failure. |
5596 | | */ |
5597 | | int bam_plp_insertion_mod(const bam_pileup1_t *p, |
5598 | | hts_base_mod_state *m, |
5599 | 0 | kstring_t *ins, int *del_len) { |
5600 | 0 | int j, k, indel, nb = 0; |
5601 | 0 | uint32_t *cigar; |
5602 | |
|
5603 | 0 | if (p->indel <= 0) { |
5604 | 0 | if (ks_resize(ins, 1) < 0) |
5605 | 0 | return -1; |
5606 | 0 | ins->l = 0; |
5607 | 0 | ins->s[0] = '\0'; |
5608 | 0 | return 0; |
5609 | 0 | } |
5610 | | |
5611 | 0 | if (del_len) |
5612 | 0 | *del_len = 0; |
5613 | | |
5614 | | // Measure indel length including pads |
5615 | 0 | indel = 0; |
5616 | 0 | k = p->cigar_ind+1; |
5617 | 0 | cigar = bam_get_cigar(p->b); |
5618 | 0 | while (k < p->b->core.n_cigar) { |
5619 | 0 | switch (cigar[k] & BAM_CIGAR_MASK) { |
5620 | 0 | case BAM_CPAD: |
5621 | 0 | case BAM_CINS: |
5622 | 0 | indel += (cigar[k] >> BAM_CIGAR_SHIFT); |
5623 | 0 | break; |
5624 | 0 | default: |
5625 | 0 | k = p->b->core.n_cigar; |
5626 | 0 | break; |
5627 | 0 | } |
5628 | 0 | k++; |
5629 | 0 | } |
5630 | 0 | nb = ins->l = indel; |
5631 | | |
5632 | | // Produce sequence |
5633 | 0 | if (ks_resize(ins, indel+1) < 0) |
5634 | 0 | return -1; |
5635 | 0 | indel = 0; |
5636 | 0 | k = p->cigar_ind+1; |
5637 | 0 | j = 1; |
5638 | 0 | while (k < p->b->core.n_cigar) { |
5639 | 0 | int l, c; |
5640 | 0 | switch (cigar[k] & BAM_CIGAR_MASK) { |
5641 | 0 | case BAM_CPAD: |
5642 | 0 | for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++) |
5643 | 0 | ins->s[indel++] = '*'; |
5644 | 0 | break; |
5645 | 0 | case BAM_CINS: |
5646 | 0 | for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { |
5647 | 0 | c = p->qpos + j - p->is_del < p->b->core.l_qseq |
5648 | 0 | ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), |
5649 | 0 | p->qpos + j - p->is_del)] |
5650 | 0 | : 'N'; |
5651 | 0 | ins->s[indel++] = c; |
5652 | 0 | int nm; |
5653 | 0 | hts_base_mod mod[256]; |
5654 | 0 | if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, |
5655 | 0 | m, mod, 256)) > 0) { |
5656 | 0 | int o_indel = indel; |
5657 | 0 | if (ks_resize(ins, ins->l + nm*16+3) < 0) |
5658 | 0 | return -1; |
5659 | 0 | ins->s[indel++] = '['; |
5660 | 0 | int j; |
5661 | 0 | for (j = 0; j < nm; j++) { |
5662 | 0 | char qual[20]; |
5663 | 0 | if (mod[j].qual >= 0) |
5664 | 0 | snprintf(qual, sizeof(qual), "%d", mod[j].qual); |
5665 | 0 | else |
5666 | 0 | *qual=0; |
5667 | 0 | if (mod[j].modified_base < 0) |
5668 | | // ChEBI |
5669 | 0 | indel += snprintf(&ins->s[indel], ins->m - indel, |
5670 | 0 | "%c(%d)%s", |
5671 | 0 | "+-"[mod[j].strand], |
5672 | 0 | -mod[j].modified_base, |
5673 | 0 | qual); |
5674 | 0 | else |
5675 | 0 | indel += snprintf(&ins->s[indel], ins->m - indel, |
5676 | 0 | "%c%c%s", |
5677 | 0 | "+-"[mod[j].strand], |
5678 | 0 | mod[j].modified_base, |
5679 | 0 | qual); |
5680 | 0 | } |
5681 | 0 | ins->s[indel++] = ']'; |
5682 | 0 | ins->l += indel - o_indel; // grow by amount we used |
5683 | 0 | } |
5684 | 0 | } |
5685 | 0 | break; |
5686 | 0 | case BAM_CDEL: |
5687 | | // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style |
5688 | 0 | if (del_len) |
5689 | 0 | *del_len = cigar[k]>>BAM_CIGAR_SHIFT; |
5690 | | // fall through |
5691 | 0 | default: |
5692 | 0 | k = p->b->core.n_cigar; |
5693 | 0 | break; |
5694 | 0 | } |
5695 | 0 | k++; |
5696 | 0 | } |
5697 | 0 | ins->s[indel] = '\0'; |
5698 | 0 | ins->l = indel; // string length |
5699 | |
|
5700 | 0 | return nb; // base length |
5701 | 0 | } |
5702 | | |
5703 | | /* |
5704 | | * Fills out the kstring with the padded insertion sequence for the current |
5705 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5706 | | * |
5707 | | * This is the original interface with no capability for reporting base |
5708 | | * modifications. |
5709 | | * |
5710 | | * Returns the length of insertion string on success; |
5711 | | * -1 on failure. |
5712 | | */ |
5713 | 0 | int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { |
5714 | 0 | return bam_plp_insertion_mod(p, NULL, ins, del_len); |
5715 | 0 | } |
5716 | | |
5717 | | /*********************** |
5718 | | *** Pileup iterator *** |
5719 | | ***********************/ |
5720 | | |
5721 | | // Dictionary of overlapping reads |
5722 | | KHASH_MAP_INIT_STR(olap_hash, lbnode_t *) |
5723 | | typedef khash_t(olap_hash) olap_hash_t; |
5724 | | |
5725 | | struct bam_plp_s { |
5726 | | mempool_t *mp; |
5727 | | lbnode_t *head, *tail; |
5728 | | int32_t tid, max_tid; |
5729 | | hts_pos_t pos, max_pos; |
5730 | | int is_eof, max_plp, error, maxcnt; |
5731 | | uint64_t id; |
5732 | | bam_pileup1_t *plp; |
5733 | | // for the "auto" interface only |
5734 | | bam1_t *b; |
5735 | | bam_plp_auto_f func; |
5736 | | void *data; |
5737 | | olap_hash_t *overlaps; |
5738 | | |
5739 | | // For notification of creation and destruction events |
5740 | | // and associated client-owned pointer. |
5741 | | int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd); |
5742 | | int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd); |
5743 | | }; |
5744 | | |
5745 | | bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) |
5746 | 0 | { |
5747 | 0 | bam_plp_t iter; |
5748 | 0 | iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s)); |
5749 | 0 | iter->mp = mp_init(); |
5750 | 0 | iter->head = iter->tail = mp_alloc(iter->mp); |
5751 | 0 | iter->max_tid = iter->max_pos = -1; |
5752 | 0 | iter->maxcnt = 8000; |
5753 | 0 | if (func) { |
5754 | 0 | iter->func = func; |
5755 | 0 | iter->data = data; |
5756 | 0 | iter->b = bam_init1(); |
5757 | 0 | } |
5758 | 0 | return iter; |
5759 | 0 | } |
5760 | | |
5761 | | int bam_plp_init_overlaps(bam_plp_t iter) |
5762 | 0 | { |
5763 | 0 | iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads |
5764 | 0 | return iter->overlaps ? 0 : -1; |
5765 | 0 | } |
5766 | | |
5767 | | void bam_plp_destroy(bam_plp_t iter) |
5768 | 0 | { |
5769 | 0 | lbnode_t *p, *pnext; |
5770 | 0 | if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps); |
5771 | 0 | for (p = iter->head; p != NULL; p = pnext) { |
5772 | 0 | if (iter->plp_destruct && p != iter->tail) |
5773 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
5774 | 0 | pnext = p->next; |
5775 | 0 | mp_free(iter->mp, p); |
5776 | 0 | } |
5777 | 0 | mp_destroy(iter->mp); |
5778 | 0 | if (iter->b) bam_destroy1(iter->b); |
5779 | 0 | free(iter->plp); |
5780 | 0 | free(iter); |
5781 | 0 | } |
5782 | | |
5783 | | void bam_plp_constructor(bam_plp_t plp, |
5784 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5785 | 0 | plp->plp_construct = func; |
5786 | 0 | } |
5787 | | |
5788 | | void bam_plp_destructor(bam_plp_t plp, |
5789 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5790 | 0 | plp->plp_destruct = func; |
5791 | 0 | } |
5792 | | |
5793 | | //--------------------------------- |
5794 | | //--- Tweak overlapping reads |
5795 | | //--------------------------------- |
5796 | | |
5797 | | /** |
5798 | | * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index |
5799 | | * cigar_iref2iseq_next() - get the next CMATCH base |
5800 | | * @cigar: pointer to current cigar block (rw) |
5801 | | * @cigar_max: pointer just beyond the last cigar block |
5802 | | * @icig: position within the current cigar block (rw) |
5803 | | * @iseq: position in the sequence (rw) |
5804 | | * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) |
5805 | | * |
5806 | | * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, |
5807 | | * or -2 on error. |
5808 | | */ |
5809 | | static inline int cigar_iref2iseq_set(const uint32_t **cigar, |
5810 | | const uint32_t *cigar_max, |
5811 | | hts_pos_t *icig, |
5812 | | hts_pos_t *iseq, |
5813 | | hts_pos_t *iref) |
5814 | 0 | { |
5815 | 0 | hts_pos_t pos = *iref; |
5816 | 0 | if ( pos < 0 ) return -1; |
5817 | 0 | *icig = 0; |
5818 | 0 | *iseq = 0; |
5819 | 0 | *iref = 0; |
5820 | 0 | while ( *cigar<cigar_max ) |
5821 | 0 | { |
5822 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5823 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5824 | |
|
5825 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5826 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } |
5827 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5828 | 0 | { |
5829 | 0 | pos -= ncig; |
5830 | 0 | if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; } |
5831 | 0 | (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig; |
5832 | 0 | continue; |
5833 | 0 | } |
5834 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5835 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) |
5836 | 0 | { |
5837 | 0 | pos -= ncig; |
5838 | 0 | if ( pos<0 ) pos = 0; |
5839 | 0 | (*cigar)++; *icig = 0; *iref += ncig; |
5840 | 0 | continue; |
5841 | 0 | } |
5842 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5843 | 0 | return -2; |
5844 | 0 | } |
5845 | 0 | *iseq = -1; |
5846 | 0 | return -1; |
5847 | 0 | } |
5848 | | static inline int cigar_iref2iseq_next(const uint32_t **cigar, |
5849 | | const uint32_t *cigar_max, |
5850 | | hts_pos_t *icig, |
5851 | | hts_pos_t *iseq, |
5852 | | hts_pos_t *iref) |
5853 | 0 | { |
5854 | 0 | while ( *cigar < cigar_max ) |
5855 | 0 | { |
5856 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5857 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5858 | |
|
5859 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5860 | 0 | { |
5861 | 0 | if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } |
5862 | 0 | (*iseq)++; (*icig)++; (*iref)++; |
5863 | 0 | return BAM_CMATCH; |
5864 | 0 | } |
5865 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } |
5866 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5867 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5868 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } |
5869 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5870 | 0 | return -2; |
5871 | 0 | } |
5872 | 0 | *iseq = -1; |
5873 | 0 | *iref = -1; |
5874 | 0 | return -1; |
5875 | 0 | } |
5876 | | |
5877 | | // Given overlapping read 'a' (left) and 'b' (right) on the same |
5878 | | // template, adjust quality values to zero for either a or b. |
5879 | | // Note versions 1.12 and earlier always removed quality from 'b' for |
5880 | | // matching bases. Now we select a or b semi-randomly based on name hash. |
5881 | | // Returns 0 on success, |
5882 | | // -1 on failure |
5883 | | static int tweak_overlap_quality(bam1_t *a, bam1_t *b) |
5884 | 0 | { |
5885 | 0 | const uint32_t *a_cigar = bam_get_cigar(a), |
5886 | 0 | *a_cigar_max = a_cigar + a->core.n_cigar; |
5887 | 0 | const uint32_t *b_cigar = bam_get_cigar(b), |
5888 | 0 | *b_cigar_max = b_cigar + b->core.n_cigar; |
5889 | 0 | hts_pos_t a_icig = 0, a_iseq = 0; |
5890 | 0 | hts_pos_t b_icig = 0, b_iseq = 0; |
5891 | 0 | uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); |
5892 | 0 | uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); |
5893 | |
|
5894 | 0 | hts_pos_t iref = b->core.pos; |
5895 | 0 | hts_pos_t a_iref = iref - a->core.pos; |
5896 | 0 | hts_pos_t b_iref = iref - b->core.pos; |
5897 | |
|
5898 | 0 | int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, |
5899 | 0 | &a_icig, &a_iseq, &a_iref); |
5900 | 0 | if ( a_ret<0 ) |
5901 | | // no overlap or error |
5902 | 0 | return a_ret<-1 ? -1:0; |
5903 | | |
5904 | 0 | int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, |
5905 | 0 | &b_icig, &b_iseq, &b_iref); |
5906 | 0 | if ( b_ret<0 ) |
5907 | | // no overlap or error |
5908 | 0 | return b_ret<-1 ? -1:0; |
5909 | | |
5910 | | // Determine which seq is the one getting modified qualities. |
5911 | 0 | uint8_t amul, bmul; |
5912 | 0 | if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { |
5913 | 0 | amul = 1; |
5914 | 0 | bmul = 0; |
5915 | 0 | } else { |
5916 | 0 | amul = 0; |
5917 | 0 | bmul = 1; |
5918 | 0 | } |
5919 | | |
5920 | | // Loop over the overlapping region nulling qualities in either |
5921 | | // seq a or b. |
5922 | 0 | int err = 0; |
5923 | 0 | while ( 1 ) { |
5924 | | // Step to next matching reference position in a and b |
5925 | 0 | while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) |
5926 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5927 | 0 | &a_icig, &a_iseq, &a_iref); |
5928 | 0 | if ( a_ret<0 ) { // done |
5929 | 0 | err = a_ret<-1?-1:0; |
5930 | 0 | break; |
5931 | 0 | } |
5932 | | |
5933 | 0 | while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) |
5934 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, |
5935 | 0 | &b_iseq, &b_iref); |
5936 | 0 | if ( b_ret<0 ) { // done |
5937 | 0 | err = b_ret<-1?-1:0; |
5938 | 0 | break; |
5939 | 0 | } |
5940 | | |
5941 | 0 | if ( iref < a_iref + a->core.pos ) |
5942 | 0 | iref = a_iref + a->core.pos; |
5943 | |
|
5944 | 0 | if ( iref < b_iref + b->core.pos ) |
5945 | 0 | iref = b_iref + b->core.pos; |
5946 | |
|
5947 | 0 | iref++; |
5948 | | |
5949 | | // If A or B has a deletion then we catch up the other to this point. |
5950 | | // We also amend quality values using the same rules for mismatch. |
5951 | 0 | if (a_iref+a->core.pos != b_iref+b->core.pos) { |
5952 | 0 | if (a_iref+a->core.pos < b_iref+b->core.pos |
5953 | 0 | && b_cigar > bam_get_cigar(b) |
5954 | 0 | && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) { |
5955 | | // Del in B means it's moved on further than A |
5956 | 0 | do { |
5957 | 0 | a_qual[a_iseq] = amul |
5958 | 0 | ? a_qual[a_iseq]*0.8 |
5959 | 0 | : 0; |
5960 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5961 | 0 | &a_icig, &a_iseq, &a_iref); |
5962 | 0 | if (a_ret < 0) |
5963 | 0 | return -(a_ret<-1); // 0 or -1 |
5964 | 0 | } while (a_iref + a->core.pos < b_iref+b->core.pos); |
5965 | 0 | } else if (a_cigar > bam_get_cigar(a) |
5966 | 0 | && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) { |
5967 | | // Del in A means it's moved on further than B |
5968 | 0 | do { |
5969 | 0 | b_qual[b_iseq] = bmul |
5970 | 0 | ? b_qual[b_iseq]*0.8 |
5971 | 0 | : 0; |
5972 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, |
5973 | 0 | &b_icig, &b_iseq, &b_iref); |
5974 | 0 | if (b_ret < 0) |
5975 | 0 | return -(b_ret<-1); // 0 or -1 |
5976 | 0 | } while (b_iref + b->core.pos < a_iref+a->core.pos); |
5977 | 0 | } else { |
5978 | | // Anything else, eg ref-skip, we don't support here |
5979 | 0 | continue; |
5980 | 0 | } |
5981 | 0 | } |
5982 | | |
5983 | | // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld " |
5984 | | // "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n", |
5985 | | // a_cigar-bam_get_cigar(a), a_icig, |
5986 | | // b_cigar-bam_get_cigar(b), b_icig, |
5987 | | // iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1, |
5988 | | // a_iseq, b_iseq); |
5989 | | |
5990 | 0 | if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) |
5991 | | // Fell off end of sequence, bad CIGAR? |
5992 | 0 | return -1; |
5993 | | |
5994 | | // We're finally at the same ref base in both a and b. |
5995 | | // Check if the bases match (confident) or mismatch |
5996 | | // (not so confident). |
5997 | 0 | if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { |
5998 | | // We are very confident about this base. Use sum of quals |
5999 | 0 | int qual = a_qual[a_iseq] + b_qual[b_iseq]; |
6000 | 0 | a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); |
6001 | 0 | b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; |
6002 | 0 | } else { |
6003 | | // Not so confident about anymore given the mismatch. |
6004 | | // Reduce qual for lowest quality base. |
6005 | 0 | if ( a_qual[a_iseq] > b_qual[b_iseq] ) { |
6006 | | // A highest qual base; keep |
6007 | 0 | a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; |
6008 | 0 | b_qual[b_iseq] = 0; |
6009 | 0 | } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { |
6010 | | // B highest qual base; keep |
6011 | 0 | b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; |
6012 | 0 | a_qual[a_iseq] = 0; |
6013 | 0 | } else { |
6014 | | // Both equal, so pick randomly |
6015 | 0 | a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; |
6016 | 0 | b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; |
6017 | 0 | } |
6018 | 0 | } |
6019 | 0 | } |
6020 | | |
6021 | 0 | return err; |
6022 | 0 | } |
6023 | | |
6024 | | // Fix overlapping reads. Simple soft-clipping did not give good results. |
6025 | | // Lowering qualities of unwanted bases is more selective and works better. |
6026 | | // |
6027 | | // Returns 0 on success, -1 on failure |
6028 | | static int overlap_push(bam_plp_t iter, lbnode_t *node) |
6029 | 0 | { |
6030 | 0 | if ( !iter->overlaps ) return 0; |
6031 | | |
6032 | | // mapped mates and paired reads only |
6033 | 0 | if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0; |
6034 | | |
6035 | | // no overlap possible, unless some wild cigar |
6036 | 0 | if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid) |
6037 | 0 | || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq |
6038 | 0 | && node->b.core.mpos >= node->end) // for those wild cigars |
6039 | 0 | ) return 0; |
6040 | | |
6041 | 0 | khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); |
6042 | 0 | if ( kitr==kh_end(iter->overlaps) ) |
6043 | 0 | { |
6044 | | // Only add reads where the mate is still to arrive |
6045 | 0 | if (node->b.core.mpos >= node->b.core.pos || |
6046 | 0 | ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) { |
6047 | 0 | int ret; |
6048 | 0 | kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); |
6049 | 0 | if (ret < 0) return -1; |
6050 | 0 | kh_value(iter->overlaps, kitr) = node; |
6051 | 0 | } |
6052 | 0 | } |
6053 | 0 | else |
6054 | 0 | { |
6055 | 0 | lbnode_t *a = kh_value(iter->overlaps, kitr); |
6056 | 0 | int err = tweak_overlap_quality(&a->b, &node->b); |
6057 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
6058 | 0 | assert(a->end-1 == a->s.end); |
6059 | 0 | return err; |
6060 | 0 | } |
6061 | 0 | return 0; |
6062 | 0 | } |
6063 | | |
6064 | | static void overlap_remove(bam_plp_t iter, const bam1_t *b) |
6065 | 0 | { |
6066 | 0 | if ( !iter->overlaps ) return; |
6067 | | |
6068 | 0 | khiter_t kitr; |
6069 | 0 | if ( b ) |
6070 | 0 | { |
6071 | 0 | kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b)); |
6072 | 0 | if ( kitr!=kh_end(iter->overlaps) ) |
6073 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
6074 | 0 | } |
6075 | 0 | else |
6076 | 0 | { |
6077 | | // remove all |
6078 | 0 | for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++) |
6079 | 0 | if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr); |
6080 | 0 | } |
6081 | 0 | } |
6082 | | |
6083 | | |
6084 | | |
6085 | | // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns |
6086 | | // pointer to the piled records if next position is ready or NULL if there is not enough records in the |
6087 | | // buffer yet (the current position is still the maximum position across all buffered reads). |
6088 | | const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
6089 | 0 | { |
6090 | 0 | if (iter->error) { *_n_plp = -1; return NULL; } |
6091 | 0 | *_n_plp = 0; |
6092 | 0 | if (iter->is_eof && iter->head == iter->tail) return NULL; |
6093 | 0 | while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { |
6094 | 0 | int n_plp = 0; |
6095 | | // write iter->plp at iter->pos |
6096 | 0 | lbnode_t **pptr = &iter->head; |
6097 | 0 | while (*pptr != iter->tail) { |
6098 | 0 | lbnode_t *p = *pptr; |
6099 | 0 | if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove |
6100 | 0 | overlap_remove(iter, &p->b); |
6101 | 0 | if (iter->plp_destruct) |
6102 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
6103 | 0 | *pptr = p->next; mp_free(iter->mp, p); |
6104 | 0 | } |
6105 | 0 | else { |
6106 | 0 | if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup |
6107 | 0 | if (n_plp == iter->max_plp) { // then double the capacity |
6108 | 0 | iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; |
6109 | 0 | iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); |
6110 | 0 | } |
6111 | 0 | iter->plp[n_plp].b = &p->b; |
6112 | 0 | iter->plp[n_plp].cd = p->cd; |
6113 | 0 | if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... |
6114 | 0 | } |
6115 | 0 | pptr = &(*pptr)->next; |
6116 | 0 | } |
6117 | 0 | } |
6118 | 0 | *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; |
6119 | | // update iter->tid and iter->pos |
6120 | 0 | if (iter->head != iter->tail) { |
6121 | 0 | if (iter->tid > iter->head->b.core.tid) { |
6122 | 0 | hts_log_error("Unsorted input. Pileup aborts"); |
6123 | 0 | iter->error = 1; |
6124 | 0 | *_n_plp = -1; |
6125 | 0 | return NULL; |
6126 | 0 | } |
6127 | 0 | } |
6128 | 0 | if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence |
6129 | 0 | iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference |
6130 | 0 | } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid |
6131 | 0 | iter->pos = iter->head->beg; // jump to the next position |
6132 | 0 | } else ++iter->pos; // scan contiguously |
6133 | | // return |
6134 | 0 | if (n_plp) return iter->plp; |
6135 | 0 | if (iter->is_eof && iter->head == iter->tail) break; |
6136 | 0 | } |
6137 | 0 | return NULL; |
6138 | 0 | } |
6139 | | |
6140 | | const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
6141 | 0 | { |
6142 | 0 | hts_pos_t pos64 = 0; |
6143 | 0 | const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); |
6144 | 0 | if (pos64 < INT_MAX) { |
6145 | 0 | *_pos = pos64; |
6146 | 0 | } else { |
6147 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6148 | 0 | *_pos = INT_MAX; |
6149 | 0 | iter->error = 1; |
6150 | 0 | *_n_plp = -1; |
6151 | 0 | return NULL; |
6152 | 0 | } |
6153 | 0 | return p; |
6154 | 0 | } |
6155 | | |
6156 | | int bam_plp_push(bam_plp_t iter, const bam1_t *b) |
6157 | 0 | { |
6158 | 0 | if (iter->error) return -1; |
6159 | 0 | if (b) { |
6160 | 0 | if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } |
6161 | | // Skip only unmapped reads here, any additional filtering must be done in iter->func |
6162 | 0 | if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } |
6163 | 0 | if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) |
6164 | 0 | { |
6165 | 0 | overlap_remove(iter, b); |
6166 | 0 | return 0; |
6167 | 0 | } |
6168 | 0 | if (bam_copy1(&iter->tail->b, b) == NULL) |
6169 | 0 | return -1; |
6170 | 0 | iter->tail->b.id = iter->id++; |
6171 | 0 | iter->tail->beg = b->core.pos; |
6172 | | // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1 |
6173 | 0 | iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
6174 | 0 | iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t |
6175 | 0 | if (b->core.tid < iter->max_tid) { |
6176 | 0 | hts_log_error("The input is not sorted (chromosomes out of order)"); |
6177 | 0 | iter->error = 1; |
6178 | 0 | return -1; |
6179 | 0 | } |
6180 | 0 | if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { |
6181 | 0 | hts_log_error("The input is not sorted (reads out of order)"); |
6182 | 0 | iter->error = 1; |
6183 | 0 | return -1; |
6184 | 0 | } |
6185 | 0 | iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; |
6186 | 0 | if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { |
6187 | 0 | lbnode_t *next = mp_alloc(iter->mp); |
6188 | 0 | if (!next) { |
6189 | 0 | iter->error = 1; |
6190 | 0 | return -1; |
6191 | 0 | } |
6192 | 0 | if (iter->plp_construct) { |
6193 | 0 | if (iter->plp_construct(iter->data, &iter->tail->b, |
6194 | 0 | &iter->tail->cd) < 0) { |
6195 | 0 | mp_free(iter->mp, next); |
6196 | 0 | iter->error = 1; |
6197 | 0 | return -1; |
6198 | 0 | } |
6199 | 0 | } |
6200 | 0 | if (overlap_push(iter, iter->tail) < 0) { |
6201 | 0 | mp_free(iter->mp, next); |
6202 | 0 | iter->error = 1; |
6203 | 0 | return -1; |
6204 | 0 | } |
6205 | 0 | iter->tail->next = next; |
6206 | 0 | iter->tail = iter->tail->next; |
6207 | 0 | } |
6208 | 0 | } else iter->is_eof = 1; |
6209 | 0 | return 0; |
6210 | 0 | } |
6211 | | |
6212 | | const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
6213 | 0 | { |
6214 | 0 | const bam_pileup1_t *plp; |
6215 | 0 | if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } |
6216 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6217 | 0 | else { // no pileup line can be obtained; read alignments |
6218 | 0 | *_n_plp = 0; |
6219 | 0 | if (iter->is_eof) return 0; |
6220 | 0 | int ret; |
6221 | 0 | while ( (ret=iter->func(iter->data, iter->b)) >= 0) { |
6222 | 0 | if (bam_plp_push(iter, iter->b) < 0) { |
6223 | 0 | *_n_plp = -1; |
6224 | 0 | return 0; |
6225 | 0 | } |
6226 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6227 | | // otherwise no pileup line can be returned; read the next alignment. |
6228 | 0 | } |
6229 | 0 | if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } |
6230 | 0 | if (bam_plp_push(iter, 0) < 0) { |
6231 | 0 | *_n_plp = -1; |
6232 | 0 | return 0; |
6233 | 0 | } |
6234 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6235 | 0 | return 0; |
6236 | 0 | } |
6237 | 0 | } |
6238 | | |
6239 | | const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
6240 | 0 | { |
6241 | 0 | hts_pos_t pos64 = 0; |
6242 | 0 | const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); |
6243 | 0 | if (pos64 < INT_MAX) { |
6244 | 0 | *_pos = pos64; |
6245 | 0 | } else { |
6246 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6247 | 0 | *_pos = INT_MAX; |
6248 | 0 | iter->error = 1; |
6249 | 0 | *_n_plp = -1; |
6250 | 0 | return NULL; |
6251 | 0 | } |
6252 | 0 | return p; |
6253 | 0 | } |
6254 | | |
6255 | | void bam_plp_reset(bam_plp_t iter) |
6256 | 0 | { |
6257 | 0 | overlap_remove(iter, NULL); |
6258 | 0 | iter->max_tid = iter->max_pos = -1; |
6259 | 0 | iter->tid = iter->pos = 0; |
6260 | 0 | iter->is_eof = 0; |
6261 | 0 | while (iter->head != iter->tail) { |
6262 | 0 | lbnode_t *p = iter->head; |
6263 | 0 | iter->head = p->next; |
6264 | 0 | mp_free(iter->mp, p); |
6265 | 0 | } |
6266 | 0 | } |
6267 | | |
6268 | | void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) |
6269 | 0 | { |
6270 | 0 | iter->maxcnt = maxcnt; |
6271 | 0 | } |
6272 | | |
6273 | | /************************ |
6274 | | *** Mpileup iterator *** |
6275 | | ************************/ |
6276 | | |
6277 | | struct bam_mplp_s { |
6278 | | int n; |
6279 | | int32_t min_tid, *tid; |
6280 | | hts_pos_t min_pos, *pos; |
6281 | | bam_plp_t *iter; |
6282 | | int *n_plp; |
6283 | | const bam_pileup1_t **plp; |
6284 | | }; |
6285 | | |
6286 | | bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) |
6287 | 0 | { |
6288 | 0 | int i; |
6289 | 0 | bam_mplp_t iter; |
6290 | 0 | iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s)); |
6291 | 0 | iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); |
6292 | 0 | iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); |
6293 | 0 | iter->n_plp = (int*)calloc(n, sizeof(int)); |
6294 | 0 | iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); |
6295 | 0 | iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); |
6296 | 0 | iter->n = n; |
6297 | 0 | iter->min_pos = HTS_POS_MAX; |
6298 | 0 | iter->min_tid = (uint32_t)-1; |
6299 | 0 | for (i = 0; i < n; ++i) { |
6300 | 0 | iter->iter[i] = bam_plp_init(func, data[i]); |
6301 | 0 | iter->pos[i] = iter->min_pos; |
6302 | 0 | iter->tid[i] = iter->min_tid; |
6303 | 0 | } |
6304 | 0 | return iter; |
6305 | 0 | } |
6306 | | |
6307 | | int bam_mplp_init_overlaps(bam_mplp_t iter) |
6308 | 0 | { |
6309 | 0 | int i, r = 0; |
6310 | 0 | for (i = 0; i < iter->n; ++i) |
6311 | 0 | r |= bam_plp_init_overlaps(iter->iter[i]); |
6312 | 0 | return r == 0 ? 0 : -1; |
6313 | 0 | } |
6314 | | |
6315 | | void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) |
6316 | 0 | { |
6317 | 0 | int i; |
6318 | 0 | for (i = 0; i < iter->n; ++i) |
6319 | 0 | iter->iter[i]->maxcnt = maxcnt; |
6320 | 0 | } |
6321 | | |
6322 | | void bam_mplp_destroy(bam_mplp_t iter) |
6323 | 0 | { |
6324 | 0 | int i; |
6325 | 0 | for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); |
6326 | 0 | free(iter->iter); free(iter->pos); free(iter->tid); |
6327 | 0 | free(iter->n_plp); free(iter->plp); |
6328 | 0 | free(iter); |
6329 | 0 | } |
6330 | | |
6331 | | int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) |
6332 | 0 | { |
6333 | 0 | int i, ret = 0; |
6334 | 0 | hts_pos_t new_min_pos = HTS_POS_MAX; |
6335 | 0 | uint32_t new_min_tid = (uint32_t)-1; |
6336 | 0 | for (i = 0; i < iter->n; ++i) { |
6337 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6338 | 0 | int tid; |
6339 | 0 | hts_pos_t pos; |
6340 | 0 | iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); |
6341 | 0 | if ( iter->iter[i]->error ) return -1; |
6342 | 0 | if (iter->plp[i]) { |
6343 | 0 | iter->tid[i] = tid; |
6344 | 0 | iter->pos[i] = pos; |
6345 | 0 | } else { |
6346 | 0 | iter->tid[i] = 0; |
6347 | 0 | iter->pos[i] = 0; |
6348 | 0 | } |
6349 | 0 | } |
6350 | 0 | if (iter->plp[i]) { |
6351 | 0 | if (iter->tid[i] < new_min_tid) { |
6352 | 0 | new_min_tid = iter->tid[i]; |
6353 | 0 | new_min_pos = iter->pos[i]; |
6354 | 0 | } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) { |
6355 | 0 | new_min_pos = iter->pos[i]; |
6356 | 0 | } |
6357 | 0 | } |
6358 | 0 | } |
6359 | 0 | iter->min_pos = new_min_pos; |
6360 | 0 | iter->min_tid = new_min_tid; |
6361 | 0 | if (new_min_pos == HTS_POS_MAX) return 0; |
6362 | 0 | *_tid = new_min_tid; *_pos = new_min_pos; |
6363 | 0 | for (i = 0; i < iter->n; ++i) { |
6364 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6365 | 0 | n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; |
6366 | 0 | ++ret; |
6367 | 0 | } else n_plp[i] = 0, plp[i] = 0; |
6368 | 0 | } |
6369 | 0 | return ret; |
6370 | 0 | } |
6371 | | |
6372 | | int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) |
6373 | 0 | { |
6374 | 0 | hts_pos_t pos64 = 0; |
6375 | 0 | int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); |
6376 | 0 | if (ret >= 0) { |
6377 | 0 | if (pos64 < INT_MAX) { |
6378 | 0 | *_pos = pos64; |
6379 | 0 | } else { |
6380 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6381 | 0 | *_pos = INT_MAX; |
6382 | 0 | return -1; |
6383 | 0 | } |
6384 | 0 | } |
6385 | 0 | return ret; |
6386 | 0 | } |
6387 | | |
6388 | | void bam_mplp_reset(bam_mplp_t iter) |
6389 | 0 | { |
6390 | 0 | int i; |
6391 | 0 | iter->min_pos = HTS_POS_MAX; |
6392 | 0 | iter->min_tid = (uint32_t)-1; |
6393 | 0 | for (i = 0; i < iter->n; ++i) { |
6394 | 0 | bam_plp_reset(iter->iter[i]); |
6395 | 0 | iter->pos[i] = HTS_POS_MAX; |
6396 | 0 | iter->tid[i] = (uint32_t)-1; |
6397 | 0 | iter->n_plp[i] = 0; |
6398 | 0 | iter->plp[i] = NULL; |
6399 | 0 | } |
6400 | 0 | } |
6401 | | |
6402 | | void bam_mplp_constructor(bam_mplp_t iter, |
6403 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6404 | 0 | int i; |
6405 | 0 | for (i = 0; i < iter->n; ++i) |
6406 | 0 | bam_plp_constructor(iter->iter[i], func); |
6407 | 0 | } |
6408 | | |
6409 | | void bam_mplp_destructor(bam_mplp_t iter, |
6410 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6411 | 0 | int i; |
6412 | 0 | for (i = 0; i < iter->n; ++i) |
6413 | 0 | bam_plp_destructor(iter->iter[i], func); |
6414 | 0 | } |
6415 | | |
6416 | | #endif // ~!defined(BAM_NO_PILEUP) |