Line | Count | Source |
1 | | /* sam.c -- SAM and BAM file I/O and manipulation. |
2 | | |
3 | | Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd. |
4 | | Copyright (C) 2010, 2012, 2013 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <strings.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <errno.h> |
34 | | #include <zlib.h> |
35 | | #include <assert.h> |
36 | | #include <signal.h> |
37 | | #include <inttypes.h> |
38 | | #include <unistd.h> |
39 | | #include <regex.h> |
40 | | |
41 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
42 | | #include "fuzz_settings.h" |
43 | | #endif |
44 | | |
45 | | // Suppress deprecation message for cigar_tab, which we initialise |
46 | | #include "htslib/hts_defs.h" |
47 | | #undef HTS_DEPRECATED |
48 | | #define HTS_DEPRECATED(message) |
49 | | |
50 | | #include "htslib/sam.h" |
51 | | #include "htslib/bgzf.h" |
52 | | #include "cram/cram.h" |
53 | | #include "hts_internal.h" |
54 | | #include "sam_internal.h" |
55 | | #include "htslib/hfile.h" |
56 | | #include "htslib/hts_endian.h" |
57 | | #include "htslib/hts_expr.h" |
58 | | #include "header.h" |
59 | | |
60 | | #include "htslib/khash.h" |
61 | | KHASH_DECLARE(s2i, kh_cstr_t, int64_t) |
62 | | KHASH_SET_INIT_INT(tag) |
63 | | |
64 | | #ifndef EFTYPE |
65 | 0 | #define EFTYPE ENOEXEC |
66 | | #endif |
67 | | #ifndef EOVERFLOW |
68 | | #define EOVERFLOW ERANGE |
69 | | #endif |
70 | | |
71 | | /********************** |
72 | | *** BAM header I/O *** |
73 | | **********************/ |
74 | | |
/*
 * Lookup table indexed by an unsigned byte value, giving the BAM_C*
 * operation code for each CIGAR operator character.  Only the characters
 * '=', 'B', 'D', 'H', 'I', 'M', 'N', 'P', 'S' and 'X' have an entry; every
 * other byte value maps to -1.
 */
HTSLIB_EXPORT
const int8_t bam_cigar_table[256] = {
    // 0 .. 47
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    // 48 .. 63 (including =, at index 61)
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1,

    // 64 .. 79 (including B, D, H, I, M, N)
    -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1,
    BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1,

    // 80 .. 95 (including P, S, X)
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1,
    BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1,

    // 96 .. 127 (lower-case letters are not recognised)
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

    // 128 .. 255
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
107 | | |
108 | | sam_hdr_t *sam_hdr_init(void) |
109 | 6.66k | { |
110 | 6.66k | sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); |
111 | 6.66k | if (bh == NULL) return NULL; |
112 | | |
113 | 6.66k | bh->cigar_tab = bam_cigar_table; |
114 | 6.66k | return bh; |
115 | 6.66k | } |
116 | | |
117 | | void sam_hdr_destroy(sam_hdr_t *bh) |
118 | 15.2k | { |
119 | 15.2k | int32_t i; |
120 | | |
121 | 15.2k | if (bh == NULL) return; |
122 | | |
123 | 8.53k | if (bh->ref_count > 0) { |
124 | 1.86k | --bh->ref_count; |
125 | 1.86k | return; |
126 | 1.86k | } |
127 | | |
128 | 6.66k | if (bh->target_name) { |
129 | 7.37k | for (i = 0; i < bh->n_targets; ++i) |
130 | 4.17k | free(bh->target_name[i]); |
131 | 3.19k | free(bh->target_name); |
132 | 3.19k | free(bh->target_len); |
133 | 3.19k | } |
134 | 6.66k | free(bh->text); |
135 | 6.66k | if (bh->hrecs) |
136 | 4.14k | sam_hrecs_free(bh->hrecs); |
137 | 6.66k | if (bh->sdict) |
138 | 288 | kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); |
139 | 6.66k | free(bh); |
140 | 6.66k | } |
141 | | |
142 | | // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long |
143 | | // references before sam_hdr_t::hrecs is populated |
144 | | int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) |
145 | 0 | { |
146 | 0 | const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; |
147 | 0 | khash_t(s2i) *dest_long_refs = kh_init(s2i); |
148 | 0 | int i; |
149 | 0 | if (!dest_long_refs) return -1; |
150 | | |
151 | 0 | for (i = 0; i < h->n_targets; i++) { |
152 | 0 | int ret; |
153 | 0 | khiter_t ksrc, kdest; |
154 | 0 | if (h->target_len[i] < UINT32_MAX) continue; |
155 | 0 | ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); |
156 | 0 | if (ksrc == kh_end(src_long_refs)) continue; |
157 | 0 | kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret); |
158 | 0 | if (ret < 0) { |
159 | 0 | kh_destroy(s2i, dest_long_refs); |
160 | 0 | return -1; |
161 | 0 | } |
162 | 0 | kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); |
163 | 0 | } |
164 | | |
165 | 0 | h->sdict = dest_long_refs; |
166 | 0 | return 0; |
167 | 0 | } |
168 | | |
169 | | sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) |
170 | 3.87k | { |
171 | 3.87k | if (h0 == NULL) return NULL; |
172 | 3.87k | sam_hdr_t *h; |
173 | 3.87k | if ((h = sam_hdr_init()) == NULL) return NULL; |
174 | | // copy the simple data |
175 | 3.87k | h->n_targets = 0; |
176 | 3.87k | h->ignore_sam_err = h0->ignore_sam_err; |
177 | 3.87k | h->l_text = 0; |
178 | | |
179 | | // Then the pointery stuff |
180 | | |
181 | 3.87k | if (!h0->hrecs) { |
182 | 3 | h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t)); |
183 | 3 | if (!h->target_len) goto fail; |
184 | 3 | h->target_name = (char**)calloc(h0->n_targets, sizeof(char*)); |
185 | 3 | if (!h->target_name) goto fail; |
186 | | |
187 | 3 | int i; |
188 | 3 | for (i = 0; i < h0->n_targets; ++i) { |
189 | 0 | h->target_len[i] = h0->target_len[i]; |
190 | 0 | h->target_name[i] = strdup(h0->target_name[i]); |
191 | 0 | if (!h->target_name[i]) break; |
192 | 0 | } |
193 | 3 | h->n_targets = i; |
194 | 3 | if (i < h0->n_targets) goto fail; |
195 | | |
196 | 3 | if (h0->sdict) { |
197 | 0 | if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; |
198 | 0 | } |
199 | 3 | } |
200 | | |
201 | 3.87k | if (h0->hrecs) { |
202 | 3.87k | kstring_t tmp = { 0, 0, NULL }; |
203 | 3.87k | if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) { |
204 | 0 | free(ks_release(&tmp)); |
205 | 0 | goto fail; |
206 | 0 | } |
207 | | |
208 | 3.87k | h->l_text = tmp.l; |
209 | 3.87k | h->text = ks_release(&tmp); |
210 | | |
211 | 3.87k | if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0) |
212 | 0 | goto fail; |
213 | 3.87k | } else { |
214 | 3 | h->l_text = h0->text ? h0->l_text : 0; |
215 | 3 | h->text = malloc(h->l_text + 1); |
216 | 3 | if (!h->text) goto fail; |
217 | 3 | if (h0->text) |
218 | 3 | memcpy(h->text, h0->text, h->l_text); |
219 | 3 | h->text[h->l_text] = '\0'; |
220 | 3 | } |
221 | | |
222 | 3.87k | return h; |
223 | | |
224 | 0 | fail: |
225 | 0 | sam_hdr_destroy(h); |
226 | 0 | return NULL; |
227 | 3.87k | } |
228 | | |
/*
 * Read a BAM header from a BGZF stream.
 *
 * Reads, in order: the 4-byte magic "BAM\1", the header text length and
 * text, the number of reference sequences, and for each reference its name
 * length, name and sequence length.  A missing BGZF EOF marker only
 * produces a warning.
 *
 * Returns a newly allocated header on success.  On any failure an error is
 * logged, whatever was allocated is released, and NULL is returned.
 */
sam_hdr_t *bam_hdr_read(BGZF *fp)
{
    sam_hdr_t *h;
    uint8_t buf[4];
    int magic_len, has_EOF;
    // num_names counts the target_name entries actually allocated, so the
    // cleanup path frees only those.
    int32_t i, name_len, num_names = 0;
    size_t bufsize;
    ssize_t bytes;
    // check EOF
    has_EOF = bgzf_check_EOF(fp);
    if (has_EOF < 0) {
        perror("[W::bam_hdr_read] bgzf_check_EOF");
    } else if (has_EOF == 0) {
        hts_log_warning("EOF marker is absent. The input is probably truncated");
    }
    // read "BAM1"
    magic_len = bgzf_read(fp, buf, 4);
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
        hts_log_error("Invalid BAM binary header");
        return 0;
    }
    h = sam_hdr_init();
    if (!h) goto nomem;

    // read plain text and the number of reference sequences
    bytes = bgzf_read(fp, buf, 4);
    if (bytes != 4) goto read_err;
    h->l_text = le_to_u32(buf);

    bufsize = h->l_text + 1;
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
#endif
    h->text = (char*)malloc(bufsize);
    if (!h->text) goto nomem;
    h->text[h->l_text] = 0; // make sure it is NULL terminated
    bytes = bgzf_read(fp, h->text, h->l_text);
    if (bytes != h->l_text) goto read_err;

    // n_targets is read as raw bytes, then byte-swapped when fp->is_be is set
    bytes = bgzf_read(fp, &h->n_targets, 4);
    if (bytes != 4) goto read_err;
    if (fp->is_be) ed_swap_4p(&h->n_targets);

    if (h->n_targets < 0) goto invalid;

    // read reference sequence names and lengths
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
        goto nomem;
#endif
    if (h->n_targets > 0) {
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
        if (!h->target_name) goto nomem;
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
        if (!h->target_len) goto nomem;
    }
    else {
        h->target_name = NULL;
        h->target_len = NULL;
    }

    for (i = 0; i != h->n_targets; ++i) {
        bytes = bgzf_read(fp, &name_len, 4);
        if (bytes != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&name_len);
        if (name_len <= 0) goto invalid;

        h->target_name[i] = (char*)malloc(name_len);
        if (!h->target_name[i]) goto nomem;
        num_names++;

        bytes = bgzf_read(fp, h->target_name[i], name_len);
        if (bytes != name_len) goto read_err;

        if (h->target_name[i][name_len - 1] != '\0') {
            /* Fix missing NUL-termination. Is this being too nice?
               We could alternatively bail out with an error. */
            char *new_name;
            // name_len + 1 below would overflow int32_t
            if (name_len == INT32_MAX) goto invalid;
            new_name = realloc(h->target_name[i], name_len + 1);
            if (new_name == NULL) goto nomem;
            h->target_name[i] = new_name;
            h->target_name[i][name_len] = '\0';
        }

        bytes = bgzf_read(fp, &h->target_len[i], 4);
        if (bytes != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
    }
    return h;

 nomem:
    hts_log_error("Out of memory");
    goto clean;

 read_err:
    // A negative count is reported as a stream error, a short count as
    // truncation.
    if (bytes < 0) {
        hts_log_error("Error reading BGZF stream");
    } else {
        hts_log_error("Truncated BAM header");
    }
    goto clean;

 invalid:
    hts_log_error("Invalid BAM binary header");

 clean:
    if (h != NULL) {
        h->n_targets = num_names; // ensure we free only allocated target_names
        sam_hdr_destroy(h);
    }
    return NULL;
}
343 | | |
344 | | int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) |
345 | 837 | { |
346 | 837 | int32_t i, name_len, x; |
347 | 837 | kstring_t hdr_ks = { 0, 0, NULL }; |
348 | 837 | char *text; |
349 | 837 | uint32_t l_text; |
350 | | |
351 | 837 | if (!h) return -1; |
352 | | |
353 | 837 | if (h->hrecs) { |
354 | 836 | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; |
355 | 836 | if (hdr_ks.l > UINT32_MAX) { |
356 | 0 | hts_log_error("Header too long for BAM format"); |
357 | 0 | free(hdr_ks.s); |
358 | 0 | return -1; |
359 | 836 | } else if (hdr_ks.l > INT32_MAX) { |
360 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
361 | 0 | hts_log_warning("Output file may not be portable"); |
362 | 0 | } |
363 | 836 | text = hdr_ks.s; |
364 | 836 | l_text = hdr_ks.l; |
365 | 836 | } else { |
366 | 1 | if (h->l_text > UINT32_MAX) { |
367 | 0 | hts_log_error("Header too long for BAM format"); |
368 | 0 | return -1; |
369 | 1 | } else if (h->l_text > INT32_MAX) { |
370 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
371 | 0 | hts_log_warning("Output file may not be portable"); |
372 | 0 | } |
373 | 1 | text = h->text; |
374 | 1 | l_text = h->l_text; |
375 | 1 | } |
376 | | // write "BAM1" |
377 | 837 | if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; } |
378 | | // write plain text and the number of reference sequences |
379 | 837 | if (fp->is_be) { |
380 | 0 | x = ed_swap_4(l_text); |
381 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
382 | 0 | if (l_text) { |
383 | 0 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
384 | 0 | } |
385 | 0 | x = ed_swap_4(h->n_targets); |
386 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
387 | 837 | } else { |
388 | 837 | if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; } |
389 | 837 | if (l_text) { |
390 | 489 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
391 | 489 | } |
392 | 837 | if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; } |
393 | 837 | } |
394 | 837 | free(hdr_ks.s); |
395 | | // write sequence names and lengths |
396 | 1.37k | for (i = 0; i != h->n_targets; ++i) { |
397 | 539 | char *p = h->target_name[i]; |
398 | 539 | name_len = strlen(p) + 1; |
399 | 539 | if (fp->is_be) { |
400 | 0 | x = ed_swap_4(name_len); |
401 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
402 | 539 | } else { |
403 | 539 | if (bgzf_write(fp, &name_len, 4) < 0) return -1; |
404 | 539 | } |
405 | 539 | if (bgzf_write(fp, p, name_len) < 0) return -1; |
406 | 539 | if (fp->is_be) { |
407 | 0 | x = ed_swap_4(h->target_len[i]); |
408 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
409 | 539 | } else { |
410 | 539 | if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1; |
411 | 539 | } |
412 | 539 | } |
413 | 837 | if (bgzf_flush(fp) < 0) return -1; |
414 | 837 | return 0; |
415 | 837 | } |
416 | | |
417 | | const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, |
418 | 0 | hts_pos_t *beg, hts_pos_t *end, int flags) { |
419 | 0 | return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); |
420 | 0 | } |
421 | | |
422 | | /************************* |
423 | | *** BAM alignment I/O *** |
424 | | *************************/ |
425 | | |
426 | | bam1_t *bam_init1(void) |
427 | 536k | { |
428 | 536k | return (bam1_t*)calloc(1, sizeof(bam1_t)); |
429 | 536k | } |
430 | | |
431 | | int sam_realloc_bam_data(bam1_t *b, size_t desired) |
432 | 557k | { |
433 | 557k | uint32_t new_m_data; |
434 | 557k | uint8_t *new_data; |
435 | 557k | new_m_data = desired; |
436 | 557k | kroundup32(new_m_data); // next power of 2 |
437 | 557k | new_m_data += 32; // reduces malloc arena migrations? |
438 | 557k | if (new_m_data < desired) { |
439 | 0 | errno = ENOMEM; // Not strictly true but we can't store the size |
440 | 0 | return -1; |
441 | 0 | } |
442 | 557k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
443 | 557k | if (new_m_data > FUZZ_ALLOC_LIMIT) { |
444 | 2 | errno = ENOMEM; |
445 | 2 | return -1; |
446 | 2 | } |
447 | 557k | #endif |
448 | 557k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
449 | 557k | new_data = realloc(b->data, new_m_data); |
450 | 557k | } else { |
451 | 0 | if ((new_data = malloc(new_m_data)) != NULL) { |
452 | 0 | if (b->l_data > 0) |
453 | 0 | memcpy(new_data, b->data, |
454 | 0 | b->l_data < b->m_data ? b->l_data : b->m_data); |
455 | 0 | bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); |
456 | 0 | } |
457 | 0 | } |
458 | 557k | if (!new_data) return -1; |
459 | 557k | b->data = new_data; |
460 | 557k | b->m_data = new_m_data; |
461 | 557k | return 0; |
462 | 557k | } |
463 | | |
464 | | void bam_destroy1(bam1_t *b) |
465 | 5.35M | { |
466 | 5.35M | if (b == 0) return; |
467 | 536k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
468 | 536k | free(b->data); |
469 | 536k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) { |
470 | | // In case of reuse |
471 | 0 | b->data = NULL; |
472 | 0 | b->m_data = 0; |
473 | 0 | b->l_data = 0; |
474 | 0 | } |
475 | 536k | } |
476 | | |
477 | 536k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0) |
478 | 536k | free(b); |
479 | 536k | } |
480 | | |
481 | | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) |
482 | 3.79M | { |
483 | 3.79M | if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL; |
484 | 3.79M | memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data |
485 | 3.79M | memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest |
486 | 3.79M | bdst->l_data = bsrc->l_data; |
487 | 3.79M | bdst->id = bsrc->id; |
488 | 3.79M | return bdst; |
489 | 3.79M | } |
490 | | |
491 | | bam1_t *bam_dup1(const bam1_t *bsrc) |
492 | 533k | { |
493 | 533k | if (bsrc == NULL) return NULL; |
494 | 533k | bam1_t *bdst = bam_init1(); |
495 | 533k | if (bdst == NULL) return NULL; |
496 | 533k | if (bam_copy1(bdst, bsrc) == NULL) { |
497 | 0 | bam_destroy1(bdst); |
498 | 0 | return NULL; |
499 | 0 | } |
500 | 533k | return bdst; |
501 | 533k | } |
502 | | |
503 | | static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, |
504 | | hts_pos_t *rlen, hts_pos_t *qlen) |
505 | 189 | { |
506 | 189 | int k; |
507 | 189 | *rlen = *qlen = 0; |
508 | 627 | for (k = 0; k < n_cigar; ++k) { |
509 | 438 | int type = bam_cigar_type(bam_cigar_op(cigar[k])); |
510 | 438 | int len = bam_cigar_oplen(cigar[k]); |
511 | 438 | if (type & 1) *qlen += len; |
512 | 438 | if (type & 2) *rlen += len; |
513 | 438 | } |
514 | 189 | } |
515 | | |
/*
 * Subtract `length` from `*limit` if it fits.
 * Returns 0 after subtracting, or -1 (leaving *limit untouched) when
 * `length` exceeds the remaining limit.
 */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
525 | | |
526 | | int bam_set1(bam1_t *bam, |
527 | | size_t l_qname, const char *qname, |
528 | | uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, |
529 | | size_t n_cigar, const uint32_t *cigar, |
530 | | int32_t mtid, hts_pos_t mpos, hts_pos_t isize, |
531 | | size_t l_seq, const char *seq, const char *qual, |
532 | | size_t l_aux) |
533 | 11.3M | { |
534 | | // use a default qname "*" if none is provided |
535 | 11.3M | if (l_qname == 0) { |
536 | 10.1M | l_qname = 1; |
537 | 10.1M | qname = "*"; |
538 | 10.1M | } |
539 | | |
540 | | // note: the qname is stored nul terminated and padded as described in the |
541 | | // documentation for the bam1_t struct. |
542 | 11.3M | size_t qname_nuls = 4 - l_qname % 4; |
543 | | |
544 | | // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). |
545 | | // can't use bam_endpos() directly as some fields not yet set up. |
546 | 11.3M | hts_pos_t rlen = 0, qlen = 0; |
547 | 11.3M | if (!(flag & BAM_FUNMAP)) { |
548 | 0 | bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); |
549 | 0 | } |
550 | 11.3M | if (rlen == 0) { |
551 | 11.3M | rlen = 1; |
552 | 11.3M | } |
553 | | |
554 | | // validate parameters |
555 | 11.3M | if (l_qname > 254) { |
556 | 48 | hts_log_error("Query name too long"); |
557 | 48 | errno = EINVAL; |
558 | 48 | return -1; |
559 | 48 | } |
560 | 11.3M | if (HTS_POS_MAX - rlen <= pos) { |
561 | 0 | hts_log_error("Read ends beyond highest supported position"); |
562 | 0 | errno = EINVAL; |
563 | 0 | return -1; |
564 | 0 | } |
565 | 11.3M | if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { |
566 | 0 | hts_log_error("Mapped query must have a CIGAR"); |
567 | 0 | errno = EINVAL; |
568 | 0 | return -1; |
569 | 0 | } |
570 | 11.3M | if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { |
571 | 0 | hts_log_error("CIGAR and query sequence are of different length"); |
572 | 0 | errno = EINVAL; |
573 | 0 | return -1; |
574 | 0 | } |
575 | | |
576 | 11.3M | size_t limit = INT32_MAX; |
577 | 11.3M | int u = subtract_check_underflow(l_qname + qname_nuls, &limit); |
578 | 11.3M | u += subtract_check_underflow(n_cigar * 4, &limit); |
579 | 11.3M | u += subtract_check_underflow((l_seq + 1) / 2, &limit); |
580 | 11.3M | u += subtract_check_underflow(l_seq, &limit); |
581 | 11.3M | u += subtract_check_underflow(l_aux, &limit); |
582 | 11.3M | if (u != 0) { |
583 | 0 | hts_log_error("Size overflow"); |
584 | 0 | errno = EINVAL; |
585 | 0 | return -1; |
586 | 0 | } |
587 | | |
588 | | // re-allocate the data buffer as needed. |
589 | 11.3M | size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; |
590 | 11.3M | if (realloc_bam_data(bam, data_len + l_aux) < 0) { |
591 | 0 | return -1; |
592 | 0 | } |
593 | | |
594 | 11.3M | bam->l_data = (int)data_len; |
595 | 11.3M | bam->core.pos = pos; |
596 | 11.3M | bam->core.tid = tid; |
597 | 11.3M | bam->core.bin = bam_reg2bin(pos, pos + rlen); |
598 | 11.3M | bam->core.qual = mapq; |
599 | 11.3M | bam->core.l_extranul = (uint8_t)(qname_nuls - 1); |
600 | 11.3M | bam->core.flag = flag; |
601 | 11.3M | bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); |
602 | 11.3M | bam->core.n_cigar = (uint32_t)n_cigar; |
603 | 11.3M | bam->core.l_qseq = (int32_t)l_seq; |
604 | 11.3M | bam->core.mtid = mtid; |
605 | 11.3M | bam->core.mpos = mpos; |
606 | 11.3M | bam->core.isize = isize; |
607 | | |
608 | 11.3M | uint8_t *cp = bam->data; |
609 | 11.3M | strncpy((char *)cp, qname, l_qname); |
610 | 11.3M | int i; |
611 | 44.8M | for (i = 0; i < qname_nuls; i++) { |
612 | 33.5M | cp[l_qname + i] = '\0'; |
613 | 33.5M | } |
614 | 11.3M | cp += l_qname + qname_nuls; |
615 | | |
616 | 11.3M | if (n_cigar > 0) { |
617 | 0 | memcpy(cp, cigar, n_cigar * 4); |
618 | 0 | } |
619 | 11.3M | cp += n_cigar * 4; |
620 | | |
621 | 455M | #define NN 16 |
622 | 11.3M | const uint8_t *useq = (uint8_t *)seq; |
623 | 48.3M | for (i = 0; i + NN < l_seq; i += NN) { |
624 | 37.0M | int j; |
625 | 37.0M | const uint8_t *u2 = useq+i; |
626 | 333M | for (j = 0; j < NN/2; j++) |
627 | 296M | cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]]; |
628 | 37.0M | cp += NN/2; |
629 | 37.0M | } |
630 | 12.5M | for (; i + 1 < l_seq; i += 2) { |
631 | 1.20M | *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]]; |
632 | 1.20M | } |
633 | | |
634 | 11.4M | for (; i < l_seq; i++) { |
635 | 128k | *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; |
636 | 128k | } |
637 | | |
638 | 11.3M | if (qual) { |
639 | 486 | memcpy(cp, qual, l_seq); |
640 | 486 | } |
641 | 11.3M | else { |
642 | 11.3M | memset(cp, '\xff', l_seq); |
643 | 11.3M | } |
644 | | |
645 | 11.3M | return (int)data_len; |
646 | 11.3M | } |
647 | | |
648 | | hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) |
649 | 3.74M | { |
650 | 3.74M | int k; |
651 | 3.74M | hts_pos_t l; |
652 | 4.48M | for (k = l = 0; k < n_cigar; ++k) |
653 | 734k | if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) |
654 | 649k | l += bam_cigar_oplen(cigar[k]); |
655 | 3.74M | return l; |
656 | 3.74M | } |
657 | | |
658 | | hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) |
659 | 66.5k | { |
660 | 66.5k | int k; |
661 | 66.5k | hts_pos_t l; |
662 | 2.61M | for (k = l = 0; k < n_cigar; ++k) |
663 | 2.54M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) |
664 | 2.34M | l += bam_cigar_oplen(cigar[k]); |
665 | 66.5k | return l; |
666 | 66.5k | } |
667 | | |
668 | | hts_pos_t bam_endpos(const bam1_t *b) |
669 | 390 | { |
670 | 390 | hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
671 | 390 | if (rlen == 0) rlen = 1; |
672 | 390 | return b->core.pos + rlen; |
673 | 390 | } |
674 | | |
675 | | static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG |
676 | 91.8k | { |
677 | 91.8k | bam1_core_t *c = &b->core; |
678 | | |
679 | | // Bail out as fast as possible for the easy case |
680 | 91.8k | uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT); |
681 | 91.8k | if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b)) |
682 | 69.2k | return 0; |
683 | | |
684 | | // The above isn't fool proof - we may have old CIGAR tags that aren't used, |
685 | | // but this is much less likely so do as a secondary check. |
686 | 22.5k | if (c->tid < 0 || c->pos < 0) |
687 | 10.9k | return 0; |
688 | | |
689 | | // Do we have a CG tag? |
690 | 11.5k | uint8_t *CG = bam_aux_get(b, "CG"); |
691 | 11.5k | int saved_errno = errno; |
692 | 11.5k | if (!CG) { |
693 | 11.0k | if (errno != ENOENT) return -1; // Bad aux data |
694 | 11.0k | errno = saved_errno; // restore errno on expected no-CG-tag case |
695 | 11.0k | return 0; |
696 | 11.0k | } |
697 | | |
698 | | // Now we start with the serious work migrating CG to CIGAR |
699 | 512 | uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, |
700 | 512 | *cigar0, CG_len, fake_bytes; |
701 | 512 | cigar0 = bam_get_cigar(b); |
702 | 512 | fake_bytes = c->n_cigar * 4; |
703 | 512 | if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) |
704 | 122 | return 0; // not of type B,I |
705 | 390 | CG_len = le_to_u32(CG + 2); |
706 | | // don't move if the real CIGAR length is shorter than the fake cigar length |
707 | 390 | if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; |
708 | | |
709 | | // move from the CG tag to the right position |
710 | 390 | cigar_st = (uint8_t*)cigar0 - b->data; |
711 | 390 | c->n_cigar = CG_len; |
712 | 390 | n_cigar4 = c->n_cigar * 4; |
713 | 390 | CG_st = CG - b->data - 2; |
714 | 390 | CG_en = CG_st + 8 + n_cigar4; |
715 | 390 | if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1; |
716 | | // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place |
717 | 390 | b->l_data = b->l_data - fake_bytes + n_cigar4; |
718 | | // insert c->n_cigar-fake_bytes empty space to make room |
719 | 390 | memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); |
720 | | // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR |
721 | 390 | memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); |
722 | 390 | if (ori_len > CG_en) // move data after the CG tag |
723 | 43 | memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); |
724 | 390 | b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) |
725 | 390 | if (recal_bin) |
726 | 390 | b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5); |
727 | 390 | if (give_warning) |
728 | 390 | hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); |
729 | 390 | return 1; |
730 | 390 | } |
731 | | |
/*
 * Map an aux type code to a size.
 *
 * Returns 1 for 'A', 'c', 'C'; 2 for 's', 'S'; 4 for 'i', 'I', 'f'; and 8
 * for 'd'.  For 'Z', 'H' and 'B' the type code itself is returned
 * (presumably because those types have no fixed size - confirm against the
 * callers).  Any other code returns 0.
 */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;

    if (type == 'A' || type == 'c' || type == 'C')
        return 1;

    if (type == 's' || type == 'S')
        return 2;

    if (type == 'i' || type == 'I' || type == 'f')
        return 4;

    if (type == 'd')
        return 8;

    return 0;
}
749 | | |
750 | | static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) |
751 | 0 | { |
752 | 0 | uint32_t *cigar = (uint32_t*)(data + c->l_qname); |
753 | 0 | uint32_t i; |
754 | 0 | for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); |
755 | 0 | } |
756 | | |
757 | | // Fix bad records where qname is not terminated correctly. |
758 | 97 | static int fixup_missing_qname_nul(bam1_t *b) { |
759 | 97 | bam1_core_t *c = &b->core; |
760 | | |
761 | | // Note this is called before c->l_extranul is added to c->l_qname |
762 | 97 | if (c->l_extranul > 0) { |
763 | 97 | b->data[c->l_qname++] = '\0'; |
764 | 97 | c->l_extranul--; |
765 | 97 | } else { |
766 | 0 | if (b->l_data > INT_MAX - 4) return -1; |
767 | 0 | if (realloc_bam_data(b, b->l_data + 4) < 0) return -1; |
768 | 0 | b->l_data += 4; |
769 | 0 | b->data[c->l_qname++] = '\0'; |
770 | 0 | c->l_extranul = 3; |
771 | 0 | } |
772 | 97 | return 0; |
773 | 97 | } |
774 | | |
/*
 * Read one alignment record from a BGZF stream into b.
 *
 * Returns 4 + block_len (the number of bytes the record occupies in the
 * stream) on success.  Negative return values:
 *   -1  no bytes could be read for the block length (normal end-of-file)
 *   -2  the block length itself was truncated
 *   -3  the 32 bytes of core data could not be read
 *   -4  any other failure: invalid sizes, allocation failure, truncated
 *       variable-length data, bad aux data, or a CIGAR that disagrees with
 *       the sequence length
 *
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
 * in multi-threaded handling. This may be worth considering for htslib2.
 */
int bam_read1(BGZF *fp, bam1_t *b)
{
    bam1_core_t *c = &b->core;
    int32_t block_len, ret, i;
    uint32_t new_l_data;
    uint8_t tmp[32], *x;

    b->l_data = 0;

    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
        if (ret == 0) return -1; // normal end-of-file
        else return -2; // truncated
    }
    if (fp->is_be)
        ed_swap_4p(&block_len);
    if (block_len < 32) return -4; // block_len includes core data
    if (fp->block_length - fp->block_offset > 32) {
        // Avoid bgzf_read and a temporary copy to a local buffer
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
        fp->block_offset += 32;
    } else {
        x = tmp;
        if (bgzf_read(fp, x, 32) != 32) return -3;
    }

    // Decode the 32 bytes of fixed-size core fields
    c->tid = le_to_u32(x);
    c->pos = le_to_i32(x+4);
    // x2 packs bin (top 16 bits), qual (next 8) and l_qname (low 8)
    uint32_t x2 = le_to_u32(x+8);
    c->bin = x2>>16;
    c->qual = x2>>8&0xff;
    c->l_qname = x2&0xff;
    // padding needed to bring l_qname up to a multiple of 4
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
    // x3 packs flag (top 16 bits) and n_cigar (low 16)
    uint32_t x3 = le_to_u32(x+12);
    c->flag = x3>>16;
    c->n_cigar = x3&0xffff;
    c->l_qseq = le_to_u32(x+16);
    c->mtid = le_to_u32(x+20);
    c->mpos = le_to_i32(x+24);
    c->isize = le_to_i32(x+28);

    // In-memory size: the on-disk variable-length data plus qname padding
    new_l_data = block_len - 32 + c->l_extranul;
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
    // The qname, CIGAR, packed sequence and qualities must all fit in the
    // record; computed in 64 bits so the sum cannot wrap.
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
        return -4;
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
    b->l_data = new_l_data;

    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
        if (fixup_missing_qname_nul(b) < 0) return -4;
    }
    // NUL-fill the padding, then count it as part of the stored qname
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
    c->l_qname += c->l_extranul;
    // Read everything that follows the qname in one go
    if (b->l_data < c->l_qname ||
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
        return -4;
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    // Replace a placeholder CIGAR with the real one from a CG tag, if any
    if (bam_tag2cigar(b, 0, 0) < 0)
        return -4;

    // TODO: consider making this conditional
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
        hts_pos_t rlen, qlen;
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
        // Sanity check for broken CIGAR alignments
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
            hts_log_error("CIGAR and query sequence lengths differ for %s",
                          bam_get_qname(b));
            return -4;
        }
    }

    return 4 + block_len;
}
856 | | |
857 | | int bam_write1(BGZF *fp, const bam1_t *b) |
858 | 3.79M | { |
859 | 3.79M | const bam1_core_t *c = &b->core; |
860 | 3.79M | uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; |
861 | 3.79M | int i, ok; |
862 | 3.79M | if (c->l_qname - c->l_extranul > 255) { |
863 | 0 | hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b)); |
864 | 0 | errno = EOVERFLOW; |
865 | 0 | return -1; |
866 | 0 | } |
867 | 3.79M | if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR |
868 | 3.79M | if (c->pos > INT_MAX || |
869 | 3.79M | c->mpos > INT_MAX || |
870 | 3.79M | c->isize < INT_MIN || c->isize > INT_MAX) { |
871 | 28 | hts_log_error("Positional data is too large for BAM format"); |
872 | 28 | return -1; |
873 | 28 | } |
874 | 3.79M | x[0] = c->tid; |
875 | 3.79M | x[1] = c->pos; |
876 | 3.79M | x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); |
877 | 3.79M | if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; |
878 | 3.79M | else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); |
879 | 3.79M | x[4] = c->l_qseq; |
880 | 3.79M | x[5] = c->mtid; |
881 | 3.79M | x[6] = c->mpos; |
882 | 3.79M | x[7] = c->isize; |
883 | 3.79M | ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); |
884 | 3.79M | if (fp->is_be) { |
885 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
886 | 0 | y = block_len; |
887 | 0 | if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0); |
888 | 0 | swap_data(c, b->l_data, b->data, 1); |
889 | 3.79M | } else { |
890 | 3.79M | if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0); |
891 | 3.79M | } |
892 | 3.79M | if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0); |
893 | 3.79M | if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0); |
894 | 3.79M | if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally |
895 | 3.79M | if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); |
896 | 3.79M | } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag |
897 | 6 | uint8_t buf[8]; |
898 | 6 | uint32_t cigar_st, cigar_en, cigar[2]; |
899 | 6 | hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); |
900 | 6 | if (cigreflen >= (1<<28)) { |
901 | | // Length of reference covered is greater than the biggest |
902 | | // CIGAR operation currently allowed. |
903 | 0 | hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos |
904 | 0 | " cannot be written in BAM. Try writing SAM or CRAM instead.\n", |
905 | 0 | bam_get_qname(b), c->n_cigar, cigreflen); |
906 | 0 | return -1; |
907 | 0 | } |
908 | 6 | cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; |
909 | 6 | cigar_en = cigar_st + c->n_cigar * 4; |
910 | 6 | cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; |
911 | 6 | cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; |
912 | 6 | u32_to_le(cigar[0], buf); |
913 | 6 | u32_to_le(cigar[1], buf + 4); |
914 | 6 | if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N |
915 | 6 | if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR |
916 | 6 | if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I |
917 | 6 | u32_to_le(c->n_cigar, buf); |
918 | 6 | if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length |
919 | 6 | if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR |
920 | 6 | } |
921 | 3.79M | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
922 | 3.79M | return ok? 4 + block_len : -1; |
923 | 3.79M | } |
924 | | |
925 | | /* |
926 | | * Write a BAM file and append to the in-memory index simultaneously. |
927 | | */ |
928 | 3.79M | static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { |
929 | 3.79M | BGZF *bfp = fp->fp.bgzf; |
930 | | |
931 | 3.79M | if (!fp->idx) |
932 | 3.79M | return bam_write1(bfp, b); |
933 | | |
934 | 0 | uint32_t block_len = b->l_data - b->core.l_extranul + 32; |
935 | 0 | if (bgzf_flush_try(bfp, 4 + block_len) < 0) |
936 | 0 | return -1; |
937 | 0 | if (!bfp->mt) |
938 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); |
939 | |
|
940 | 0 | int ret = bam_write1(bfp, b); |
941 | 0 | if (ret < 0) |
942 | 0 | return -1; |
943 | | |
944 | 0 | if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) { |
945 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
946 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
947 | 0 | ret = -1; |
948 | 0 | } |
949 | |
|
950 | 0 | return ret; |
951 | 0 | } |
952 | | |
953 | | /* |
954 | | * Set the qname in a BAM record |
955 | | */ |
956 | | int bam_set_qname(bam1_t *rec, const char *qname) |
957 | 0 | { |
958 | 0 | if (!rec) return -1; |
959 | 0 | if (!qname || !*qname) return -1; |
960 | | |
961 | 0 | size_t old_len = rec->core.l_qname; |
962 | 0 | size_t new_len = strlen(qname) + 1; |
963 | 0 | if (new_len < 1 || new_len > 255) return -1; |
964 | | |
965 | 0 | int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0; |
966 | |
|
967 | 0 | size_t new_data_len = rec->l_data - old_len + new_len + extranul; |
968 | 0 | if (realloc_bam_data(rec, new_data_len) < 0) return -1; |
969 | | |
970 | | // Make room |
971 | 0 | if (new_len + extranul != rec->core.l_qname) |
972 | 0 | memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname); |
973 | | // Copy in new name and pad if needed |
974 | 0 | memcpy(rec->data, qname, new_len); |
975 | 0 | int n; |
976 | 0 | for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0'; |
977 | |
|
978 | 0 | rec->l_data = new_data_len; |
979 | 0 | rec->core.l_qname = new_len + extranul; |
980 | 0 | rec->core.l_extranul = extranul; |
981 | |
|
982 | 0 | return 0; |
983 | 0 | } |
984 | | |
985 | | /******************** |
986 | | *** BAM indexing *** |
987 | | ********************/ |
988 | | |
989 | | static hts_idx_t *sam_index(htsFile *fp, int min_shift) |
990 | 0 | { |
991 | 0 | int n_lvls, i, fmt, ret; |
992 | 0 | bam1_t *b; |
993 | 0 | hts_idx_t *idx; |
994 | 0 | sam_hdr_t *h; |
995 | 0 | h = sam_hdr_read(fp); |
996 | 0 | if (h == NULL) return NULL; |
997 | 0 | if (min_shift > 0) { |
998 | 0 | hts_pos_t max_len = 0, s; |
999 | 0 | for (i = 0; i < h->n_targets; ++i) { |
1000 | 0 | hts_pos_t len = sam_hdr_tid2len(h, i); |
1001 | 0 | if (max_len < len) max_len = len; |
1002 | 0 | } |
1003 | 0 | max_len += 256; |
1004 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
1005 | 0 | fmt = HTS_FMT_CSI; |
1006 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1007 | 0 | idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1008 | 0 | b = bam_init1(); |
1009 | 0 | while ((ret = sam_read1(fp, h, b)) >= 0) { |
1010 | 0 | ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)); |
1011 | 0 | if (ret < 0) { // unsorted or doesn't fit |
1012 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
1013 | 0 | goto err; |
1014 | 0 | } |
1015 | 0 | } |
1016 | 0 | if (ret < -1) goto err; // corrupted BAM file |
1017 | | |
1018 | 0 | hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); |
1019 | 0 | sam_hdr_destroy(h); |
1020 | 0 | bam_destroy1(b); |
1021 | 0 | return idx; |
1022 | | |
1023 | 0 | err: |
1024 | 0 | bam_destroy1(b); |
1025 | 0 | hts_idx_destroy(idx); |
1026 | 0 | return NULL; |
1027 | 0 | } |
1028 | | |
1029 | | int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) |
1030 | 0 | { |
1031 | 0 | hts_idx_t *idx; |
1032 | 0 | htsFile *fp; |
1033 | 0 | int ret = 0; |
1034 | |
|
1035 | 0 | if ((fp = hts_open(fn, "r")) == 0) return -2; |
1036 | 0 | if (nthreads) |
1037 | 0 | hts_set_threads(fp, nthreads); |
1038 | |
|
1039 | 0 | switch (fp->format.format) { |
1040 | 0 | case cram: |
1041 | |
|
1042 | 0 | ret = cram_index_build(fp->fp.cram, fn, fnidx); |
1043 | 0 | break; |
1044 | | |
1045 | 0 | case bam: |
1046 | 0 | case sam: |
1047 | 0 | if (fp->format.compression != bgzf) { |
1048 | 0 | hts_log_error("%s file \"%s\" not BGZF compressed", |
1049 | 0 | fp->format.format == bam ? "BAM" : "SAM", fn); |
1050 | 0 | ret = -1; |
1051 | 0 | break; |
1052 | 0 | } |
1053 | 0 | idx = sam_index(fp, min_shift); |
1054 | 0 | if (idx) { |
1055 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); |
1056 | 0 | if (ret < 0) ret = -4; |
1057 | 0 | hts_idx_destroy(idx); |
1058 | 0 | } |
1059 | 0 | else ret = -1; |
1060 | 0 | break; |
1061 | | |
1062 | 0 | default: |
1063 | 0 | ret = -3; |
1064 | 0 | break; |
1065 | 0 | } |
1066 | 0 | hts_close(fp); |
1067 | |
|
1068 | 0 | return ret; |
1069 | 0 | } |
1070 | | |
// As sam_index_build3(), with nthreads fixed at 0 so that no thread
// count is set on the file.
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int no_threads = 0;

    return sam_index_build3(fn, fnidx, min_shift, no_threads);
}
1075 | | |
// As sam_index_build3(), passing NULL for the index file name and 0 for
// the thread count.
int sam_index_build(const char *fn, int min_shift)
{
    const char *no_fnidx = NULL;
    const int no_threads = 0;

    return sam_index_build3(fn, no_fnidx, min_shift, no_threads);
}
1080 | | |
// Keep a real bam_index_build() symbol so binaries linked against earlier
// HTSlib releases still resolve it; the #undef removes any macro of the
// same name first.  Forwards to sam_index_build2() with a NULL index name.
#undef bam_index_build
int bam_index_build(const char *fn, int min_shift)
{
    const char *no_fnidx = NULL;

    return sam_index_build2(fn, no_fnidx, min_shift);
}
1087 | | |
1088 | | // Initialise fp->idx for the current format type. |
1089 | | // This must be called after the header has been written but no other data. |
1090 | 0 | int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) { |
1091 | 0 | fp->fnidx = fnidx; |
1092 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1093 | 0 | (fp->format.format == sam && fp->format.compression == bgzf)) { |
1094 | 0 | int n_lvls, fmt = HTS_FMT_CSI; |
1095 | 0 | if (min_shift > 0) { |
1096 | 0 | int64_t max_len = 0, s; |
1097 | 0 | int i; |
1098 | 0 | for (i = 0; i < h->n_targets; ++i) |
1099 | 0 | if (max_len < h->target_len[i]) max_len = h->target_len[i]; |
1100 | 0 | max_len += 256; |
1101 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
1102 | |
|
1103 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1104 | |
|
1105 | 0 | fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1106 | 0 | return fp->idx ? 0 : -1; |
1107 | 0 | } |
1108 | | |
1109 | 0 | if (fp->format.format == cram) { |
1110 | 0 | fp->fp.cram->idxfp = bgzf_open(fnidx, "wg"); |
1111 | 0 | return fp->fp.cram->idxfp ? 0 : -1; |
1112 | 0 | } |
1113 | | |
1114 | 0 | return -1; |
1115 | 0 | } |
1116 | | |
1117 | | // Finishes an index. Call after the last record has been written. |
1118 | | // Returns 0 on success, <0 on failure. |
1119 | 0 | int sam_idx_save(htsFile *fp) { |
1120 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1121 | 0 | fp->format.format == vcf || fp->format.format == sam) { |
1122 | 0 | int ret; |
1123 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
1124 | 0 | errno = -ret; |
1125 | 0 | return -1; |
1126 | 0 | } |
1127 | 0 | if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) |
1128 | 0 | return -1; |
1129 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); |
1130 | |
|
1131 | 0 | if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0) |
1132 | 0 | return -1; |
1133 | | |
1134 | 0 | return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx)); |
1135 | |
|
1136 | 0 | } else if (fp->format.format == cram) { |
1137 | | // flushed and closed by cram_close |
1138 | 0 | } |
1139 | | |
1140 | 0 | return 0; |
1141 | 0 | } |
1142 | | |
1143 | | static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1144 | 0 | { |
1145 | 0 | htsFile *fp = (htsFile *)fpv; |
1146 | 0 | bam1_t *b = bv; |
1147 | 0 | fp->line.l = 0; |
1148 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1149 | 0 | if (ret >= 0) { |
1150 | 0 | *tid = b->core.tid; |
1151 | 0 | *beg = b->core.pos; |
1152 | 0 | *end = bam_endpos(b); |
1153 | 0 | } |
1154 | 0 | return ret; |
1155 | 0 | } |
1156 | | |
1157 | | // This is used only with read_rest=1 iterators, so need not set tid/beg/end. |
1158 | | static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1159 | 0 | { |
1160 | 0 | htsFile *fp = (htsFile *)fpv; |
1161 | 0 | bam1_t *b = bv; |
1162 | 0 | fp->line.l = 0; |
1163 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1164 | 0 | return ret; |
1165 | 0 | } |
1166 | | |
1167 | | // Internal (for now) func used by bam_sym_lookup. This is copied from |
1168 | | // samtools/bam.c. |
1169 | | static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) |
1170 | 0 | { |
1171 | 0 | const char *rg; |
1172 | 0 | kstring_t lib = { 0, 0, NULL }; |
1173 | 0 | rg = (char *)bam_aux_get(b, "RG"); |
1174 | |
|
1175 | 0 | if (!rg) |
1176 | 0 | return NULL; |
1177 | 0 | else |
1178 | 0 | rg++; |
1179 | | |
1180 | 0 | if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) |
1181 | 0 | return NULL; |
1182 | | |
1183 | 0 | static char LB_text[1024]; |
1184 | 0 | int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; |
1185 | |
|
1186 | 0 | memcpy(LB_text, lib.s, len); |
1187 | 0 | LB_text[len] = 0; |
1188 | |
|
1189 | 0 | free(lib.s); |
1190 | |
|
1191 | 0 | return LB_text; |
1192 | 0 | } |
1193 | | |
1194 | | |
// Bam record pointer and SAM header combined.
// sam_passes_filter() builds one of these and passes its address as the
// opaque "data" argument that bam_sym_lookup() casts back to hb_pair.
// The member order matters: it is initialised positionally as {h, b}.
typedef struct {
    const sam_hdr_t *h;   // header, used for reference-name and library lookups
    const bam1_t *b;      // record whose fields and aux tags are looked up
} hb_pair;
1200 | | |
1201 | | // Looks up variable names in str and replaces them with their value. |
1202 | | // Also supports aux tags. |
1203 | | // |
1204 | | // Note the expression parser deliberately overallocates str size so it |
1205 | | // is safe to use memcmp over strcmp. |
1206 | | static int bam_sym_lookup(void *data, char *str, char **end, |
1207 | 0 | hts_expr_val_t *res) { |
1208 | 0 | hb_pair *hb = (hb_pair *)data; |
1209 | 0 | const bam1_t *b = hb->b; |
1210 | |
|
1211 | 0 | res->is_str = 0; |
1212 | 0 | switch(*str) { |
1213 | 0 | case 'c': |
1214 | 0 | if (memcmp(str, "cigar", 5) == 0) { |
1215 | 0 | *end = str+5; |
1216 | 0 | res->is_str = 1; |
1217 | 0 | ks_clear(&res->s); |
1218 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1219 | 0 | int i, n = b->core.n_cigar, r = 0; |
1220 | 0 | if (n) { |
1221 | 0 | for (i = 0; i < n; i++) { |
1222 | 0 | r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; |
1223 | 0 | r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; |
1224 | 0 | } |
1225 | 0 | r |= kputs("", &res->s) < 0; |
1226 | 0 | } else { |
1227 | 0 | r |= kputs("*", &res->s) < 0; |
1228 | 0 | } |
1229 | 0 | return r ? -1 : 0; |
1230 | 0 | } |
1231 | 0 | break; |
1232 | | |
1233 | 0 | case 'e': |
1234 | 0 | if (memcmp(str, "endpos", 6) == 0) { |
1235 | 0 | *end = str+6; |
1236 | 0 | res->d = bam_endpos(b); |
1237 | 0 | return 0; |
1238 | 0 | } |
1239 | 0 | break; |
1240 | | |
1241 | 0 | case 'f': |
1242 | 0 | if (memcmp(str, "flag", 4) == 0) { |
1243 | 0 | str = *end = str+4; |
1244 | 0 | if (*str != '.') { |
1245 | 0 | res->d = b->core.flag; |
1246 | 0 | return 0; |
1247 | 0 | } else { |
1248 | 0 | str++; |
1249 | 0 | if (!memcmp(str, "paired", 6)) { |
1250 | 0 | *end = str+6; |
1251 | 0 | res->d = b->core.flag & BAM_FPAIRED; |
1252 | 0 | return 0; |
1253 | 0 | } else if (!memcmp(str, "proper_pair", 11)) { |
1254 | 0 | *end = str+11; |
1255 | 0 | res->d = b->core.flag & BAM_FPROPER_PAIR; |
1256 | 0 | return 0; |
1257 | 0 | } else if (!memcmp(str, "unmap", 5)) { |
1258 | 0 | *end = str+5; |
1259 | 0 | res->d = b->core.flag & BAM_FUNMAP; |
1260 | 0 | return 0; |
1261 | 0 | } else if (!memcmp(str, "munmap", 6)) { |
1262 | 0 | *end = str+6; |
1263 | 0 | res->d = b->core.flag & BAM_FMUNMAP; |
1264 | 0 | return 0; |
1265 | 0 | } else if (!memcmp(str, "reverse", 7)) { |
1266 | 0 | *end = str+7; |
1267 | 0 | res->d = b->core.flag & BAM_FREVERSE; |
1268 | 0 | return 0; |
1269 | 0 | } else if (!memcmp(str, "mreverse", 8)) { |
1270 | 0 | *end = str+8; |
1271 | 0 | res->d = b->core.flag & BAM_FMREVERSE; |
1272 | 0 | return 0; |
1273 | 0 | } else if (!memcmp(str, "read1", 5)) { |
1274 | 0 | *end = str+5; |
1275 | 0 | res->d = b->core.flag & BAM_FREAD1; |
1276 | 0 | return 0; |
1277 | 0 | } else if (!memcmp(str, "read2", 5)) { |
1278 | 0 | *end = str+5; |
1279 | 0 | res->d = b->core.flag & BAM_FREAD2; |
1280 | 0 | return 0; |
1281 | 0 | } else if (!memcmp(str, "secondary", 9)) { |
1282 | 0 | *end = str+9; |
1283 | 0 | res->d = b->core.flag & BAM_FSECONDARY; |
1284 | 0 | return 0; |
1285 | 0 | } else if (!memcmp(str, "qcfail", 6)) { |
1286 | 0 | *end = str+6; |
1287 | 0 | res->d = b->core.flag & BAM_FQCFAIL; |
1288 | 0 | return 0; |
1289 | 0 | } else if (!memcmp(str, "dup", 3)) { |
1290 | 0 | *end = str+3; |
1291 | 0 | res->d = b->core.flag & BAM_FDUP; |
1292 | 0 | return 0; |
1293 | 0 | } else if (!memcmp(str, "supplementary", 13)) { |
1294 | 0 | *end = str+13; |
1295 | 0 | res->d = b->core.flag & BAM_FSUPPLEMENTARY; |
1296 | 0 | return 0; |
1297 | 0 | } else { |
1298 | 0 | hts_log_error("Unrecognised flag string"); |
1299 | 0 | return -1; |
1300 | 0 | } |
1301 | 0 | } |
1302 | 0 | } |
1303 | 0 | break; |
1304 | | |
1305 | 0 | case 'h': |
1306 | 0 | if (memcmp(str, "hclen", 5) == 0) { |
1307 | 0 | int hclen = 0; |
1308 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1309 | 0 | uint32_t ncigar = b->core.n_cigar; |
1310 | | |
1311 | | // left |
1312 | 0 | if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) |
1313 | 0 | hclen = bam_cigar_oplen(cigar[0]); |
1314 | | |
1315 | | // right |
1316 | 0 | if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP) |
1317 | 0 | hclen += bam_cigar_oplen(cigar[ncigar-1]); |
1318 | |
|
1319 | 0 | *end = str+5; |
1320 | 0 | res->d = hclen; |
1321 | 0 | return 0; |
1322 | 0 | } |
1323 | 0 | break; |
1324 | | |
1325 | 0 | case 'l': |
1326 | 0 | if (memcmp(str, "library", 7) == 0) { |
1327 | 0 | *end = str+7; |
1328 | 0 | res->is_str = 1; |
1329 | 0 | const char *lib = bam_get_library(hb->h, b); |
1330 | 0 | kputs(lib ? lib : "", ks_clear(&res->s)); |
1331 | 0 | return 0; |
1332 | 0 | } |
1333 | 0 | break; |
1334 | | |
1335 | 0 | case 'm': |
1336 | 0 | if (memcmp(str, "mapq", 4) == 0) { |
1337 | 0 | *end = str+4; |
1338 | 0 | res->d = b->core.qual; |
1339 | 0 | return 0; |
1340 | 0 | } else if (memcmp(str, "mpos", 4) == 0) { |
1341 | 0 | *end = str+4; |
1342 | 0 | res->d = b->core.mpos+1; |
1343 | 0 | return 0; |
1344 | 0 | } else if (memcmp(str, "mrname", 6) == 0) { |
1345 | 0 | *end = str+6; |
1346 | 0 | res->is_str = 1; |
1347 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1348 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1349 | 0 | return 0; |
1350 | 0 | } else if (memcmp(str, "mrefid", 6) == 0) { |
1351 | 0 | *end = str+6; |
1352 | 0 | res->d = b->core.mtid; |
1353 | 0 | return 0; |
1354 | 0 | } |
1355 | 0 | break; |
1356 | | |
1357 | 0 | case 'n': |
1358 | 0 | if (memcmp(str, "ncigar", 6) == 0) { |
1359 | 0 | *end = str+6; |
1360 | 0 | res->d = b->core.n_cigar; |
1361 | 0 | return 0; |
1362 | 0 | } |
1363 | 0 | break; |
1364 | | |
1365 | 0 | case 'p': |
1366 | 0 | if (memcmp(str, "pos", 3) == 0) { |
1367 | 0 | *end = str+3; |
1368 | 0 | res->d = b->core.pos+1; |
1369 | 0 | return 0; |
1370 | 0 | } else if (memcmp(str, "pnext", 5) == 0) { |
1371 | 0 | *end = str+5; |
1372 | 0 | res->d = b->core.mpos+1; |
1373 | 0 | return 0; |
1374 | 0 | } |
1375 | 0 | break; |
1376 | | |
1377 | 0 | case 'q': |
1378 | 0 | if (memcmp(str, "qlen", 4) == 0) { |
1379 | 0 | *end = str+4; |
1380 | 0 | res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); |
1381 | 0 | return 0; |
1382 | 0 | } else if (memcmp(str, "qname", 5) == 0) { |
1383 | 0 | *end = str+5; |
1384 | 0 | res->is_str = 1; |
1385 | 0 | kputs(bam_get_qname(b), ks_clear(&res->s)); |
1386 | 0 | return 0; |
1387 | 0 | } else if (memcmp(str, "qual", 4) == 0) { |
1388 | 0 | *end = str+4; |
1389 | 0 | ks_clear(&res->s); |
1390 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1391 | 0 | return -1; |
1392 | 0 | memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); |
1393 | 0 | res->s.l = b->core.l_qseq; |
1394 | 0 | res->is_str = 1; |
1395 | 0 | return 0; |
1396 | 0 | } |
1397 | 0 | break; |
1398 | | |
1399 | 0 | case 'r': |
1400 | 0 | if (memcmp(str, "rlen", 4) == 0) { |
1401 | 0 | *end = str+4; |
1402 | 0 | res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
1403 | 0 | return 0; |
1404 | 0 | } else if (memcmp(str, "rname", 5) == 0) { |
1405 | 0 | *end = str+5; |
1406 | 0 | res->is_str = 1; |
1407 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); |
1408 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1409 | 0 | return 0; |
1410 | 0 | } else if (memcmp(str, "rnext", 5) == 0) { |
1411 | 0 | *end = str+5; |
1412 | 0 | res->is_str = 1; |
1413 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1414 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1415 | 0 | return 0; |
1416 | 0 | } else if (memcmp(str, "refid", 5) == 0) { |
1417 | 0 | *end = str+5; |
1418 | 0 | res->d = b->core.tid; |
1419 | 0 | return 0; |
1420 | 0 | } |
1421 | 0 | break; |
1422 | | |
1423 | 0 | case 's': |
1424 | 0 | if (memcmp(str, "seq", 3) == 0) { |
1425 | 0 | *end = str+3; |
1426 | 0 | ks_clear(&res->s); |
1427 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1428 | 0 | return -1; |
1429 | 0 | nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); |
1430 | 0 | res->s.s[b->core.l_qseq] = 0; |
1431 | 0 | res->s.l = b->core.l_qseq; |
1432 | 0 | res->is_str = 1; |
1433 | 0 | return 0; |
1434 | 0 | } else if (memcmp(str, "sclen", 5) == 0) { |
1435 | 0 | int sclen = 0; |
1436 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1437 | 0 | int ncigar = b->core.n_cigar; |
1438 | 0 | int left = 0; |
1439 | | |
1440 | | // left |
1441 | 0 | if (ncigar > 0 |
1442 | 0 | && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) |
1443 | 0 | left = 0, sclen += bam_cigar_oplen(cigar[0]); |
1444 | 0 | else if (ncigar > 1 |
1445 | 0 | && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP |
1446 | 0 | && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) |
1447 | 0 | left = 1, sclen += bam_cigar_oplen(cigar[1]); |
1448 | | |
1449 | | // right |
1450 | 0 | if (ncigar-1 > left |
1451 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) |
1452 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-1]); |
1453 | 0 | else if (ncigar-2 > left |
1454 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP |
1455 | 0 | && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) |
1456 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-2]); |
1457 | |
|
1458 | 0 | *end = str+5; |
1459 | 0 | res->d = sclen; |
1460 | 0 | return 0; |
1461 | 0 | } |
1462 | 0 | break; |
1463 | | |
1464 | 0 | case 't': |
1465 | 0 | if (memcmp(str, "tlen", 4) == 0) { |
1466 | 0 | *end = str+4; |
1467 | 0 | res->d = b->core.isize; |
1468 | 0 | return 0; |
1469 | 0 | } |
1470 | 0 | break; |
1471 | | |
1472 | 0 | case '[': |
1473 | 0 | if (*str == '[' && str[1] && str[2] && str[3] == ']') { |
1474 | | /* aux tags */ |
1475 | 0 | *end = str+4; |
1476 | |
|
1477 | 0 | uint8_t *aux = bam_aux_get(b, str+1); |
1478 | 0 | if (aux) { |
1479 | | // we define the truth of a tag to be its presence, even if 0. |
1480 | 0 | res->is_true = 1; |
1481 | 0 | switch (*aux) { |
1482 | 0 | case 'Z': |
1483 | 0 | case 'H': |
1484 | 0 | res->is_str = 1; |
1485 | 0 | kputs((char *)aux+1, ks_clear(&res->s)); |
1486 | 0 | break; |
1487 | | |
1488 | 0 | case 'A': |
1489 | 0 | res->is_str = 1; |
1490 | 0 | kputsn((char *)aux+1, 1, ks_clear(&res->s)); |
1491 | 0 | break; |
1492 | | |
1493 | 0 | case 'i': case 'I': |
1494 | 0 | case 's': case 'S': |
1495 | 0 | case 'c': case 'C': |
1496 | 0 | res->is_str = 0; |
1497 | 0 | res->d = bam_aux2i(aux); |
1498 | 0 | break; |
1499 | | |
1500 | 0 | case 'f': |
1501 | 0 | case 'd': |
1502 | 0 | res->is_str = 0; |
1503 | 0 | res->d = bam_aux2f(aux); |
1504 | 0 | break; |
1505 | | |
1506 | 0 | default: |
1507 | 0 | hts_log_error("Aux type '%c not yet supported by filters", |
1508 | 0 | *aux); |
1509 | 0 | return -1; |
1510 | 0 | } |
1511 | 0 | return 0; |
1512 | |
|
1513 | 0 | } else { |
1514 | | // hence absent tags are always false (and strings) |
1515 | 0 | res->is_str = 1; |
1516 | 0 | res->s.l = 0; |
1517 | 0 | res->d = 0; |
1518 | 0 | res->is_true = 0; |
1519 | 0 | return 0; |
1520 | 0 | } |
1521 | 0 | } |
1522 | 0 | break; |
1523 | 0 | } |
1524 | | |
1525 | | // All successful matches in switch should return 0. |
1526 | | // So if we didn't match, it's a parse error. |
1527 | 0 | return -1; |
1528 | 0 | } |
1529 | | |
1530 | | // Returns 1 when accepted by the filter, 0 if not, -1 on error. |
1531 | | int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) |
1532 | 0 | { |
1533 | 0 | hb_pair hb = {h, b}; |
1534 | 0 | hts_expr_val_t res = HTS_EXPR_VAL_INIT; |
1535 | 0 | if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { |
1536 | 0 | hts_log_error("Couldn't process filter expression"); |
1537 | 0 | hts_expr_val_free(&res); |
1538 | 0 | return -1; |
1539 | 0 | } |
1540 | | |
1541 | 0 | int t = res.is_true; |
1542 | 0 | hts_expr_val_free(&res); |
1543 | |
|
1544 | 0 | return t; |
1545 | 0 | } |
1546 | | |
1547 | | static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1548 | 0 | { |
1549 | 0 | htsFile *fp = fpv; |
1550 | 0 | bam1_t *b = bv; |
1551 | 0 | int pass_filter, ret; |
1552 | |
|
1553 | 0 | do { |
1554 | 0 | ret = cram_get_bam_seq(fp->fp.cram, &b); |
1555 | 0 | if (ret < 0) |
1556 | 0 | return cram_eof(fp->fp.cram) ? -1 : -2; |
1557 | | |
1558 | 0 | if (bam_tag2cigar(b, 1, 1) < 0) |
1559 | 0 | return -2; |
1560 | | |
1561 | 0 | *tid = b->core.tid; |
1562 | 0 | *beg = b->core.pos; |
1563 | 0 | *end = bam_endpos(b); |
1564 | |
|
1565 | 0 | if (fp->filter) { |
1566 | 0 | pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); |
1567 | 0 | if (pass_filter < 0) |
1568 | 0 | return -2; |
1569 | 0 | } else { |
1570 | 0 | pass_filter = 1; |
1571 | 0 | } |
1572 | 0 | } while (pass_filter == 0); |
1573 | | |
1574 | 0 | return ret; |
1575 | 0 | } |
1576 | | |
1577 | | static int cram_pseek(void *fp, int64_t offset, int whence) |
1578 | 0 | { |
1579 | 0 | cram_fd *fd = (cram_fd *)fp; |
1580 | |
|
1581 | 0 | if ((0 != cram_seek(fd, offset, SEEK_SET)) |
1582 | 0 | && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) |
1583 | 0 | return -1; |
1584 | | |
1585 | 0 | fd->curr_position = offset; |
1586 | |
|
1587 | 0 | if (fd->ctr) { |
1588 | 0 | cram_free_container(fd->ctr); |
1589 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
1590 | 0 | cram_free_container(fd->ctr_mt); |
1591 | |
|
1592 | 0 | fd->ctr = NULL; |
1593 | 0 | fd->ctr_mt = NULL; |
1594 | 0 | fd->ooc = 0; |
1595 | 0 | } |
1596 | |
|
1597 | 0 | return 0; |
1598 | 0 | } |
1599 | | |
1600 | | /* |
1601 | | * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only |
1602 | | * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered |
1603 | | * container previously fetched. It was designed like this to integrate with the functionality |
1604 | | * of the iterator stepping logic. |
1605 | | */ |
1606 | | |
1607 | | static int64_t cram_ptell(void *fp) |
1608 | 0 | { |
1609 | 0 | cram_fd *fd = (cram_fd *)fp; |
1610 | 0 | cram_container *c; |
1611 | 0 | cram_slice *s; |
1612 | 0 | int64_t ret = -1L; |
1613 | |
|
1614 | 0 | if (fd) { |
1615 | 0 | if ((c = fd->ctr) != NULL) { |
1616 | 0 | if ((s = c->slice) != NULL && s->max_rec) { |
1617 | 0 | if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1)) |
1618 | 0 | fd->curr_position += c->offset + c->length; |
1619 | 0 | } |
1620 | 0 | } |
1621 | 0 | ret = fd->curr_position; |
1622 | 0 | } |
1623 | |
|
1624 | 0 | return ret; |
1625 | 0 | } |
1626 | | |
1627 | | static int bam_pseek(void *fp, int64_t offset, int whence) |
1628 | 0 | { |
1629 | 0 | BGZF *fd = (BGZF *)fp; |
1630 | |
|
1631 | 0 | return bgzf_seek(fd, offset, whence); |
1632 | 0 | } |
1633 | | |
1634 | | static int64_t bam_ptell(void *fp) |
1635 | 0 | { |
1636 | 0 | BGZF *fd = (BGZF *)fp; |
1637 | 0 | if (!fd) |
1638 | 0 | return -1L; |
1639 | | |
1640 | 0 | return bgzf_tell(fd); |
1641 | 0 | } |
1642 | | |
1643 | | |
1644 | | |
1645 | | static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1646 | 0 | { |
1647 | 0 | switch (fp->format.format) { |
1648 | 0 | case bam: |
1649 | 0 | case sam: |
1650 | 0 | return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags); |
1651 | | |
1652 | 0 | case cram: { |
1653 | 0 | if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL; |
1654 | | |
1655 | | // Cons up a fake "index" just pointing at the associated cram_fd: |
1656 | 0 | hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); |
1657 | 0 | if (idx == NULL) return NULL; |
1658 | 0 | idx->fmt = HTS_FMT_CRAI; |
1659 | 0 | idx->cram = fp->fp.cram; |
1660 | 0 | return (hts_idx_t *) idx; |
1661 | 0 | } |
1662 | | |
1663 | 0 | default: |
1664 | 0 | return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t |
1665 | 0 | } |
1666 | 0 | } |
1667 | | |
1668 | | hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1669 | 0 | { |
1670 | 0 | return index_load(fp, fn, fnidx, flags); |
1671 | 0 | } |
1672 | | |
1673 | 0 | hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) { |
1674 | 0 | return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); |
1675 | 0 | } |
1676 | | |
1677 | | hts_idx_t *sam_index_load(htsFile *fp, const char *fn) |
1678 | 0 | { |
1679 | 0 | return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); |
1680 | 0 | } |
1681 | | |
1682 | | static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) |
1683 | 0 | { |
1684 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1685 | 0 | hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); |
1686 | 0 | if (iter == NULL) return NULL; |
1687 | | |
1688 | | // Cons up a dummy iterator for which hts_itr_next() will simply invoke |
1689 | | // the readrec function: |
1690 | 0 | iter->is_cram = 1; |
1691 | 0 | iter->read_rest = 1; |
1692 | 0 | iter->off = NULL; |
1693 | 0 | iter->bins.a = NULL; |
1694 | 0 | iter->readrec = readrec; |
1695 | |
|
1696 | 0 | if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) { |
1697 | 0 | cram_range r = { tid, beg+1, end }; |
1698 | 0 | int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r); |
1699 | |
|
1700 | 0 | iter->curr_off = 0; |
1701 | | // The following fields are not required by hts_itr_next(), but are |
1702 | | // filled in in case user code wants to look at them. |
1703 | 0 | iter->tid = tid; |
1704 | 0 | iter->beg = beg; |
1705 | 0 | iter->end = end; |
1706 | |
|
1707 | 0 | switch (ret) { |
1708 | 0 | case 0: |
1709 | 0 | break; |
1710 | | |
1711 | 0 | case -2: |
1712 | | // No data vs this ref, so mark iterator as completed. |
1713 | | // Same as HTS_IDX_NONE. |
1714 | 0 | iter->finished = 1; |
1715 | 0 | break; |
1716 | | |
1717 | 0 | default: |
1718 | 0 | free(iter); |
1719 | 0 | return NULL; |
1720 | 0 | } |
1721 | 0 | } |
1722 | 0 | else switch (tid) { |
1723 | 0 | case HTS_IDX_REST: |
1724 | 0 | iter->curr_off = 0; |
1725 | 0 | break; |
1726 | 0 | case HTS_IDX_NONE: |
1727 | 0 | iter->curr_off = 0; |
1728 | 0 | iter->finished = 1; |
1729 | 0 | break; |
1730 | 0 | default: |
1731 | 0 | hts_log_error("Query with tid=%d not implemented for CRAM files", tid); |
1732 | 0 | abort(); |
1733 | 0 | break; |
1734 | 0 | } |
1735 | | |
1736 | 0 | return iter; |
1737 | 0 | } |
1738 | | |
1739 | | hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) |
1740 | 0 | { |
1741 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1742 | 0 | if (idx == NULL) |
1743 | 0 | return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest); |
1744 | 0 | else if (cidx->fmt == HTS_FMT_CRAI) |
1745 | 0 | return cram_itr_query(idx, tid, beg, end, sam_readrec); |
1746 | 0 | else |
1747 | 0 | return hts_itr_query(idx, tid, beg, end, sam_readrec); |
1748 | 0 | } |
1749 | | |
1750 | | static int cram_name2id(void *fdv, const char *ref) |
1751 | 0 | { |
1752 | 0 | cram_fd *fd = (cram_fd *) fdv; |
1753 | 0 | return sam_hdr_name2tid(fd->header, ref); |
1754 | 0 | } |
1755 | | |
1756 | | hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) |
1757 | 0 | { |
1758 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1759 | 0 | return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, |
1760 | 0 | cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query, |
1761 | 0 | sam_readrec); |
1762 | 0 | } |
1763 | | |
1764 | | hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) |
1765 | 0 | { |
1766 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1767 | 0 | hts_reglist_t *r_list = NULL; |
1768 | 0 | int r_count = 0; |
1769 | |
|
1770 | 0 | if (!cidx || !hdr) |
1771 | 0 | return NULL; |
1772 | | |
1773 | 0 | hts_itr_t *itr = NULL; |
1774 | 0 | if (cidx->fmt == HTS_FMT_CRAI) { |
1775 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id); |
1776 | 0 | if (!r_list) |
1777 | 0 | return NULL; |
1778 | 0 | itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram, |
1779 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1780 | 0 | } else { |
1781 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id)); |
1782 | 0 | if (!r_list) |
1783 | 0 | return NULL; |
1784 | 0 | itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr, |
1785 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1786 | 0 | } |
1787 | | |
1788 | 0 | if (!itr) |
1789 | 0 | hts_reglist_free(r_list, r_count); |
1790 | |
|
1791 | 0 | return itr; |
1792 | 0 | } |
1793 | | |
1794 | | hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) |
1795 | 0 | { |
1796 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1797 | |
|
1798 | 0 | if(!cidx || !hdr || !reglist) |
1799 | 0 | return NULL; |
1800 | | |
1801 | 0 | if (cidx->fmt == HTS_FMT_CRAI) |
1802 | 0 | return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, |
1803 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1804 | 0 | else |
1805 | 0 | return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, |
1806 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1807 | 0 | } |
1808 | | |
1809 | | /********************** |
1810 | | *** SAM header I/O *** |
1811 | | **********************/ |
1812 | | |
1813 | | #include "htslib/kseq.h" |
1814 | | #include "htslib/kstring.h" |
1815 | | |
1816 | | sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) |
1817 | 0 | { |
1818 | 0 | sam_hdr_t *bh = sam_hdr_init(); |
1819 | 0 | if (!bh) return NULL; |
1820 | | |
1821 | 0 | if (sam_hdr_add_lines(bh, text, l_text) != 0) { |
1822 | 0 | sam_hdr_destroy(bh); |
1823 | 0 | return NULL; |
1824 | 0 | } |
1825 | | |
1826 | 0 | return bh; |
1827 | 0 | } |
1828 | | |
1829 | | // Minimal sanitisation of a header to ensure. |
1830 | | // - null terminated string. |
1831 | | // - all lines start with @ (also implies no blank lines). |
1832 | | // |
1833 | | // Much more could be done, but currently is not, including: |
1834 | | // - checking header types are known (HD, SQ, etc). |
1835 | | // - syntax (eg checking tab separated fields). |
1836 | | // - validating n_targets matches @SQ records. |
1837 | | // - validating target lengths against @SQ records. |
1838 | 1.87k | static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { |
1839 | 1.87k | if (!h) |
1840 | 9 | return NULL; |
1841 | | |
1842 | | // Special case for empty headers. |
1843 | 1.86k | if (h->l_text == 0) |
1844 | 405 | return h; |
1845 | | |
1846 | 1.45k | size_t i; |
1847 | 1.45k | unsigned int lnum = 0; |
1848 | 1.45k | char *cp = h->text, last = '\n'; |
1849 | 18.0M | for (i = 0; i < h->l_text; i++) { |
1850 | | // NB: l_text excludes terminating nul. This finds early ones. |
1851 | 18.0M | if (cp[i] == 0) |
1852 | 564 | break; |
1853 | | |
1854 | | // Error on \n[^@], including duplicate newlines |
1855 | 18.0M | if (last == '\n') { |
1856 | 106k | lnum++; |
1857 | 106k | if (cp[i] != '@') { |
1858 | 0 | hts_log_error("Malformed SAM header at line %u", lnum); |
1859 | 0 | sam_hdr_destroy(h); |
1860 | 0 | return NULL; |
1861 | 0 | } |
1862 | 106k | } |
1863 | | |
1864 | 18.0M | last = cp[i]; |
1865 | 18.0M | } |
1866 | | |
1867 | 1.45k | if (i < h->l_text) { // Early nul found. Complain if not just padding. |
1868 | 564 | size_t j = i; |
1869 | 3.65k | while (j < h->l_text && cp[j] == '\0') j++; |
1870 | 564 | if (j < h->l_text) |
1871 | 561 | hts_log_warning("Unexpected NUL character in header. Possibly truncated"); |
1872 | 564 | } |
1873 | | |
1874 | | // Add trailing newline and/or trailing nul if required. |
1875 | 1.45k | if (last != '\n') { |
1876 | 561 | hts_log_warning("Missing trailing newline on SAM header. Possibly truncated"); |
1877 | | |
1878 | 561 | if (h->l_text < 2 || i >= h->l_text - 2) { |
1879 | 81 | if (h->l_text >= SIZE_MAX - 2) { |
1880 | 0 | hts_log_error("No room for extra newline"); |
1881 | 0 | sam_hdr_destroy(h); |
1882 | 0 | return NULL; |
1883 | 0 | } |
1884 | | |
1885 | 81 | cp = realloc(h->text, (size_t) h->l_text+2); |
1886 | 81 | if (!cp) { |
1887 | 0 | sam_hdr_destroy(h); |
1888 | 0 | return NULL; |
1889 | 0 | } |
1890 | 81 | h->text = cp; |
1891 | 81 | } |
1892 | 561 | cp[i++] = '\n'; |
1893 | | |
1894 | | // l_text may be larger already due to multiple nul padding |
1895 | 561 | if (h->l_text < i) |
1896 | 0 | h->l_text = i; |
1897 | 561 | cp[h->l_text] = '\0'; |
1898 | 561 | } |
1899 | | |
1900 | 1.45k | return h; |
1901 | 1.45k | } |
1902 | | |
1903 | 1.32k | static sam_hdr_t *sam_hdr_create(htsFile* fp) { |
1904 | 1.32k | sam_hdr_t* h = sam_hdr_init(); |
1905 | 1.32k | if (!h) |
1906 | 0 | return NULL; |
1907 | | |
1908 | 1.32k | if (sam_hdr_build_from_sam_file(h, fp) != 0) { |
1909 | 78 | sam_hdr_destroy(h); |
1910 | 78 | return NULL; |
1911 | 78 | } |
1912 | | |
1913 | 1.24k | if (fp->bam_header) |
1914 | 0 | sam_hdr_destroy(fp->bam_header); |
1915 | 1.24k | fp->bam_header = sam_hdr_sanitise(h); |
1916 | 1.24k | fp->bam_header->ref_count = 1; |
1917 | | |
1918 | 1.24k | return fp->bam_header; |
1919 | 1.32k | } |
1920 | | |
1921 | | sam_hdr_t *sam_hdr_read(htsFile *fp) |
1922 | 2.59k | { |
1923 | 2.59k | sam_hdr_t *h = NULL; |
1924 | 2.59k | if (!fp) { |
1925 | 0 | errno = EINVAL; |
1926 | 0 | return NULL; |
1927 | 0 | } |
1928 | | |
1929 | 2.59k | switch (fp->format.format) { |
1930 | 42 | case bam: |
1931 | 42 | h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); |
1932 | 42 | break; |
1933 | | |
1934 | 582 | case cram: |
1935 | 582 | h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header)); |
1936 | 582 | break; |
1937 | | |
1938 | 1.32k | case sam: |
1939 | 1.32k | h = sam_hdr_create(fp); |
1940 | 1.32k | break; |
1941 | | |
1942 | 24 | case fastq_format: |
1943 | 648 | case fasta_format: |
1944 | 648 | return sam_hdr_init(); |
1945 | | |
1946 | 0 | case empty_format: |
1947 | 0 | errno = EPIPE; |
1948 | 0 | return NULL; |
1949 | | |
1950 | 0 | default: |
1951 | 0 | errno = EFTYPE; |
1952 | 0 | return NULL; |
1953 | 2.59k | } |
1954 | | //only sam,bam and cram reaches here |
1955 | 1.95k | if (h && !fp->bam_header) { //set except for sam which already has it |
1956 | | //for cram, it is the o/p header as for rest and not the internal header |
1957 | 615 | fp->bam_header = h; |
1958 | 615 | sam_hdr_incr_ref(fp->bam_header); |
1959 | 615 | } |
1960 | 1.95k | return h; |
1961 | 2.59k | } |
1962 | | |
1963 | | int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) |
1964 | 2.51k | { |
1965 | 2.51k | if (!fp || !h) { |
1966 | 0 | errno = EINVAL; |
1967 | 0 | return -1; |
1968 | 0 | } |
1969 | | |
1970 | 2.51k | switch (fp->format.format) { |
1971 | 837 | case binary_format: |
1972 | 837 | fp->format.category = sequence_data; |
1973 | 837 | fp->format.format = bam; |
1974 | | /* fall-through */ |
1975 | 837 | case bam: |
1976 | 837 | if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1; |
1977 | 837 | break; |
1978 | | |
1979 | 837 | case cram: { |
1980 | 837 | cram_fd *fd = fp->fp.cram; |
1981 | 837 | if (cram_set_header2(fd, h) < 0) return -1; |
1982 | 784 | if (fp->fn_aux) |
1983 | 0 | cram_load_reference(fd, fp->fn_aux); |
1984 | 784 | if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; |
1985 | 784 | } |
1986 | 784 | break; |
1987 | | |
1988 | 837 | case text_format: |
1989 | 837 | fp->format.category = sequence_data; |
1990 | 837 | fp->format.format = sam; |
1991 | | /* fall-through */ |
1992 | 837 | case sam: { |
1993 | 837 | if (!h->hrecs && !h->text) |
1994 | 0 | return 0; |
1995 | 837 | char *text; |
1996 | 837 | kstring_t hdr_ks = { 0, 0, NULL }; |
1997 | 837 | size_t l_text; |
1998 | 837 | ssize_t bytes; |
1999 | 837 | int r = 0, no_sq = 0; |
2000 | | |
2001 | 837 | if (h->hrecs) { |
2002 | 836 | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) |
2003 | 0 | return -1; |
2004 | 836 | text = hdr_ks.s; |
2005 | 836 | l_text = hdr_ks.l; |
2006 | 836 | } else { |
2007 | 1 | const char *p = NULL; |
2008 | 1 | do { |
2009 | 1 | const char *q = p == NULL ? h->text : p + 4; |
2010 | 1 | p = strstr(q, "@SQ\t"); |
2011 | 1 | } while (!(p == NULL || p == h->text || *(p - 1) == '\n')); |
2012 | 1 | no_sq = p == NULL; |
2013 | 1 | text = h->text; |
2014 | 1 | l_text = h->l_text; |
2015 | 1 | } |
2016 | | |
2017 | 837 | if (fp->is_bgzf) { |
2018 | 0 | bytes = bgzf_write(fp->fp.bgzf, text, l_text); |
2019 | 837 | } else { |
2020 | 837 | bytes = hwrite(fp->fp.hfile, text, l_text); |
2021 | 837 | } |
2022 | 837 | free(hdr_ks.s); |
2023 | 837 | if (bytes != l_text) |
2024 | 0 | return -1; |
2025 | | |
2026 | 837 | if (no_sq) { |
2027 | 1 | int i; |
2028 | 1 | for (i = 0; i < h->n_targets; ++i) { |
2029 | 0 | fp->line.l = 0; |
2030 | 0 | r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0; |
2031 | 0 | r |= kputs(h->target_name[i], &fp->line) < 0; |
2032 | 0 | r |= kputsn("\tLN:", 4, &fp->line) < 0; |
2033 | 0 | r |= kputw(h->target_len[i], &fp->line) < 0; |
2034 | 0 | r |= kputc('\n', &fp->line) < 0; |
2035 | 0 | if (r != 0) |
2036 | 0 | return -1; |
2037 | | |
2038 | 0 | if (fp->is_bgzf) { |
2039 | 0 | bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); |
2040 | 0 | } else { |
2041 | 0 | bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); |
2042 | 0 | } |
2043 | 0 | if (bytes != fp->line.l) |
2044 | 0 | return -1; |
2045 | 0 | } |
2046 | 1 | } |
2047 | 837 | if (fp->is_bgzf) { |
2048 | 0 | if (bgzf_flush(fp->fp.bgzf) != 0) return -1; |
2049 | 837 | } else { |
2050 | 837 | if (hflush(fp->fp.hfile) != 0) return -1; |
2051 | 837 | } |
2052 | 837 | } |
2053 | 837 | break; |
2054 | | |
2055 | 837 | case fastq_format: |
2056 | 0 | case fasta_format: |
2057 | | // Nothing to output; FASTQ has no file headers. |
2058 | 0 | return 0; |
2059 | 0 | break; |
2060 | | |
2061 | 0 | default: |
2062 | 0 | errno = EBADF; |
2063 | 0 | return -1; |
2064 | 2.51k | } |
2065 | | //only sam,bam and cram reaches here |
2066 | 2.45k | if (h) { //the new header |
2067 | 2.45k | sam_hdr_t *tmp = fp->bam_header; |
2068 | 2.45k | fp->bam_header = sam_hdr_dup(h); |
2069 | 2.45k | sam_hdr_destroy(tmp); |
2070 | 2.45k | if (!fp->bam_header && h) |
2071 | 0 | return -1; //failed to duplicate |
2072 | 2.45k | } |
2073 | 2.45k | return 0; |
2074 | 2.45k | } |
2075 | | |
2076 | | static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2077 | 0 | { |
2078 | 0 | char *p, *q, *beg = NULL, *end = NULL, *newtext; |
2079 | 0 | size_t new_l_text; |
2080 | 0 | if (!h || !key) |
2081 | 0 | return -1; |
2082 | | |
2083 | 0 | if (h->l_text > 3) { |
2084 | 0 | if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists |
2085 | 0 | if ((p = strchr(h->text, '\n')) == 0) return -1; |
2086 | 0 | *p = '\0'; // for strstr call |
2087 | |
|
2088 | 0 | char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' }; |
2089 | |
|
2090 | 0 | if ((q = strstr(h->text, tmp)) != 0) { // key exists |
2091 | 0 | *p = '\n'; // change back |
2092 | | |
2093 | | // mark the key:val |
2094 | 0 | beg = q; |
2095 | 0 | for (q += 4; *q != '\n' && *q != '\t'; ++q); |
2096 | 0 | end = q; |
2097 | |
|
2098 | 0 | if (val && (strncmp(beg + 4, val, end - beg - 4) == 0) |
2099 | 0 | && strlen(val) == end - beg - 4) |
2100 | 0 | return 0; // val is the same, no need to change |
2101 | |
|
2102 | 0 | } else { |
2103 | 0 | beg = end = p; |
2104 | 0 | *p = '\n'; |
2105 | 0 | } |
2106 | 0 | } |
2107 | 0 | } |
2108 | 0 | if (beg == NULL) { // no @HD |
2109 | 0 | new_l_text = h->l_text; |
2110 | 0 | if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9) |
2111 | 0 | return -1; |
2112 | 0 | new_l_text += strlen(SAM_FORMAT_VERSION) + 8; |
2113 | 0 | if (val) { |
2114 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2115 | 0 | return -1; |
2116 | 0 | new_l_text += strlen(val) + 4; |
2117 | 0 | } |
2118 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2119 | 0 | if (!newtext) return -1; |
2120 | | |
2121 | 0 | if (val) |
2122 | 0 | snprintf(newtext, new_l_text + 1, |
2123 | 0 | "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); |
2124 | 0 | else |
2125 | 0 | snprintf(newtext, new_l_text + 1, |
2126 | 0 | "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); |
2127 | 0 | } else { // has @HD but different or no key |
2128 | 0 | new_l_text = (beg - h->text) + (h->text + h->l_text - end); |
2129 | 0 | if (val) { |
2130 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2131 | 0 | return -1; |
2132 | 0 | new_l_text += strlen(val) + 4; |
2133 | 0 | } |
2134 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2135 | 0 | if (!newtext) return -1; |
2136 | | |
2137 | 0 | if (val) { |
2138 | 0 | snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s", |
2139 | 0 | (int) (beg - h->text), h->text, key, val, end); |
2140 | 0 | } else { //delete key |
2141 | 0 | snprintf(newtext, new_l_text + 1, "%.*s%s", |
2142 | 0 | (int) (beg - h->text), h->text, end); |
2143 | 0 | } |
2144 | 0 | } |
2145 | 0 | free(h->text); |
2146 | 0 | h->text = newtext; |
2147 | 0 | h->l_text = new_l_text; |
2148 | 0 | return 0; |
2149 | 0 | } |
2150 | | |
2151 | | |
2152 | | int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2153 | 0 | { |
2154 | 0 | if (!h || !key) |
2155 | 0 | return -1; |
2156 | | |
2157 | 0 | if (!h->hrecs) |
2158 | 0 | return old_sam_hdr_change_HD(h, key, val); |
2159 | | |
2160 | 0 | if (val) { |
2161 | 0 | if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0) |
2162 | 0 | return -1; |
2163 | 0 | } else { |
2164 | 0 | if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0) |
2165 | 0 | return -1; |
2166 | 0 | } |
2167 | 0 | return sam_hdr_rebuild(h); |
2168 | 0 | } |
2169 | | |
2170 | | /* releases existing header and sets new one; increments ref count if not |
2171 | | duplicating */ |
2172 | | int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate) |
2173 | 0 | { |
2174 | 0 | if (!fp) |
2175 | 0 | return -1; |
2176 | | |
2177 | 0 | if (duplicate) { |
2178 | 0 | sam_hdr_t *tmp = fp->bam_header; |
2179 | 0 | fp->bam_header = sam_hdr_dup(h); |
2180 | 0 | sam_hdr_destroy(tmp); |
2181 | 0 | if (!fp->bam_header && h) |
2182 | 0 | return -1; //duplicate failed |
2183 | 0 | } else { |
2184 | 0 | if (fp->bam_header != h) { //if not the same |
2185 | 0 | sam_hdr_destroy(fp->bam_header); |
2186 | 0 | fp->bam_header = h; |
2187 | 0 | sam_hdr_incr_ref(fp->bam_header); |
2188 | 0 | } |
2189 | 0 | } |
2190 | | |
2191 | 0 | return 0; |
2192 | 0 | } |
2193 | | |
2194 | | //return the bam_header, user has to use sam_hdr_incr_ref where ever required |
2195 | | sam_hdr_t* sam_hdr_get(samFile* fp) |
2196 | 0 | { |
2197 | 0 | if (!fp) |
2198 | 0 | return NULL; |
2199 | 0 | return fp->bam_header; |
2200 | 0 | } |
2201 | | |
2202 | | /********************** |
2203 | | *** SAM record I/O *** |
2204 | | **********************/ |
2205 | | |
// The speed of this code can vary considerably depending on minor code
// changes elsewhere, as some of the tight loops are particularly prone to
// speed changes when their instruction blocks are split over a 32-byte
// boundary.  To protect against this, we explicitly specify an alignment
// for the parsing functions below.  If this is insufficient, we may also
// wish to consider aligning blocks within a function via
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents;
// however, that is not very portable.
// Instead we break the work into separate functions so that we can
// explicitly use __attribute__((aligned(32))) and force consistent loop
// alignment.
2217 | 19.7k | static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) { |
2218 | | // Avoid overflow on 32-bit platforms, but it breaks BAM anyway |
2219 | 19.7k | if (*n > INT32_MAX*0.666) { |
2220 | 0 | errno = ENOMEM; |
2221 | 0 | return -1; |
2222 | 0 | } |
2223 | | |
2224 | 19.7k | size_t bytes = (size_t)size * (size_t)(*n>>1); |
2225 | 19.7k | if (possibly_expand_bam_data(b, bytes) < 0) { |
2226 | 0 | hts_log_error("Out of memory"); |
2227 | 0 | return -1; |
2228 | 0 | } |
2229 | | |
2230 | 19.7k | (*n)+=*n>>1; |
2231 | 19.7k | return 0; |
2232 | 19.7k | } |
2233 | | |
2234 | | |
// Advance q until it reaches a comma or a character <= '\t' (tab, nul).
// This guarantees q ends up at the next comma after reading a number even
// if the number is followed by junk, which in turn prevents any attempt
// to read more than n items.
#define skip_to_comma_(q) do { for (; *(q) > '\t' && *(q) != ','; (q)++) {} } while (0)
2239 | | |
2240 | | HTS_ALIGN32 |
2241 | | static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused, |
2242 | 1.65k | uint32_t *nalloc, int *overflow) { |
2243 | 51.0k | while (*q == ',') { |
2244 | 49.4k | if ((*nused)++ >= (*nalloc)) { |
2245 | 54 | if (grow_B_array(b, nalloc, 1) < 0) |
2246 | 0 | return NULL; |
2247 | 54 | } |
2248 | 49.4k | *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow); |
2249 | 49.4k | b->l_data++; |
2250 | 49.4k | } |
2251 | 1.65k | return q; |
2252 | 1.65k | } |
2253 | | |
2254 | | HTS_ALIGN32 |
2255 | | static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused, |
2256 | 19.0k | uint32_t *nalloc, int *overflow) { |
2257 | 137k | while (*q == ',') { |
2258 | 118k | if ((*nused)++ >= (*nalloc)) { |
2259 | 1.97k | if (grow_B_array(b, nalloc, 1) < 0) |
2260 | 0 | return NULL; |
2261 | 1.97k | } |
2262 | 118k | if (q[1] != '-') { |
2263 | 109k | *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow); |
2264 | 109k | b->l_data++; |
2265 | 109k | } else { |
2266 | 8.62k | *overflow = 1; |
2267 | 8.62k | q++; |
2268 | 8.62k | skip_to_comma_(q); |
2269 | 8.62k | } |
2270 | 118k | } |
2271 | 19.0k | return q; |
2272 | 19.0k | } |
2273 | | |
2274 | | HTS_ALIGN32 |
2275 | | static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused, |
2276 | 4.44k | uint32_t *nalloc, int *overflow) { |
2277 | 46.1k | while (*q == ',') { |
2278 | 41.7k | if ((*nused)++ >= (*nalloc)) { |
2279 | 1.49k | if (grow_B_array(b, nalloc, 2) < 0) |
2280 | 0 | return NULL; |
2281 | 1.49k | } |
2282 | 41.7k | i16_to_le(hts_str2int(q + 1, &q, 16, overflow), |
2283 | 41.7k | b->data + b->l_data); |
2284 | 41.7k | b->l_data += 2; |
2285 | 41.7k | } |
2286 | 4.44k | return q; |
2287 | 4.44k | } |
2288 | | |
2289 | | HTS_ALIGN32 |
2290 | | static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused, |
2291 | 1.98k | uint32_t *nalloc, int *overflow) { |
2292 | 1.82M | while (*q == ',') { |
2293 | 1.82M | if ((*nused)++ >= (*nalloc)) { |
2294 | 2.24k | if (grow_B_array(b, nalloc, 2) < 0) |
2295 | 0 | return NULL; |
2296 | 2.24k | } |
2297 | 1.82M | if (q[1] != '-') { |
2298 | 1.79M | u16_to_le(hts_str2uint(q + 1, &q, 16, overflow), |
2299 | 1.79M | b->data + b->l_data); |
2300 | 1.79M | b->l_data += 2; |
2301 | 1.79M | } else { |
2302 | 27.3k | *overflow = 1; |
2303 | 27.3k | q++; |
2304 | 27.3k | skip_to_comma_(q); |
2305 | 27.3k | } |
2306 | 1.82M | } |
2307 | 1.98k | return q; |
2308 | 1.98k | } |
2309 | | |
2310 | | HTS_ALIGN32 |
2311 | | static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused, |
2312 | 2.07k | uint32_t *nalloc, int *overflow) { |
2313 | 2.07M | while (*q == ',') { |
2314 | 2.07M | if ((*nused)++ >= (*nalloc)) { |
2315 | 123 | if (grow_B_array(b, nalloc, 4) < 0) |
2316 | 0 | return NULL; |
2317 | 123 | } |
2318 | 2.07M | i32_to_le(hts_str2int(q + 1, &q, 32, overflow), |
2319 | 2.07M | b->data + b->l_data); |
2320 | 2.07M | b->l_data += 4; |
2321 | 2.07M | } |
2322 | 2.07k | return q; |
2323 | 2.07k | } |
2324 | | |
2325 | | HTS_ALIGN32 |
2326 | | static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused, |
2327 | 6.84k | uint32_t *nalloc, int *overflow) { |
2328 | 139k | while (*q == ',') { |
2329 | 132k | if ((*nused)++ >= (*nalloc)) { |
2330 | 11.6k | if (grow_B_array(b, nalloc, 4) < 0) |
2331 | 0 | return NULL; |
2332 | 11.6k | } |
2333 | 132k | if (q[1] != '-') { |
2334 | 126k | u32_to_le(hts_str2uint(q + 1, &q, 32, overflow), |
2335 | 126k | b->data + b->l_data); |
2336 | 126k | b->l_data += 4; |
2337 | 126k | } else { |
2338 | 6.14k | *overflow = 1; |
2339 | 6.14k | q++; |
2340 | 6.14k | skip_to_comma_(q); |
2341 | 6.14k | } |
2342 | 132k | } |
2343 | 6.84k | return q; |
2344 | 6.84k | } |
2345 | | |
2346 | | HTS_ALIGN32 |
2347 | | static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused, |
2348 | 2.20k | uint32_t *nalloc, int *overflow) { |
2349 | 20.7k | while (*q == ',') { |
2350 | 18.5k | if ((*nused)++ >= (*nalloc)) { |
2351 | 2.16k | if (grow_B_array(b, nalloc, 4) < 0) |
2352 | 0 | return NULL; |
2353 | 2.16k | } |
2354 | 18.5k | float_to_le(strtod(q + 1, &q), b->data + b->l_data); |
2355 | 18.5k | b->l_data += 4; |
2356 | 18.5k | } |
2357 | 2.20k | return q; |
2358 | 2.20k | } |
2359 | | |
2360 | | HTS_ALIGN32 |
2361 | | static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in, |
2362 | | char **end, bam1_t *b, |
2363 | 38.4k | int *ctr) { |
2364 | | // Protect against infinite recursion when dealing with invalid input. |
2365 | | // An example string is "XX:B:C,-". The lack of a number means min=0, |
2366 | | // but it overflowed due to "-" and so we repeat ad-infinitum. |
2367 | | // |
2368 | | // Loop detection is the safest solution incase there are other |
2369 | | // strange corner cases with malformed inputs. |
2370 | 38.4k | if (++(*ctr) > 2) { |
2371 | 0 | hts_log_error("Malformed data in B:%c array", type); |
2372 | 0 | return -1; |
2373 | 0 | } |
2374 | | |
2375 | 38.4k | int orig_l = b->l_data; |
2376 | 38.4k | char *q = in; |
2377 | 38.4k | int32_t size; |
2378 | 38.4k | size_t bytes; |
2379 | 38.4k | int overflow = 0; |
2380 | | |
2381 | 38.4k | size = aux_type2size(type); |
2382 | 38.4k | if (size <= 0 || size > 4) { |
2383 | 2 | hts_log_error("Unrecognized type B:%c", type); |
2384 | 2 | return -1; |
2385 | 2 | } |
2386 | | |
2387 | | // Ensure space for type + values. |
2388 | | // The first pass through here we don't know the number of entries and |
2389 | | // nalloc == 0. We start with a small working set and then parse the |
2390 | | // data, growing as needed. |
2391 | | // |
2392 | | // If we have a second pass through we do know the number of entries |
2393 | | // and nalloc is already known. We have no need to expand the bam data. |
2394 | 38.4k | if (!nalloc) |
2395 | 33.8k | nalloc=7; |
2396 | | |
2397 | | // Ensure allocated memory is big enough (for current nalloc estimate) |
2398 | 38.4k | bytes = (size_t) nalloc * (size_t) size; |
2399 | 38.4k | if (bytes / size != nalloc |
2400 | 38.4k | || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) { |
2401 | 0 | hts_log_error("Out of memory"); |
2402 | 0 | return -1; |
2403 | 0 | } |
2404 | | |
2405 | 38.4k | uint32_t nused = 0; |
2406 | | |
2407 | 38.4k | b->data[b->l_data++] = 'B'; |
2408 | 38.4k | b->data[b->l_data++] = type; |
2409 | | // 32-bit B-array length is inserted later once we know it. |
2410 | 38.4k | int b_len_idx = b->l_data; |
2411 | 38.4k | b->l_data += sizeof(uint32_t); |
2412 | | |
2413 | 38.4k | if (type == 'c') { |
2414 | 1.65k | if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow))) |
2415 | 0 | return -1; |
2416 | 36.8k | } else if (type == 'C') { |
2417 | 19.0k | if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow))) |
2418 | 0 | return -1; |
2419 | 19.0k | } else if (type == 's') { |
2420 | 4.44k | if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow))) |
2421 | 0 | return -1; |
2422 | 13.3k | } else if (type == 'S') { |
2423 | 1.98k | if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow))) |
2424 | 0 | return -1; |
2425 | 11.3k | } else if (type == 'i') { |
2426 | 2.07k | if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow))) |
2427 | 0 | return -1; |
2428 | 9.29k | } else if (type == 'I') { |
2429 | 6.84k | if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow))) |
2430 | 0 | return -1; |
2431 | 6.84k | } else if (type == 'f') { |
2432 | 2.20k | if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow))) |
2433 | 0 | return -1; |
2434 | 2.20k | } |
2435 | 38.4k | if (*q != '\t' && *q != '\0') { |
2436 | | // Unknown B array type or junk in the numbers |
2437 | 21 | hts_log_error("Malformed B:%c", type); |
2438 | 21 | return -1; |
2439 | 21 | } |
2440 | 38.4k | i32_to_le(nused, b->data + b_len_idx); |
2441 | | |
2442 | 38.4k | if (!overflow) { |
2443 | 33.8k | *end = q; |
2444 | 33.8k | return 0; |
2445 | 33.8k | } else { |
2446 | 4.57k | int64_t max = 0, min = 0, val; |
2447 | | // Given type was incorrect. Try to rescue the situation. |
2448 | 4.57k | char *r = q; |
2449 | 4.57k | q = in; |
2450 | 4.57k | overflow = 0; |
2451 | 4.57k | b->l_data = orig_l; |
2452 | | // Find out what range of values is present |
2453 | 2.12M | while (q < r) { |
2454 | 2.12M | val = hts_str2int(q + 1, &q, 64, &overflow); |
2455 | 2.12M | if (max < val) max = val; |
2456 | 2.12M | if (min > val) min = val; |
2457 | 2.12M | skip_to_comma_(q); |
2458 | 2.12M | } |
2459 | | // Retry with appropriate type |
2460 | 4.57k | if (!overflow) { |
2461 | 4.56k | if (min < 0) { |
2462 | 4.26k | if (min >= INT8_MIN && max <= INT8_MAX) { |
2463 | 1.31k | return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr); |
2464 | 2.95k | } else if (min >= INT16_MIN && max <= INT16_MAX) { |
2465 | 892 | return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr); |
2466 | 2.06k | } else if (min >= INT32_MIN && max <= INT32_MAX) { |
2467 | 2.05k | return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr); |
2468 | 2.05k | } |
2469 | 4.26k | } else { |
2470 | 304 | if (max < UINT8_MAX) { |
2471 | 0 | return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr); |
2472 | 304 | } else if (max <= UINT16_MAX) { |
2473 | 0 | return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr); |
2474 | 304 | } else if (max <= UINT32_MAX) { |
2475 | 301 | return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr); |
2476 | 301 | } |
2477 | 304 | } |
2478 | 4.56k | } |
2479 | | // If here then at least one of the values is too big to store |
2480 | 14 | hts_log_error("Numeric value in B array out of allowed range"); |
2481 | 14 | return -1; |
2482 | 4.57k | } |
2483 | 38.4k | #undef skip_to_comma_ |
2484 | 38.4k | } |
2485 | | |
2486 | | HTS_ALIGN32 |
2487 | | static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b) |
2488 | 33.8k | { |
2489 | 33.8k | int ctr = 0; |
2490 | 33.8k | uint32_t nalloc = 0; |
2491 | 33.8k | return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr); |
2492 | 33.8k | } |
2493 | | |
// Parse the FLAG field starting at v.
//
// A leading 1-9 is read as a 16-bit unsigned decimal.  A lone "0" followed
// by a tab is handled directly; any other text starting with '0' goes
// through strtoul() with base 0 (hex or octal) and values above 65535 set
// *overflow and yield 65535.  Anything else yields 0 with *rv == v.
//
// *rv is set to the first character not consumed.
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char first = *v;

    if (first >= '1' && first <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    if (first != '0') {
        // TODO implement symbolic flag letters
        *rv = v;
        return 0;
    }

    // handle single-digit "0" directly; otherwise it's hex or octal
    if (v[1] == '\t') {
        *rv = v+1;
        return 0;
    }

    unsigned long parsed = strtoul(v, rv, 0);
    if (parsed > 65535) {
        *overflow = 1;
        return 65535;
    }
    return parsed;
}
2513 | | |
// Parse tag line and append to bam object b.
// Shared by both SAM and FASTQ parsers.
//
// The difference between the two is how lenient we are to recognising
// non-compliant strings.  The FASTQ parser glosses over arbitrary
// non-SAM looking strings.
//
// start .. end delimit the text to parse (end is one past the last byte).
// Each field is read as "XX:T:value": the two id bytes, a type byte and the
// encoded value are appended at b->data + b->l_data.
//
// lenient: when non-zero, a malformed field is skipped and b->l_data is
//   rolled back to where that field started, instead of failing.
// tag_whitelist: when non-NULL, fields whose id (q[0]*256 + q[1]) is not a
//   key in this hash are skipped without being stored.
//
// Returns 0 on success, -2 on failure.
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
                            khash_t(tag) *tag_whitelist) {
    // Passed to hts_str2int() / hts_str2uint(); only tested after the loop.
    int overflow = 0;
    // b->l_data at the start of the field currently being parsed.
    int checkpoint;
    char logbuf[40];
    char *q = start, *p = end;

// When cond holds: in lenient mode skip the rest of the current
// whitespace-delimited token and any following whitespace, drop whatever
// was appended for this field (restore checkpoint) and jump back into the
// loop body; otherwise log the message and fail.
#define _parse_err(cond, ...)                   \
    do {                                        \
        if (cond) {                             \
            if (lenient) {                      \
                while (q < p && !isspace_c(*q)) \
                    q++;                        \
                while (q < p && isspace_c(*q))  \
                    q++;                        \
                b->l_data = checkpoint;         \
                goto loop;                      \
            } else {                            \
                hts_log_error(__VA_ARGS__);     \
                goto err_ret;                   \
            }                                   \
        }                                       \
    } while (0)

    // The "loop" label sits on the loop body so _parse_err can restart the
    // body directly; note that this entry does not re-test q < p.
    while (q < p) loop: {
        char type;
        checkpoint = b->l_data;
        // Fewer than 5 bytes cannot hold the "XX:T:" prefix.
        if (p - q < 5) {
            if (lenient) {
                break;
            } else {
                hts_log_error("Incomplete aux field");
                goto err_ret;
            }
        }
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");

        // Lenient mode only: if either separator position is not ':' this
        // is not treated as a tag; skip the token without storing anything.
        if (lenient && (q[2] | q[4]) != ':') {
            while (q < p && !isspace_c(*q))
                q++;
            while (q < p && isspace_c(*q))
                q++;
            continue;
        }

        if (tag_whitelist) {
            int tt = q[0]*256 + q[1];
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
                // Not whitelisted: advance to the next tab (the tab itself
                // is not consumed here).
                while (q < p && *q != '\t')
                    q++;
                continue;
            }
        }

        // Copy over id
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
        q += 3; type = *q++; ++q; // q points to value
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
            _parse_err(*q <= '\t', "incomplete aux field");

        // Ensure enough space for a double + type allocated.
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;

        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
            // All four text types are stored as 'A' followed by one raw byte.
            b->data[b->l_data++] = 'A';
            b->data[b->l_data++] = *q++;
        } else if (type == 'i' || type == 'I') {
            // Integers are stored using the smallest of the 8/16/32-bit
            // types that holds the value; the sign of the text picks the
            // signed (c/s/i) or unsigned (C/S/I) family.
            if (*q == '-') {
                int32_t x = hts_str2int(q, &q, 32, &overflow);
                if (x >= INT8_MIN) {
                    b->data[b->l_data++] = 'c';
                    b->data[b->l_data++] = x;
                } else if (x >= INT16_MIN) {
                    b->data[b->l_data++] = 's';
                    i16_to_le(x, b->data + b->l_data);
                    b->l_data += 2;
                } else {
                    b->data[b->l_data++] = 'i';
                    i32_to_le(x, b->data + b->l_data);
                    b->l_data += 4;
                }
            } else {
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
                if (x <= UINT8_MAX) {
                    b->data[b->l_data++] = 'C';
                    b->data[b->l_data++] = x;
                } else if (x <= UINT16_MAX) {
                    b->data[b->l_data++] = 'S';
                    u16_to_le(x, b->data + b->l_data);
                    b->l_data += 2;
                } else {
                    b->data[b->l_data++] = 'I';
                    u32_to_le(x, b->data + b->l_data);
                    b->l_data += 4;
                }
            }
        } else if (type == 'f') {
            b->data[b->l_data++] = 'f';
            float_to_le(strtod(q, &q), b->data + b->l_data);
            b->l_data += sizeof(float);
        } else if (type == 'd') {
            b->data[b->l_data++] = 'd';
            double_to_le(strtod(q, &q), b->data + b->l_data);
            b->l_data += sizeof(double);
        } else if (type == 'Z' || type == 'H') {
            // String value runs to the next tab, or to the terminating NUL
            // when there is no further tab.  This local "end" shadows the
            // function parameter of the same name.
            char *end = strchr(q, '\t');
            if (!end) end = q + strlen(q);
            _parse_err(type == 'H' && ((end-q)&1) != 0,
                       "hex field does not have an even number of digits");
            b->data[b->l_data++] = type;
            // +1 for the NUL terminator written below.
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
            memcpy(b->data + b->l_data, q, end - q);
            b->l_data += end - q;
            b->data[b->l_data++] = '\0';
            q = end;
        } else if (type == 'B') {
            type = *q++; // q points to the first ',' following the typing byte
            _parse_err(*q && *q != ',' && *q != '\t',
                       "B aux field type not followed by ','");

            if (sam_parse_B_vals(type, q, &q, b) < 0)
                goto err_ret;
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));

        while (*q > '\t') { q++; } // Skip any junk to next tab
        q++;
    }

    // Overflow is only an error in strict mode; because the condition
    // includes !lenient, the lenient branch of the macro is never taken here.
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
#undef _parse_err

    return 0;

err_ret:
    return -2;
}
2657 | | |
// Parse one SAM text line held in s into the BAM record b.
//
// The text is modified in place: _read_token overwrites the tab ending each
// token it reads with NUL.  h is used to translate reference names through
// bam_name2id().  b->l_data is reset to 0 and the record is rebuilt in the
// order qname, cigar, seq, qual, aux.
//
// Returns 0 on success and -2 on any failure.
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
{
// Evaluates to the current token start, then NUL-terminates the token at
// the next tab and advances _p past it; jumps to err_ret if no tab exists.
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)

// COPY_MINUS_N(to, from, n, l, failed): copy l bytes from "from" to "to",
// subtracting n from each, and set failed to non-zero if any resulting byte
// has its top bit set.
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff

// Macro that operates on 64-bits at a time.
#define COPY_MINUS_N(to,from,n,l,failed)                        \
    do {                                                        \
        uint64_u *from8 = (uint64_u *)(from);                   \
        uint64_u *to8 = (uint64_u *)(to);                       \
        uint64_t uflow = 0;                                     \
        size_t l8 = (l)>>3, i;                                  \
        for (i = 0; i < l8; i++) {                              \
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
            uflow |= to8[i];                                    \
        }                                                       \
        for (i<<=3; i < (l); ++i) {                             \
            to[i] = from[i] - (n);                              \
            uflow |= to[i];                                     \
        }                                                       \
        failed = (uflow & 0x8080808080808080UL) > 0;            \
    } while (0)

#else

// Basic version which operates a byte at a time
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
        uint8_t uflow = 0;                                   \
        for (i = 0; i < (l); ++i) {                          \
            (to)[i] = (from)[i] - (n);                       \
            uflow |= (uint8_t) (to)[i];                      \
        }                                                    \
        failed = (uflow & 0x80) > 0;                         \
    } while (0)

#endif

// _get_mem reserves l bytes at the end of b->data, stores their address in
// *x and advances b->l_data.  It expands to several statements, so it must
// not be used as the unbraced body of an if/else.
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)

    uint8_t *t;

    char *p = s->s, *q;
    // overflow is shared by all numeric columns and tested once after tlen.
    int i, overflow = 0;
    char logbuf[40];
    hts_pos_t cigreflen;
    bam1_core_t *c = &b->core;

    b->l_data = 0;
    // NOTE(review): clears the first 32 bytes of the core struct; assumes
    // this covers every field that must start at zero - confirm against
    // the bam1_core_t definition.
    memset(c, 0, 32);

    // qname
    q = _read_token(p);

    // p - q counts the name plus its terminating NUL.
    _parse_warn(p - q <= 1, "empty query name");
    _parse_err(p - q > 255, "query name too long");
    // resize large enough for name + extranul
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;

    // Pad with extra NULs so the data following the name starts on a
    // 4-byte boundary.
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
    b->l_data += c->l_extranul;

    c->l_qname = p - q + c->l_extranul;

    // flag
    c->flag = parse_sam_flag(p, &p, &overflow);
    if (*p++ != '\t') goto err_ret; // malformated flag

    // chr
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
        c->tid = bam_name2id(h, q);
        _parse_err(c->tid < -1, "failed to parse header");
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    } else c->tid = -1;

    // pos
    // Text position minus one, so a text value of 0 becomes -1.
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->pos < 0 && c->tid >= 0) {
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
        c->tid = -1;
    }
    if (c->tid < 0) c->flag |= BAM_FUNMAP;

    // mapq
    c->qual = hts_str2uint(p, &p, 8, &overflow);
    if (*p++ != '\t') goto err_ret;
    // cigar
    if (*p != '*') {
        uint32_t *cigar = NULL;
        int old_l_data = b->l_data;
        int n_cigar = bam_parse_cigar(p, &p, b);
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
        // Taken after parsing, since bam_parse_cigar may have reallocated
        // b->data.
        cigar = (uint32_t *)(b->data + old_l_data);

        // can't use bam_endpos() directly as some fields not yet set up
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
        if (cigreflen == 0) cigreflen = 1;
    } else {
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
        c->flag |= BAM_FUNMAP;
        q = _read_token(p);
        cigreflen = 1;
    }
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
               "read ends beyond highest supported position");
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
    // mate chr
    q = _read_token(p);
    if (strcmp(q, "=") == 0) {
        c->mtid = c->tid;
    } else if (strcmp(q, "*") == 0) {
        c->mtid = -1;
    } else {
        c->mtid = bam_name2id(h, q);
        _parse_err(c->mtid < -1, "failed to parse header");
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    }
    // mpos
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->mpos < 0 && c->mtid >= 0) {
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
        c->mtid = -1;
    }
    // tlen
    c->isize = hts_str2int(p, &p, 63, &overflow);
    if (*p++ != '\t') goto err_ret;
    // Covers every numeric column parsed above.
    _parse_err(overflow, "number outside allowed range");
    // seq
    q = _read_token(p);
    if (strcmp(q, "*")) {
        // p - q - 1 is the sequence length (p is one past the NUL).
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
        c->l_qseq = p - q - 1;
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
        i = (c->l_qseq + 1) >> 1;
        _get_mem(uint8_t, &t, b, i);

        // This "i" shadows the function-scope int i for the rest of the
        // block.  Bases are packed two per byte through seq_nt16_table,
        // first base in the high nibble; the second loop stores a trailing
        // odd base in the high nibble of the last byte.
        unsigned int lqs2 = c->l_qseq&~1, i;
        for (i = 0; i < lqs2; i+=2)
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
        for (; i < c->l_qseq; ++i)
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
    } else c->l_qseq = 0;
    // qual
    _get_mem(uint8_t, &t, b, c->l_qseq);
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
        // A lone "*" means no qualities: fill with 0xff.
        memset(t, 0xff, c->l_qseq);
        p += 2;
    } else {
        int failed = 0;
        // Check the remaining text is long enough before indexing
        // p[c->l_qseq], which must be the column terminator.
        _parse_err(s->l - (p - s->s) < c->l_qseq
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
                   "SEQ and QUAL are of different length");
        // Store each quality character minus 33.
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
        _parse_err(failed, "invalid QUAL character");
        p += c->l_qseq + 1;
    }

    // aux
    // Strict mode (lenient = 0) and no tag whitelist.
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
        goto err_ret;

    if (bam_tag2cigar(b, 1, 1) < 0)
        return -2;
    return 0;

#undef _parse_warn
#undef _parse_err
#undef _get_mem
#undef _read_token
err_ret:
    return -2;
}
2839 | | |
// Count the CIGAR operators in the text at q.
//
// Scans up to the first tab or NUL and counts every character that is not
// a decimal digit.  Returns that count, or 0 (after logging an error) when
// there are none or when the count reaches 2147483647.
static uint32_t read_ncigar(const char *q) {
    uint32_t n_ops = 0;
    const char *s = q;

    while (*s != '\0' && *s != '\t') {
        if (!isdigit_c(*s))
            n_ops++;
        s++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }

    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
2855 | | |
2856 | | /*! @function |
2857 | | @abstract Parse a CIGAR string into preallocated a uint32_t array |
2858 | | @param in [in] pointer to the source string |
2859 | | @param a_cigar [out] address of the destination uint32_t buffer |
2860 | | @return number of processed input characters; 0 on error |
2861 | | */ |
2862 | 77.1k | static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { |
2863 | 77.1k | int i, overflow = 0; |
2864 | 77.1k | const char *p = in; |
2865 | 229k | for (i = 0; i < n_cigar; i++) { |
2866 | 152k | uint32_t len; |
2867 | 152k | int op; |
2868 | 152k | char *q; |
2869 | 152k | len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT; |
2870 | 152k | if (q == p) { |
2871 | 36 | hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p); |
2872 | 36 | return 0; |
2873 | 36 | } |
2874 | 152k | if (overflow) { |
2875 | 21 | hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p); |
2876 | 21 | return 0; |
2877 | 21 | } |
2878 | 152k | p = q; |
2879 | 152k | op = bam_cigar_table[(unsigned char)*p++]; |
2880 | 152k | if (op < 0) { |
2881 | 103 | hts_log_error("Unrecognized CIGAR operator"); |
2882 | 103 | return 0; |
2883 | 103 | } |
2884 | 152k | a_cigar[i] = len; |
2885 | 152k | a_cigar[i] |= op; |
2886 | 152k | } |
2887 | | |
2888 | 76.9k | return p-in; |
2889 | 77.1k | } |
2890 | | |
// Parse the CIGAR text at "in" into the caller-owned array *a_cigar.
//
// *a_cigar / *a_mem describe a reallocatable buffer and its capacity in
// entries; both are updated when the buffer has to grow.  If end is
// non-NULL it receives a pointer just past the consumed text.
//
// Returns the number of operations stored; 0 for "*" or when
// read_ncigar() reports no operations; -1 on NULL arguments, allocation
// failure or a malformed CIGAR.
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (!in || !a_cigar || !a_mem) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }
    if (end) *end = (char *)in;

    // "*" stands for an absent CIGAR: consume that one character.
    if (*in == '*') {
        if (end) (*end)++;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0) return 0;

    if (n_ops > *a_mem) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (!grown) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int consumed = parse_cigar(in, *a_cigar, n_ops);
    if (consumed == 0) return -1;
    if (end) *end = (char *)in+consumed;

    return n_ops;
}
2923 | | |
2924 | 77.1k | ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { |
2925 | 77.1k | size_t n_cigar = 0; |
2926 | 77.1k | int diff; |
2927 | | |
2928 | 77.1k | if (!in || !b) { |
2929 | 0 | hts_log_error("NULL pointer arguments"); |
2930 | 0 | return -1; |
2931 | 0 | } |
2932 | 77.1k | if (end) *end = (char *)in; |
2933 | | |
2934 | 77.1k | n_cigar = (*in == '*') ? 0 : read_ncigar(in); |
2935 | 77.1k | if (!n_cigar && b->core.n_cigar == 0) { |
2936 | 31 | if (end) *end = (char *)in+1; |
2937 | 31 | return 0; |
2938 | 31 | } |
2939 | | |
2940 | 77.1k | ssize_t cig_diff = n_cigar - b->core.n_cigar; |
2941 | 77.1k | if (cig_diff > 0 && |
2942 | 77.1k | possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) { |
2943 | 0 | hts_log_error("Memory allocation error"); |
2944 | 0 | return -1; |
2945 | 0 | } |
2946 | | |
2947 | 77.1k | uint32_t *cig = bam_get_cigar(b); |
2948 | 77.1k | if ((uint8_t *)cig != b->data + b->l_data) { |
2949 | | // Modifying an BAM existing BAM record |
2950 | 0 | uint8_t *seq = bam_get_seq(b); |
2951 | 0 | memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq); |
2952 | 0 | } |
2953 | | |
2954 | 77.1k | if (n_cigar) { |
2955 | 77.1k | if (!(diff = parse_cigar(in, cig, n_cigar))) |
2956 | 160 | return -1; |
2957 | 77.1k | } else { |
2958 | 0 | diff = 1; // handle "*" |
2959 | 0 | } |
2960 | | |
2961 | 76.9k | b->l_data += cig_diff * sizeof(uint32_t); |
2962 | 76.9k | b->core.n_cigar = n_cigar; |
2963 | 76.9k | if (end) *end = (char *)in + diff; |
2964 | | |
2965 | 76.9k | return n_cigar; |
2966 | 77.1k | } |
2967 | | |
2968 | | /* |
2969 | | * ----------------------------------------------------------------------------- |
2970 | | * SAM threading |
2971 | | */ |
// Size of SAM text block (reading)
// Used as the initial allocation of an sp_lines data buffer; the buffer is
// grown when a single line does not fit.
#define SAM_NBYTES 240000

// Number of BAM records (writing, up to NB_mem in size)
#define SAM_NBAM 1000

// Forward declaration: sp_bams and sp_lines hold a pointer back to the
// SAM_state, which is defined after them.
struct SAM_state;
2979 | | |
// Output job - a block of BAM records
typedef struct sp_bams {
    struct sp_bams *next;   // link used while the block sits on a SAM_state list
    int serial;             // copied from the sp_lines block it was parsed from

    bam1_t *bams;           // array of records
    int nbams, abams; // used and alloc for bams[] array
    size_t bam_mem;   // very approximate total size

    struct SAM_state *fd;   // presumably the owning state - TODO confirm where it is set
} sp_bams;
2991 | | |
// Input job - a block of SAM text
typedef struct sp_lines {
    struct sp_lines *next;  // link used while the block sits on SAM_state.lines
    int serial;             // sequence number assigned when the block is dispatched

    char *data;             // text buffer; allocated as alloc + 8 bytes
    int data_size;          // bytes of data[] holding text to parse
    int alloc;              // nominal capacity of data[] (excludes the 8 spare bytes)

    struct SAM_state *fd;   // state this block belongs to
    sp_bams *bams;          // freed with the block by cleanup_sp_lines()
} sp_lines;
3004 | | |
// Command word stored in SAM_state.command (protected by command_m) and
// used to coordinate the closing thread with the dispatcher thread.
enum sam_cmd {
    SAM_NONE = 0,       // no command pending
    SAM_CLOSE,          // set by sam_state_destroy() to ask the dispatcher to stop
    SAM_CLOSE_DONE,     // sam_state_destroy() polls for this; presumably set by the dispatcher once it has stopped
    SAM_AT_EOF,         // NOTE(review): no use visible here; meaning to be confirmed
};
3011 | | |
// Per-file state for multi-threaded SAM reading and writing; hung off
// htsFile.state by sam_state_create() and released by sam_state_destroy().
typedef struct SAM_state {
    sam_hdr_t *h;               // header handed to sam_parse1() by the workers; released in sam_state_destroy()

    hts_tpool *p;               // thread pool jobs are dispatched to
    int own_pool;               // non-zero: sam_state_destroy() may destroy p
    pthread_mutex_t lines_m;    // guards the lines and bams lists below
    hts_tpool_process *q;       // process queue within the pool
    pthread_t dispatcher;       // dispatcher thread, joined on destroy
    int dispatcher_set;         // non-zero once dispatcher is valid to join

    sp_lines *lines;            // list of sp_lines blocks available for reuse
    sp_bams *bams;              // list of sp_bams blocks available for reuse

    sp_bams *curr_bam;          // block currently in use; when writing, the partial block flushed on close
    int curr_idx;               // NOTE(review): presumably an index into curr_bam->bams - confirm
    int serial;                 // next serial number for a dispatched block

    // Be warned: moving these mutexes around in this struct can reduce
    // threading performance by up to 70%!
    pthread_mutex_t command_m;  // guards command and errcode
    pthread_cond_t command_c;   // signalled when command changes
    enum sam_cmd command;

    // One of the E* errno codes
    // Only the first error is kept (see sam_state_err()).
    int errcode;

    htsFile *fp;                // back pointer to the owning file
} SAM_state;
3040 | | |
3041 | | // Returns a SAM_state struct from a generic hFILE. |
3042 | | // |
3043 | | // Returns NULL on failure. |
3044 | 0 | static SAM_state *sam_state_create(htsFile *fp) { |
3045 | | // Ideally sam_open wouldn't be a #define to hts_open but instead would |
3046 | | // be a redirect call with an additional 'S' mode. This in turn would |
3047 | | // correctly set the designed format to sam instead of a generic |
3048 | | // text_format. |
3049 | 0 | if (fp->format.format != sam && fp->format.format != text_format) |
3050 | 0 | return NULL; |
3051 | | |
3052 | 0 | SAM_state *fd = calloc(1, sizeof(*fd)); |
3053 | 0 | if (!fd) |
3054 | 0 | return NULL; |
3055 | | |
3056 | 0 | fp->state = fd; |
3057 | 0 | fd->fp = fp; |
3058 | |
|
3059 | 0 | return fd; |
3060 | 0 | } |
3061 | | |
// Forward declarations of file-local functions that are used before their
// definitions (sam_format_worker is dispatched from sam_state_destroy()).
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
static void *sam_format_worker(void *arg);
3064 | | |
3065 | 0 | static void sam_state_err(SAM_state *fd, int errcode) { |
3066 | 0 | pthread_mutex_lock(&fd->command_m); |
3067 | 0 | if (!fd->errcode) |
3068 | 0 | fd->errcode = errcode; |
3069 | 0 | pthread_mutex_unlock(&fd->command_m); |
3070 | 0 | } |
3071 | | |
3072 | 0 | static void sam_free_sp_bams(sp_bams *b) { |
3073 | 0 | if (!b) |
3074 | 0 | return; |
3075 | | |
3076 | 0 | if (b->bams) { |
3077 | 0 | int i; |
3078 | 0 | for (i = 0; i < b->abams; i++) { |
3079 | 0 | if (b->bams[i].data) |
3080 | 0 | free(b->bams[i].data); |
3081 | 0 | } |
3082 | 0 | free(b->bams); |
3083 | 0 | } |
3084 | 0 | free(b); |
3085 | 0 | } |
3086 | | |
3087 | | // Destroys the state produce by sam_state_create. |
3088 | 2.60k | int sam_state_destroy(htsFile *fp) { |
3089 | 2.60k | int ret = 0; |
3090 | | |
3091 | 2.60k | if (!fp->state) |
3092 | 2.60k | return 0; |
3093 | | |
3094 | 0 | SAM_state *fd = fp->state; |
3095 | 0 | if (fd->p) { |
3096 | 0 | if (fd->h) { |
3097 | | // Notify sam_dispatcher we're closing |
3098 | 0 | pthread_mutex_lock(&fd->command_m); |
3099 | 0 | if (fd->command != SAM_CLOSE_DONE) |
3100 | 0 | fd->command = SAM_CLOSE; |
3101 | 0 | pthread_cond_signal(&fd->command_c); |
3102 | 0 | ret = -fd->errcode; |
3103 | 0 | if (fd->q) |
3104 | 0 | hts_tpool_wake_dispatch(fd->q); // unstick the reader |
3105 | |
|
3106 | 0 | if (!fp->is_write && fd->q && fd->dispatcher_set) { |
3107 | 0 | for (;;) { |
3108 | | // Avoid deadlocks with dispatcher |
3109 | 0 | if (fd->command == SAM_CLOSE_DONE) |
3110 | 0 | break; |
3111 | 0 | hts_tpool_wake_dispatch(fd->q); |
3112 | 0 | pthread_mutex_unlock(&fd->command_m); |
3113 | 0 | hts_usleep(10000); |
3114 | 0 | pthread_mutex_lock(&fd->command_m); |
3115 | 0 | } |
3116 | 0 | } |
3117 | 0 | pthread_mutex_unlock(&fd->command_m); |
3118 | |
|
3119 | 0 | if (fp->is_write) { |
3120 | | // Dispatch the last partial block. |
3121 | 0 | sp_bams *gb = fd->curr_bam; |
3122 | 0 | if (!ret && gb && gb->nbams > 0 && fd->q) |
3123 | 0 | ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb); |
3124 | | |
3125 | | // Flush and drain output |
3126 | 0 | if (fd->q) |
3127 | 0 | hts_tpool_process_flush(fd->q); |
3128 | 0 | pthread_mutex_lock(&fd->command_m); |
3129 | 0 | if (!ret) ret = -fd->errcode; |
3130 | 0 | pthread_mutex_unlock(&fd->command_m); |
3131 | |
|
3132 | 0 | while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) { |
3133 | 0 | hts_usleep(10000); |
3134 | 0 | pthread_mutex_lock(&fd->command_m); |
3135 | 0 | ret = -fd->errcode; |
3136 | | // not empty but shutdown implies error |
3137 | 0 | if (hts_tpool_process_is_shutdown(fd->q) && !ret) |
3138 | 0 | ret = EIO; |
3139 | 0 | pthread_mutex_unlock(&fd->command_m); |
3140 | 0 | } |
3141 | 0 | if (fd->q) |
3142 | 0 | hts_tpool_process_shutdown(fd->q); |
3143 | 0 | } |
3144 | | |
3145 | | // Wait for it to acknowledge |
3146 | 0 | if (fd->dispatcher_set) |
3147 | 0 | pthread_join(fd->dispatcher, NULL); |
3148 | 0 | if (!ret) ret = -fd->errcode; |
3149 | 0 | } |
3150 | | |
3151 | | // Tidy up memory |
3152 | 0 | if (fd->q) |
3153 | 0 | hts_tpool_process_destroy(fd->q); |
3154 | |
|
3155 | 0 | if (fd->own_pool && fp->format.compression == no_compression) { |
3156 | 0 | hts_tpool_destroy(fd->p); |
3157 | 0 | fd->p = NULL; |
3158 | 0 | } |
3159 | 0 | pthread_mutex_destroy(&fd->lines_m); |
3160 | 0 | pthread_mutex_destroy(&fd->command_m); |
3161 | 0 | pthread_cond_destroy(&fd->command_c); |
3162 | |
|
3163 | 0 | sp_lines *l = fd->lines; |
3164 | 0 | while (l) { |
3165 | 0 | sp_lines *n = l->next; |
3166 | 0 | free(l->data); |
3167 | 0 | free(l); |
3168 | 0 | l = n; |
3169 | 0 | } |
3170 | |
|
3171 | 0 | sp_bams *b = fd->bams; |
3172 | 0 | while (b) { |
3173 | 0 | if (fd->curr_bam == b) |
3174 | 0 | fd->curr_bam = NULL; |
3175 | 0 | sp_bams *n = b->next; |
3176 | 0 | sam_free_sp_bams(b); |
3177 | 0 | b = n; |
3178 | 0 | } |
3179 | |
|
3180 | 0 | if (fd->curr_bam) |
3181 | 0 | sam_free_sp_bams(fd->curr_bam); |
3182 | | |
3183 | | // Decrement counter by one, maybe destroying too. |
3184 | | // This is to permit the caller using bam_hdr_destroy |
3185 | | // before sam_close without triggering decode errors |
3186 | | // in the background threads. |
3187 | 0 | bam_hdr_destroy(fd->h); |
3188 | 0 | } |
3189 | |
|
3190 | 0 | free(fp->state); |
3191 | 0 | fp->state = NULL; |
3192 | 0 | return ret; |
3193 | 2.60k | } |
3194 | | |
3195 | | // Cleanup function - job for sam_parse_worker; result for sam_format_worker |
3196 | 0 | static void cleanup_sp_lines(void *arg) { |
3197 | 0 | sp_lines *gl = (sp_lines *)arg; |
3198 | 0 | if (!gl) return; |
3199 | | |
3200 | | // Should always be true for lines passed to / from thread workers. |
3201 | 0 | assert(gl->next == NULL); |
3202 | |
|
3203 | 0 | free(gl->data); |
3204 | 0 | sam_free_sp_bams(gl->bams); |
3205 | 0 | free(gl); |
3206 | 0 | } |
3207 | | |
3208 | | // Run from one of the worker threads. |
3209 | | // Convert a passed in array of lines to array of BAMs, returning |
3210 | | // the result back to the thread queue. |
3211 | 0 | static void *sam_parse_worker(void *arg) { |
3212 | 0 | sp_lines *gl = (sp_lines *)arg; |
3213 | 0 | sp_bams *gb = NULL; |
3214 | 0 | char *lines = gl->data; |
3215 | 0 | int i; |
3216 | 0 | bam1_t *b; |
3217 | 0 | SAM_state *fd = gl->fd; |
3218 | | |
3219 | | // Use a block of BAM structs we had earlier if available. |
3220 | 0 | pthread_mutex_lock(&fd->lines_m); |
3221 | 0 | if (fd->bams) { |
3222 | 0 | gb = fd->bams; |
3223 | 0 | fd->bams = gb->next; |
3224 | 0 | } |
3225 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3226 | |
|
3227 | 0 | if (gb == NULL) { |
3228 | 0 | gb = calloc(1, sizeof(*gb)); |
3229 | 0 | if (!gb) { |
3230 | 0 | return NULL; |
3231 | 0 | } |
3232 | 0 | gb->abams = 100; |
3233 | 0 | gb->bams = b = calloc(gb->abams, sizeof(*b)); |
3234 | 0 | if (!gb->bams) { |
3235 | 0 | sam_state_err(fd, ENOMEM); |
3236 | 0 | goto err; |
3237 | 0 | } |
3238 | 0 | gb->nbams = 0; |
3239 | 0 | gb->bam_mem = 0; |
3240 | 0 | } |
3241 | 0 | gb->serial = gl->serial; |
3242 | 0 | gb->next = NULL; |
3243 | |
|
3244 | 0 | b = (bam1_t *)gb->bams; |
3245 | 0 | if (!b) { |
3246 | 0 | sam_state_err(fd, ENOMEM); |
3247 | 0 | goto err; |
3248 | 0 | } |
3249 | | |
3250 | 0 | i = 0; |
3251 | 0 | char *cp = lines, *cp_end = lines + gl->data_size; |
3252 | 0 | while (cp < cp_end) { |
3253 | 0 | if (i >= gb->abams) { |
3254 | 0 | int old_abams = gb->abams; |
3255 | 0 | gb->abams *= 2; |
3256 | 0 | b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t)); |
3257 | 0 | if (!b) { |
3258 | 0 | gb->abams /= 2; |
3259 | 0 | sam_state_err(fd, ENOMEM); |
3260 | 0 | goto err; |
3261 | 0 | } |
3262 | 0 | memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b)); |
3263 | 0 | gb->bams = b; |
3264 | 0 | } |
3265 | | |
3266 | | // Ideally we'd get sam_parse1 to return the number of |
3267 | | // bytes decoded and to be able to stop on newline as |
3268 | | // well as \0. |
3269 | | // |
3270 | | // We can then avoid the additional strchr loop. |
3271 | | // It's around 6% of our CPU cost, albeit threadable. |
3272 | | // |
3273 | | // However this is an API change so for now we copy. |
3274 | | |
3275 | 0 | char *nl = strchr(cp, '\n'); |
3276 | 0 | char *line_end; |
3277 | 0 | if (nl) { |
3278 | 0 | line_end = nl; |
3279 | 0 | if (line_end > cp && *(line_end - 1) == '\r') |
3280 | 0 | line_end--; |
3281 | 0 | nl++; |
3282 | 0 | } else { |
3283 | 0 | nl = line_end = cp_end; |
3284 | 0 | } |
3285 | 0 | *line_end = '\0'; |
3286 | 0 | kstring_t ks = { line_end - cp, gl->alloc, cp }; |
3287 | 0 | if (sam_parse1(&ks, fd->h, &b[i]) < 0) { |
3288 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3289 | 0 | cleanup_sp_lines(gl); |
3290 | 0 | goto err; |
3291 | 0 | } |
3292 | | |
3293 | 0 | cp = nl; |
3294 | 0 | i++; |
3295 | 0 | } |
3296 | 0 | gb->nbams = i; |
3297 | |
|
3298 | 0 | pthread_mutex_lock(&fd->lines_m); |
3299 | 0 | gl->next = fd->lines; |
3300 | 0 | fd->lines = gl; |
3301 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3302 | 0 | return gb; |
3303 | | |
3304 | 0 | err: |
3305 | 0 | sam_free_sp_bams(gb); |
3306 | 0 | return NULL; |
3307 | 0 | } |
3308 | | |
// Job function that ignores its argument and produces a NULL result.
static void *sam_parse_eof(void *arg) {
    (void) arg;
    return NULL;
}
3312 | | |
3313 | | // Cleanup function - result for sam_parse_worker; job for sam_format_worker |
3314 | 0 | static void cleanup_sp_bams(void *arg) { |
3315 | 0 | sam_free_sp_bams((sp_bams *) arg); |
3316 | 0 | } |
3317 | | |
3318 | | // Runs in its own thread. |
3319 | | // Reads a block of text (SAM) and sends a new job to the thread queue to |
3320 | | // translate this to BAM. |
3321 | 0 | static void *sam_dispatcher_read(void *vp) { |
3322 | 0 | htsFile *fp = vp; |
3323 | 0 | kstring_t line = {0}; |
3324 | 0 | int line_frag = 0; |
3325 | 0 | SAM_state *fd = fp->state; |
3326 | 0 | sp_lines *l = NULL; |
3327 | | |
3328 | | // Pre-allocate buffer for left-over bits of line (exact size doesn't |
3329 | | // matter as it will grow if necessary). |
3330 | 0 | if (ks_resize(&line, 1000) < 0) |
3331 | 0 | goto err; |
3332 | | |
3333 | 0 | for (;;) { |
3334 | | // Check for command |
3335 | 0 | pthread_mutex_lock(&fd->command_m); |
3336 | 0 | switch (fd->command) { |
3337 | | |
3338 | 0 | case SAM_CLOSE: |
3339 | 0 | pthread_cond_signal(&fd->command_c); |
3340 | 0 | pthread_mutex_unlock(&fd->command_m); |
3341 | 0 | hts_tpool_process_shutdown(fd->q); |
3342 | 0 | goto tidyup; |
3343 | | |
3344 | 0 | default: |
3345 | 0 | break; |
3346 | 0 | } |
3347 | 0 | pthread_mutex_unlock(&fd->command_m); |
3348 | |
|
3349 | 0 | pthread_mutex_lock(&fd->lines_m); |
3350 | 0 | if (fd->lines) { |
3351 | | // reuse existing line buffer |
3352 | 0 | l = fd->lines; |
3353 | 0 | fd->lines = l->next; |
3354 | 0 | } |
3355 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3356 | |
|
3357 | 0 | if (l == NULL) { |
3358 | | // none to reuse, to create a new one |
3359 | 0 | l = calloc(1, sizeof(*l)); |
3360 | 0 | if (!l) |
3361 | 0 | goto err; |
3362 | 0 | l->alloc = SAM_NBYTES; |
3363 | 0 | l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 |
3364 | 0 | if (!l->data) { |
3365 | 0 | free(l); |
3366 | 0 | l = NULL; |
3367 | 0 | goto err; |
3368 | 0 | } |
3369 | 0 | l->fd = fd; |
3370 | 0 | } |
3371 | 0 | l->next = NULL; |
3372 | |
|
3373 | 0 | if (l->alloc < line_frag+SAM_NBYTES/2) { |
3374 | 0 | char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); |
3375 | 0 | if (!rp) |
3376 | 0 | goto err; |
3377 | 0 | l->alloc = line_frag+SAM_NBYTES/2; |
3378 | 0 | l->data = rp; |
3379 | 0 | } |
3380 | 0 | memcpy(l->data, line.s, line_frag); |
3381 | |
|
3382 | 0 | l->data_size = line_frag; |
3383 | 0 | ssize_t nbytes; |
3384 | 0 | longer_line: |
3385 | 0 | if (fp->is_bgzf) |
3386 | 0 | nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag); |
3387 | 0 | else |
3388 | 0 | nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag); |
3389 | 0 | if (nbytes < 0) { |
3390 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3391 | 0 | goto err; |
3392 | 0 | } else if (nbytes == 0) |
3393 | 0 | break; // EOF |
3394 | 0 | l->data_size += nbytes; |
3395 | | |
3396 | | // trim to last \n. Maybe \r\n, but that's still fine |
3397 | 0 | if (nbytes == l->alloc - line_frag) { |
3398 | 0 | char *cp_end = l->data + l->data_size; |
3399 | 0 | char *cp = cp_end-1; |
3400 | |
|
3401 | 0 | while (cp > (char *)l->data && *cp != '\n') |
3402 | 0 | cp--; |
3403 | | |
3404 | | // entire buffer is part of a single line |
3405 | 0 | if (cp == l->data) { |
3406 | 0 | line_frag = l->data_size; |
3407 | 0 | char *rp = realloc(l->data, l->alloc * 2 + 8); |
3408 | 0 | if (!rp) |
3409 | 0 | goto err; |
3410 | 0 | l->alloc *= 2; |
3411 | 0 | l->data = rp; |
3412 | 0 | assert(l->alloc >= l->data_size); |
3413 | 0 | assert(l->alloc >= line_frag); |
3414 | 0 | assert(l->alloc >= l->alloc - line_frag); |
3415 | 0 | goto longer_line; |
3416 | 0 | } |
3417 | 0 | cp++; |
3418 | | |
3419 | | // line holds the remainder of our line. |
3420 | 0 | if (ks_resize(&line, cp_end - cp) < 0) |
3421 | 0 | goto err; |
3422 | 0 | memcpy(line.s, cp, cp_end - cp); |
3423 | 0 | line_frag = cp_end - cp; |
3424 | 0 | l->data_size = l->alloc - line_frag; |
3425 | 0 | } else { |
3426 | | // out of buffer |
3427 | 0 | line_frag = 0; |
3428 | 0 | } |
3429 | | |
3430 | 0 | l->serial = fd->serial++; |
3431 | | //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial); |
3432 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l, |
3433 | 0 | cleanup_sp_lines, cleanup_sp_bams, 0) < 0) |
3434 | 0 | goto err; |
3435 | 0 | pthread_mutex_lock(&fd->command_m); |
3436 | 0 | if (fd->command == SAM_CLOSE) { |
3437 | 0 | pthread_mutex_unlock(&fd->command_m); |
3438 | 0 | l = NULL; |
3439 | 0 | goto tidyup; |
3440 | 0 | } |
3441 | 0 | l = NULL; // Now "owned" by sam_parse_worker() |
3442 | 0 | pthread_mutex_unlock(&fd->command_m); |
3443 | 0 | } |
3444 | | |
3445 | | // Submit a NULL sp_bams entry to act as an EOF marker |
3446 | 0 | if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0) |
3447 | 0 | goto err; |
3448 | | |
3449 | | // At EOF, wait for close request. |
3450 | | // (In future if we add support for seek, this is where we need to catch it.) |
3451 | 0 | for (;;) { |
3452 | 0 | pthread_mutex_lock(&fd->command_m); |
3453 | 0 | if (fd->command == SAM_NONE) |
3454 | 0 | pthread_cond_wait(&fd->command_c, &fd->command_m); |
3455 | 0 | switch (fd->command) { |
3456 | 0 | case SAM_CLOSE: |
3457 | 0 | pthread_cond_signal(&fd->command_c); |
3458 | 0 | pthread_mutex_unlock(&fd->command_m); |
3459 | 0 | hts_tpool_process_shutdown(fd->q); |
3460 | 0 | goto tidyup; |
3461 | | |
3462 | 0 | default: |
3463 | 0 | pthread_mutex_unlock(&fd->command_m); |
3464 | 0 | break; |
3465 | 0 | } |
3466 | 0 | } |
3467 | | |
3468 | 0 | tidyup: |
3469 | 0 | pthread_mutex_lock(&fd->command_m); |
3470 | 0 | fd->command = SAM_CLOSE_DONE; |
3471 | 0 | pthread_cond_signal(&fd->command_c); |
3472 | 0 | pthread_mutex_unlock(&fd->command_m); |
3473 | |
|
3474 | 0 | if (l) { |
3475 | 0 | pthread_mutex_lock(&fd->lines_m); |
3476 | 0 | l->next = fd->lines; |
3477 | 0 | fd->lines = l; |
3478 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3479 | 0 | } |
3480 | 0 | free(line.s); |
3481 | |
|
3482 | 0 | return NULL; |
3483 | | |
3484 | 0 | err: |
3485 | 0 | sam_state_err(fd, errno ? errno : ENOMEM); |
3486 | 0 | hts_tpool_process_shutdown(fd->q); |
3487 | 0 | goto tidyup; |
3488 | 0 | } |
3489 | | |
// Runs in its own thread.
// Takes encoded blocks of SAM off the thread results queue and writes them
// to our output stream.
//
// vp is the htsFile being written; its state field is used as the SAM_state.
//
// Returns NULL once the result queue has been shut down and every block
// taken from it was written (the state error is set to 0 and the queue is
// shut down before returning).  Returns (void *)-1 on failure, after
// recording an error code via sam_state_err().
static void *sam_dispatcher_write(void *vp) {
    htsFile *fp = vp;
    SAM_state *fd = fp->state;
    hts_tpool_result *r;

    // Iterates until result queue is shutdown, where it returns NULL.
    while ((r = hts_tpool_next_result_wait(fd->q))) {
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
        if (!gl) {
            // A result with no data attached is reported as ENOMEM.
            sam_state_err(fd, ENOMEM);
            goto err;
        }

        if (fp->idx) {
            // Building an index: write one text line at a time so that each
            // record's file offset can be pushed to the index along with
            // the coordinates of the matching bam1_t in gl->bams.
            sp_bams *gb = gl->bams;
            int i = 0, count = 0;
            while (i < gl->data_size) {
                // [j, i) becomes the current line, including its '\n' when
                // one is present.
                int j = i;
                while (i < gl->data_size && gl->data[i] != '\n')
                    i++;
                if (i < gl->data_size)
                    i++;

                if (fp->is_bgzf) {
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
                        goto err;
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
                        goto err;
                } else {
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
                        goto err;
                }

                // The count'th text line corresponds to the count'th BAM
                // record; the assert after the loop checks the totals agree.
                bam1_t *b = &gb->bams[count++];
                if (fp->format.compression == bgzf) {
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
                                      b->core.tid, b->core.pos, bam_endpos(b),
                                      bgzf_tell(fp->fp.bgzf),
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                      bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                } else {
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                      bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                }
            }

            assert(count == gb->nbams);

            // Add bam array to free-list
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gl->bams;
            gl->bams = NULL;
            pthread_mutex_unlock(&fd->lines_m);
        } else {
            if (fp->is_bgzf) {
                // We keep track of how much in the current block we have
                // remaining => R.  We look for the last newline in input
                // [i] to [i+R], backwards => position N.
                //
                // If we find a newline, we write out bytes i to N.
                // We know we cannot fit the next record in this bgzf block,
                // so we flush what we have and copy input N to i+R into
                // the start of a new block, and recompute a new R for that.
                //
                // If we don't find a newline (i==N) then we cannot extend
                // the current block at all, so flush whatever is in it now
                // if it ends on a newline.
                // We still copy i(==N) to i+R to the next block and
                // continue as before with a new R.
                //
                // The only exception on the flush is when we run out of
                // data in the input.  In that case we skip it as we don't
                // yet know if the next record will fit.
                //
                // Both conditions share the same code here:
                // - Look for newline (pos N)
                // - Write i to N (which maybe 0)
                // - Flush if block ends on newline and not end of input
                // - write N to i+R

                int i = 0;
                BGZF *fb = fp->fp.bgzf;
                while (i < gl->data_size) {
                    // remaining space in block
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
                    int eod = 0;    // set when this pass consumes the rest of the input
                    if (R > gl->data_size-i)
                        R = gl->data_size-i, eod = 1;

                    // Find last newline in input data
                    int N = i + R;
                    while (--N > i) {
                        if (gl->data[N] == '\n')
                            break;
                    }

                    if (N != i) {
                        // Found a newline
                        N++;
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
                            goto err;
                    }

                    // Flush bgzf block
                    int b_off = fb->block_offset;
                    if (!eod && b_off &&
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
                            goto err;

                    // Copy from N onwards into next block
                    if (i+R > N)
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
                            != i+R - N)
                            goto err;

                    i = i+R;
                }
            } else {
                // Not BGZF: the whole text block is written in one call.
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
                    goto err;
            }
        }

        hts_tpool_delete_result(r, 0);

        // Also updated by main thread
        pthread_mutex_lock(&fd->lines_m);
        gl->next = fd->lines;
        fd->lines = gl;
        pthread_mutex_unlock(&fd->lines_m);
    }

    sam_state_err(fd, 0); // success
    hts_tpool_process_shutdown(fd->q);
    return NULL;

 err:
    // NOTE(review): neither r nor gl is released or recycled on this path;
    // confirm they are reclaimed when the state is destroyed.
    sam_state_err(fd, errno ? errno : EIO);
    return (void *)-1;
}
3644 | | |
3645 | | // Run from one of the worker threads. |
3646 | | // Convert a passed in array of BAMs (sp_bams) and converts to a block |
3647 | | // of text SAM records (sp_lines). |
3648 | 0 | static void *sam_format_worker(void *arg) { |
3649 | 0 | sp_bams *gb = (sp_bams *)arg; |
3650 | 0 | sp_lines *gl = NULL; |
3651 | 0 | int i; |
3652 | 0 | SAM_state *fd = gb->fd; |
3653 | 0 | htsFile *fp = fd->fp; |
3654 | | |
3655 | | // Use a block of SAM strings we had earlier if available. |
3656 | 0 | pthread_mutex_lock(&fd->lines_m); |
3657 | 0 | if (fd->lines) { |
3658 | 0 | gl = fd->lines; |
3659 | 0 | fd->lines = gl->next; |
3660 | 0 | } |
3661 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3662 | |
|
3663 | 0 | if (gl == NULL) { |
3664 | 0 | gl = calloc(1, sizeof(*gl)); |
3665 | 0 | if (!gl) { |
3666 | 0 | sam_state_err(fd, ENOMEM); |
3667 | 0 | return NULL; |
3668 | 0 | } |
3669 | 0 | gl->alloc = gl->data_size = 0; |
3670 | 0 | gl->data = NULL; |
3671 | 0 | } |
3672 | 0 | gl->serial = gb->serial; |
3673 | 0 | gl->next = NULL; |
3674 | |
|
3675 | 0 | kstring_t ks = {0, gl->alloc, gl->data}; |
3676 | |
|
3677 | 0 | for (i = 0; i < gb->nbams; i++) { |
3678 | 0 | if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) { |
3679 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3680 | 0 | goto err; |
3681 | 0 | } |
3682 | 0 | kputc('\n', &ks); |
3683 | 0 | } |
3684 | | |
3685 | 0 | pthread_mutex_lock(&fd->lines_m); |
3686 | 0 | gl->data_size = ks.l; |
3687 | 0 | gl->alloc = ks.m; |
3688 | 0 | gl->data = ks.s; |
3689 | |
|
3690 | 0 | if (fp->idx) { |
3691 | | // Keep hold of the bam array a little longer as |
3692 | | // sam_dispatcher_write needs to use them for building the index. |
3693 | 0 | gl->bams = gb; |
3694 | 0 | } else { |
3695 | | // Add bam array to free-list |
3696 | 0 | gb->next = fd->bams; |
3697 | 0 | fd->bams = gb; |
3698 | 0 | } |
3699 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3700 | |
|
3701 | 0 | return gl; |
3702 | | |
3703 | 0 | err: |
3704 | | // Possible race between this and fd->curr_bam. |
3705 | | // Easier to not free and leave it on the input list so it |
3706 | | // gets freed there instead? |
3707 | | // sam_free_sp_bams(gb); |
3708 | 0 | if (gl) { |
3709 | 0 | free(gl->data); |
3710 | 0 | free(gl); |
3711 | 0 | } |
3712 | 0 | return NULL; |
3713 | 0 | } |
3714 | | |
3715 | 0 | int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { |
3716 | 0 | if (fp->state) |
3717 | 0 | return 0; |
3718 | | |
3719 | 0 | if (!(fp->state = sam_state_create(fp))) |
3720 | 0 | return -1; |
3721 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3722 | |
|
3723 | 0 | pthread_mutex_init(&fd->lines_m, NULL); |
3724 | 0 | pthread_mutex_init(&fd->command_m, NULL); |
3725 | 0 | pthread_cond_init(&fd->command_c, NULL); |
3726 | 0 | fd->p = p->pool; |
3727 | 0 | int qsize = p->qsize; |
3728 | 0 | if (!qsize) |
3729 | 0 | qsize = 2*hts_tpool_size(fd->p); |
3730 | 0 | fd->q = hts_tpool_process_init(fd->p, qsize, 0); |
3731 | 0 | if (!fd->q) { |
3732 | 0 | sam_state_destroy(fp); |
3733 | 0 | return -1; |
3734 | 0 | } |
3735 | | |
3736 | 0 | if (fp->format.compression == bgzf) |
3737 | 0 | return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); |
3738 | | |
3739 | 0 | return 0; |
3740 | 0 | } |
3741 | | |
3742 | 0 | int sam_set_threads(htsFile *fp, int nthreads) { |
3743 | 0 | if (nthreads <= 0) |
3744 | 0 | return 0; |
3745 | | |
3746 | 0 | htsThreadPool p; |
3747 | 0 | p.pool = hts_tpool_init(nthreads); |
3748 | 0 | p.qsize = nthreads*2; |
3749 | |
|
3750 | 0 | int ret = sam_set_thread_pool(fp, &p); |
3751 | 0 | if (ret < 0) |
3752 | 0 | return ret; |
3753 | | |
3754 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3755 | 0 | fd->own_pool = 1; |
3756 | |
|
3757 | 0 | return 0; |
3758 | 0 | } |
3759 | | |
// Number of entries in the fastq_state UMI tag list.
#define UMI_TAGS 5
// Reader state for FASTA/FASTQ input, kept in the htsFile state pointer.
// Created by fastq_state_init(), configured through fastq_state_set() and
// released by fastq_state_destroy().
typedef struct {
    kstring_t name;         // name line of the current record
    kstring_t comment;      // NB: pointer into name, do not free
    kstring_t seq;          // accumulated sequence lines
    kstring_t qual;         // accumulated quality lines
    int casava;             // set by FASTQ_OPT_CASAVA: parse CASAVA comments
    int aux;                // set by FASTQ_OPT_AUX: parse aux tags in comments
    int rnum;               // set by FASTQ_OPT_RNUM
    char BC[3];             // aux tag ID for barcode
    char UMI[UMI_TAGS][3];  // aux tag list for UMIs.
    khash_t(tag) *tags;     // which aux tags to use (if empty, use all).
    char nprefix;           // expected first character of a name line
    int sra_names;          // set by FASTQ_OPT_NAME2
    regex_t regex;          // compiled pattern used to find the UMI in a name
} fastq_state;
3776 | | |
3777 | | // Initialise fastq state. |
3778 | | // Name char of '@' or '>' distinguishes fastq vs fasta variant |
3779 | 648 | static fastq_state *fastq_state_init(int name_char) { |
3780 | 648 | fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); |
3781 | 648 | if (!x) |
3782 | 0 | return NULL; |
3783 | 648 | strcpy(x->BC, "BC"); |
3784 | 648 | x->nprefix = name_char; |
3785 | | // Default Illumina naming convention |
3786 | 648 | char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)"; |
3787 | 648 | if (regcomp(&x->regex, re, REG_EXTENDED) != 0) { |
3788 | 0 | free(x); |
3789 | 0 | return NULL; |
3790 | 0 | } |
3791 | | |
3792 | 648 | return x; |
3793 | 648 | } |
3794 | | |
3795 | 864 | void fastq_state_destroy(htsFile *fp) { |
3796 | 864 | if (fp->state) { |
3797 | 648 | fastq_state *x = (fastq_state *)fp->state; |
3798 | 648 | if (x->tags) |
3799 | 0 | kh_destroy(tag, x->tags); |
3800 | 648 | ks_free(&x->name); |
3801 | 648 | ks_free(&x->seq); |
3802 | 648 | ks_free(&x->qual); |
3803 | 648 | regfree(&x->regex); |
3804 | 648 | free(fp->state); |
3805 | 648 | } |
3806 | 864 | } |
3807 | | |
3808 | 0 | int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { |
3809 | 0 | va_list args; |
3810 | |
|
3811 | 0 | if (!fp) |
3812 | 0 | return -1; |
3813 | 0 | if (!fp->state) |
3814 | 0 | if (!(fp->state = fastq_state_init(fp->format.format == fastq_format |
3815 | 0 | ? '@' : '>'))) |
3816 | 0 | return -1; |
3817 | | |
3818 | 0 | fastq_state *x = (fastq_state *)fp->state; |
3819 | |
|
3820 | 0 | switch (opt) { |
3821 | 0 | case FASTQ_OPT_CASAVA: |
3822 | 0 | x->casava = 1; |
3823 | 0 | break; |
3824 | | |
3825 | 0 | case FASTQ_OPT_NAME2: |
3826 | 0 | x->sra_names = 1; |
3827 | 0 | break; |
3828 | | |
3829 | 0 | case FASTQ_OPT_AUX: { |
3830 | 0 | va_start(args, opt); |
3831 | 0 | x->aux = 1; |
3832 | 0 | char *tag = va_arg(args, char *); |
3833 | 0 | va_end(args); |
3834 | 0 | if (tag && strcmp(tag, "1") != 0) { |
3835 | 0 | if (!x->tags) |
3836 | 0 | if (!(x->tags = kh_init(tag))) |
3837 | 0 | return -1; |
3838 | | |
3839 | 0 | size_t i, tlen = strlen(tag); |
3840 | 0 | for (i = 0; i+3 <= tlen+1; i += 3) { |
3841 | 0 | if (tag[i+0] == ',' || tag[i+1] == ',' || |
3842 | 0 | !(tag[i+2] == ',' || tag[i+2] == '\0')) { |
3843 | 0 | hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); |
3844 | 0 | break; |
3845 | 0 | } |
3846 | 0 | int ret, tcode = tag[i+0]*256 + tag[i+1]; |
3847 | 0 | kh_put(tag, x->tags, tcode, &ret); |
3848 | 0 | if (ret < 0) |
3849 | 0 | return -1; |
3850 | 0 | } |
3851 | 0 | } |
3852 | 0 | break; |
3853 | 0 | } |
3854 | | |
3855 | 0 | case FASTQ_OPT_BARCODE: { |
3856 | 0 | va_start(args, opt); |
3857 | 0 | char *bc = va_arg(args, char *); |
3858 | 0 | va_end(args); |
3859 | 0 | strncpy(x->BC, bc, 2); |
3860 | 0 | x->BC[2] = 0; |
3861 | 0 | break; |
3862 | 0 | } |
3863 | | |
3864 | 0 | case FASTQ_OPT_UMI: { |
3865 | | // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0 |
3866 | 0 | va_start(args, opt); |
3867 | 0 | char *bc = va_arg(args, char *), *bc_orig = bc; |
3868 | 0 | va_end(args); |
3869 | 0 | if (!bc || strcmp(bc, "1") == 0) |
3870 | 0 | bc = "RX"; |
3871 | 0 | int ntags = 0, err = 0; |
3872 | 0 | for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) { |
3873 | 0 | if (!isalpha(bc[0]) || !isalnum_c(bc[1])) { |
3874 | 0 | err = 1; |
3875 | 0 | break; |
3876 | 0 | } |
3877 | | |
3878 | 0 | strncpy(x->UMI[ntags], bc, 3); |
3879 | 0 | bc += 2; |
3880 | 0 | if (*bc && *bc != ',') { |
3881 | 0 | err = 1; |
3882 | 0 | break; |
3883 | 0 | } |
3884 | 0 | bc+=(*bc==','); |
3885 | 0 | x->UMI[ntags][2] = 0; |
3886 | 0 | } |
3887 | 0 | for (; ntags < UMI_TAGS; ntags++) |
3888 | 0 | x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0; |
3889 | | |
3890 | |
|
3891 | 0 | if (err) |
3892 | 0 | hts_log_warning("Bad UMI tag list '%s'", bc_orig); |
3893 | |
|
3894 | 0 | break; |
3895 | 0 | } |
3896 | | |
3897 | 0 | case FASTQ_OPT_UMI_REGEX: { |
3898 | 0 | va_start(args, opt); |
3899 | 0 | char *re = va_arg(args, char *); |
3900 | 0 | va_end(args); |
3901 | |
|
3902 | 0 | regfree(&x->regex); |
3903 | 0 | if (regcomp(&x->regex, re, REG_EXTENDED) != 0) { |
3904 | 0 | hts_log_error("Regular expression '%s' is not supported", re); |
3905 | 0 | return -1; |
3906 | 0 | } |
3907 | 0 | break; |
3908 | 0 | } |
3909 | | |
3910 | 0 | case FASTQ_OPT_RNUM: |
3911 | 0 | x->rnum = 1; |
3912 | 0 | break; |
3913 | | |
3914 | 0 | default: |
3915 | 0 | break; |
3916 | 0 | } |
3917 | 0 | return 0; |
3918 | 0 | } |
3919 | | |
// Reads one FASTA or FASTQ record from fp and stores it in b as an
// unmapped BAM record.
//
// fp->state must already hold a fastq_state.  The name line is split into
// read name and comment; sequence (and, for fastq, quality) lines are
// accumulated until the next record marker.  A trailing "/<digit>" on the
// name, an Illumina-style UMI inside the name, a CASAVA comment and SAM
// style aux tags in the comment are interpreted according to the options
// held in the state.
//
// Returns >= 0 on success, -1 on EOF, and a value below -1 on error
// (-2 for malformed input or allocation failure, or the hts_getline()
// error code when reading the name line fails).
static int fastq_parse1(htsFile *fp, bam1_t *b) {
    fastq_state *x = (fastq_state *)fp->state;
    size_t i, l;
    int ret = 0;

    if (fp->format.format == fasta_format && fp->line.s) {
        // For FASTA we've already read the >name line; steal it
        // Not the most efficient, but we don't optimise for fasta reading.
        if (fp->line.l == 0)
            return -1; // EOF

        free(x->name.s);
        x->name = fp->line;
        fp->line.l = fp->line.m = 0;
        fp->line.s = NULL;
    } else {
        // Read a FASTQ format entry.
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
        if (ret == -1)
            return -1;  // EOF
        else if (ret < -1)
            return ret; // ERR
    }

    // Name
    if (*x->name.s != x->nprefix)
        return -2;

    // Reverse the SRA strangeness of putting the run_name.number before
    // the read name.
    i = 0;
    char *name = x->name.s+1;
    if (x->sra_names) {
        char *cp = strpbrk(x->name.s, " \t");
        if (cp) {
            while (*cp == ' ' || *cp == '\t')
                cp++;
            // Overwrite the last whitespace character with a marker so the
            // name scan below restarts from here.
            *--cp = '@';
            i = cp - x->name.s;
            name = cp+1;
        }
    }

    // Terminate the name at the first whitespace character.
    l = x->name.l;
    char *s = x->name.s;
    while (i < l && !isspace_c(s[i]))
        i++;
    if (i < l) {
        s[i] = 0;
        x->name.l = i++;
    }

    // Comment; a kstring struct, but pointer into name line.  (Do not free)
    while (i < l && isspace_c(s[i]))
        i++;
    x->comment.s = s+i;
    x->comment.l = l - i;

    // Seq
    x->seq.l = 0;
    for (;;) {
        // EOF (-1) here is an error for fastq; for fasta it ends the record.
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
            if (fp->format.format == fastq_format || ret < -1)
                return -2;
        if (ret == -1 ||
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
            break;
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
            return -2;
    }

    // Qual
    if (fp->format.format == fastq_format) {
        // Keep reading lines until as many quality values as bases have
        // been collected; more than that is an error.
        size_t remainder = x->seq.l;
        x->qual.l = 0;
        do {
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
                return -2;
            if (fp->line.l > remainder)
                return -2;
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
                return -2;
            remainder -= fp->line.l;
        } while (remainder > 0);

        // Decr qual
        for (i = 0; i < x->qual.l; i++)
            x->qual.s[i] -= '!';
    }

    // A name ending in "/<digit>" marks a paired read; strip the suffix
    // and record the read number in the flags.
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
    if (x->name.l > 2 &&
        x->name.s[x->name.l-2] == '/' &&
        isdigit_c(x->name.s[x->name.l-1])) {
        switch(x->name.s[x->name.l-1]) {
        case '1': flag |= BAM_FREAD1 | pflag; break;
        case '2': flag |= BAM_FREAD2 | pflag; break;
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }
        x->name.s[x->name.l-=2] = 0;
    }

    // Strip Illumina formatted UMI off read-name
    char UMI_seq[256]; // maximum length in spec
    size_t UMI_len = 0;
    if (x->UMI[0][0]) {
        regmatch_t match[3];
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
            && match[0].rm_so >= 0     // whole regex
            && match[1].rm_so >= 0) {  // bracketted UMI component
            UMI_len = match[1].rm_eo - match[1].rm_so;
            if (UMI_len > 255) {
                hts_log_error("SAM read name is too long");
                return -2;
            }

            // The SAMTags spec recommends (but not requires) separating
            // barcodes with hyphen ('-').
            size_t i;
            for (i = 0; i < UMI_len; i++)
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
                    ? x->name.s[i+match[1].rm_so]
                    : '-';

            // Move any trailing #num earlier in the name
            if (UMI_len) {
                // From here UMI_len counts the terminating NUL too.
                UMI_seq[UMI_len++] = 0;

                x->name.l = match[1].rm_so;
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
                    x->name.l--; // remove colon too
                char *cp = x->name.s + match[1].rm_eo;
                while (*cp)
                    x->name.s[x->name.l++] = *cp++;
                x->name.s[x->name.l] = 0;
            }
        }
    }

    // Convert to BAM
    ret = bam_set1(b,
                   x->name.s + x->name.l - name, name,
                   flag,
                   -1, -1, 0, // ref '*', pos, mapq,
                   0, NULL,     // no cigar,
                   -1, -1, 0,    // mate
                   x->seq.l, x->seq.s, x->qual.s,
                   0);
    if (ret < 0) return -2;

    // Add UMI tag if removed from read-name above
    if (UMI_len) {
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
            ret = -2;
    }

    // Identify Illumina CASAVA strings.
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
    char *barcode = NULL;
    int barcode_len = 0;
    kstring_t *kc = &x->comment;
    char *endptr;
    if (x->casava &&
        // \d:[YN]:\d+:[ACGTN]+
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
        && *endptr == ':') {

        // read num
        switch(kc->s[0]) {
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }

        if (kc->s[2] == 'Y')
            b->core.flag |= BAM_FQCFAIL;

        // Barcode, maybe numeric in which case we skip it
        if (!isdigit_c(endptr[1])) {
            barcode = endptr+1;
            for (i = barcode - kc->s; i < kc->l; i++)
                if (isspace_c(kc->s[i]))
                    break;

            // Terminate the barcode in place; barcode_len includes the NUL.
            kc->s[i] = 0;
            barcode_len = i+1-(barcode - kc->s);
        }
    }

    if (ret >= 0 && barcode_len)
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
            ret = -2;

    if (!x->aux)
        return ret;

    // Identify any SAM style aux tags in comments too.
    // NOTE(review): barcode_len is the barcode's length, not its offset
    // within the comment, so this does not start just past the barcode;
    // confirm that this start position is what aux_parse() expects.
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
        ret = -2;

    return ret;
}
4123 | | |
4124 | | // Internal component of sam_read1 below |
4125 | 209 | static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
4126 | 209 | int ret = bam_read1(fp->fp.bgzf, b); |
4127 | 209 | if (h && ret >= 0) { |
4128 | 187 | if (b->core.tid >= h->n_targets || b->core.tid < -1 || |
4129 | 185 | b->core.mtid >= h->n_targets || b->core.mtid < -1) { |
4130 | 2 | errno = ERANGE; |
4131 | 2 | return -3; |
4132 | 2 | } |
4133 | 187 | } |
4134 | 207 | return ret; |
4135 | 209 | } |
4136 | | |
4137 | | // Internal component of sam_read1 below |
4138 | 530 | static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { |
4139 | 530 | int ret = cram_get_bam_seq(fp->fp.cram, b); |
4140 | 530 | if (ret < 0) |
4141 | 530 | return cram_eof(fp->fp.cram) ? -1 : -2; |
4142 | | |
4143 | 0 | if (bam_tag2cigar(*b, 1, 1) < 0) |
4144 | 0 | return -2; |
4145 | | |
4146 | 0 | return ret; |
4147 | 0 | } |
4148 | | |
// Internal component of sam_read1 below
// Reads one text SAM record into b.
//
// Without an attached state, a line is read and parsed directly.  With a
// state attached (multi-threaded decoding), parsed records are taken in
// batches from the result queue and copied out one per call; the reader
// thread is started on the first call, once a header is available.
//
// Returns 0 (or the sam_parse1() result) on success, -1 on EOF and a value
// below -1 on error.
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
    int ret;

    // Consume 1st line after header parsing as it wasn't using peek
    if (fp->line.l != 0) {
        ret = sam_parse1(&fp->line, h, b);
        fp->line.l = 0;
        return ret;
    }

    if (fp->state) {
        SAM_state *fd = (SAM_state *)fp->state;

        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
            // We don't support multi-threaded SAM parsing with seeks yet.
            // Tear the threaded state down, reposition the stream and
            // continue with the single-threaded path below.
            int ret;
            if ((ret = sam_state_destroy(fp)) < 0) {
                errno = -ret;
                return -2;
            }
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
                return -2;
            fp->fp.bgzf->seeked = 0;
            goto err_recover;
        }

        if (!fd->h) {
            // First call: remember the header (taking a reference).
            fd->h = h;
            fd->h->ref_count++;
            // Ensure hrecs is initialised now as we don't want multiple
            // threads trying to do this simultaneously.
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
                return -2;

            // We can only do this once we've got a header
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
                               fp) != 0)
                return -2;
            fd->dispatcher_set = 1;
        }

        if (fd->h != h) {
            hts_log_error("SAM multi-threaded decoding does not support changing header");
            return -2;
        }

        sp_bams *gb = fd->curr_bam;
        if (!gb) {
            // No batch in progress: fetch the next one from the queue.
            if (fd->errcode) {
                // In case reader failed
                errno = fd->errcode;
                return -2;
            }

            pthread_mutex_lock(&fd->command_m);
            int cmd = fd->command;
            pthread_mutex_unlock(&fd->command_m);
            if (cmd == SAM_AT_EOF)
                return -1;

            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
            if (!r)
                return -2;
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
            hts_tpool_delete_result(r, 0);
        }
        if (!gb) {
            // A result carrying no batch marks the end of the input;
            // remember it so later calls return immediately.
            pthread_mutex_lock(&fd->command_m);
            fd->command = SAM_AT_EOF;
            pthread_mutex_unlock(&fd->command_m);
            return fd->errcode ? -2 : -1;
        }
        bam1_t *b_array = (bam1_t *)gb->bams;
        if (fd->curr_idx < gb->nbams)
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
                return -2;
        if (fd->curr_idx == gb->nbams) {
            // Batch exhausted: hand it back to the free list.
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gb;
            pthread_mutex_unlock(&fd->lines_m);

            fd->curr_bam = NULL;
            fd->curr_idx = 0;
            // Consider prefetching next record?  I.e.
            // } else {
            //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
        }

        ret = 0;

    } else {
    // Also entered from the seek fallback above and when a bad line is
    // being skipped.
    err_recover:
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
        if (ret < 0) return ret;

        ret = sam_parse1(&fp->line, h, b);
        fp->line.l = 0;
        if (ret < 0) {
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
            // With ignore_sam_err set in the header, skip to the next line.
            if (h && h->ignore_sam_err) goto err_recover;
        }
    }

    return ret;
}
4256 | | |
4257 | | // Returns 0 on success, |
4258 | | // -1 on EOF, |
4259 | | // <-1 on error |
4260 | | int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) |
4261 | 11.3M | { |
4262 | 11.3M | int ret, pass_filter; |
4263 | | |
4264 | 11.3M | do { |
4265 | 11.3M | switch (fp->format.format) { |
4266 | 209 | case bam: |
4267 | 209 | ret = sam_read1_bam(fp, h, b); |
4268 | 209 | break; |
4269 | | |
4270 | 530 | case cram: |
4271 | 530 | ret = sam_read1_cram(fp, h, &b); |
4272 | 530 | break; |
4273 | | |
4274 | 92.8k | case sam: |
4275 | 92.8k | ret = sam_read1_sam(fp, h, b); |
4276 | 92.8k | break; |
4277 | | |
4278 | 11.3M | case fasta_format: |
4279 | 11.3M | case fastq_format: { |
4280 | 11.3M | fastq_state *x = (fastq_state *)fp->state; |
4281 | 11.3M | if (!x) { |
4282 | 648 | if (!(fp->state = fastq_state_init(fp->format.format |
4283 | 648 | == fastq_format ? '@' : '>'))) |
4284 | 0 | return -2; |
4285 | 648 | } |
4286 | | |
4287 | 11.3M | return fastq_parse1(fp, b); |
4288 | 11.3M | } |
4289 | | |
4290 | 0 | case empty_format: |
4291 | 0 | errno = EPIPE; |
4292 | 0 | return -3; |
4293 | | |
4294 | 0 | default: |
4295 | 0 | errno = EFTYPE; |
4296 | 0 | return -3; |
4297 | 11.3M | } |
4298 | | |
4299 | 93.5k | pass_filter = (ret >= 0 && fp->filter) |
4300 | 93.5k | ? sam_passes_filter(h, b, fp->filter) |
4301 | 93.5k | : 1; |
4302 | 93.5k | } while (pass_filter == 0); |
4303 | | |
4304 | 93.5k | return pass_filter < 0 ? -2 : ret; |
4305 | 11.3M | } |
4306 | | |
// Writes b[i]+33 to a[i] for each of the first len bytes.
// A len of zero or less writes nothing: the index is signed like len, so a
// negative length cannot be converted into a huge unsigned count.
//
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
// this code isn't vectorised and runs far slower than is necessary.
static inline void HTS_OPT3
add33(uint8_t *a, const uint8_t * b, int32_t len) {
    int32_t i;
    for (i = 0; i < len; i++)
        a[i] = b[i]+33;
}
4316 | | |
4317 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4318 | 3.79M | { |
4319 | 3.79M | int i, r = 0; |
4320 | 3.79M | uint8_t *s, *end; |
4321 | 3.79M | const bam1_core_t *c = &b->core; |
4322 | | |
4323 | 3.79M | if (c->l_qname == 0) |
4324 | 0 | return -1; |
4325 | 3.79M | r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); |
4326 | 3.79M | r |= kputc_('\t', str); // query name |
4327 | 3.79M | r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag |
4328 | 3.79M | if (c->tid >= 0) { // chr |
4329 | 21.0k | r |= kputs(h->target_name[c->tid] , str); |
4330 | 21.0k | r |= kputc_('\t', str); |
4331 | 3.77M | } else r |= kputsn_("*\t", 2, str); |
4332 | 3.79M | r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos |
4333 | 3.79M | r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual |
4334 | 3.79M | if (c->n_cigar) { // cigar |
4335 | 25.7k | uint32_t *cigar = bam_get_cigar(b); |
4336 | 683k | for (i = 0; i < c->n_cigar; ++i) { |
4337 | 657k | r |= kputw(bam_cigar_oplen(cigar[i]), str); |
4338 | 657k | r |= kputc_(bam_cigar_opchr(cigar[i]), str); |
4339 | 657k | } |
4340 | 3.77M | } else r |= kputc_('*', str); |
4341 | 3.79M | r |= kputc_('\t', str); |
4342 | 3.79M | if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr |
4343 | 520 | else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str); |
4344 | 447 | else { |
4345 | 447 | r |= kputs(h->target_name[c->mtid], str); |
4346 | 447 | r |= kputc_('\t', str); |
4347 | 447 | } |
4348 | 3.79M | r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos |
4349 | 3.79M | r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len |
4350 | 3.79M | if (c->l_qseq) { // seq and qual |
4351 | 113k | uint8_t *s = bam_get_seq(b); |
4352 | 113k | if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; |
4353 | 113k | char *cp = str->s + str->l; |
4354 | | |
4355 | | // Sequence, 2 bases at a time |
4356 | 113k | nibble2base(s, cp, c->l_qseq); |
4357 | 113k | cp[c->l_qseq] = '\t'; |
4358 | 113k | cp += c->l_qseq+1; |
4359 | | |
4360 | | // Quality |
4361 | 113k | s = bam_get_qual(b); |
4362 | 113k | i = 0; |
4363 | 113k | if (s[0] == 0xff) { |
4364 | 113k | cp[i++] = '*'; |
4365 | 113k | } else { |
4366 | 265 | add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33; |
4367 | 265 | i = c->l_qseq; |
4368 | 265 | } |
4369 | 113k | cp[i] = 0; |
4370 | 113k | cp += i; |
4371 | 113k | str->l = cp - str->s; |
4372 | 3.68M | } else r |= kputsn_("*\t*", 3, str); |
4373 | | |
4374 | 3.79M | s = bam_get_aux(b); // aux |
4375 | 3.79M | end = b->data + b->l_data; |
4376 | | |
4377 | 4.31M | while (end - s >= 4) { |
4378 | 517k | r |= kputc_('\t', str); |
4379 | 517k | if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL) |
4380 | 7 | goto bad_aux; |
4381 | 517k | } |
4382 | 3.79M | r |= kputsn("", 0, str); // nul terminate |
4383 | 3.79M | if (r < 0) goto mem_err; |
4384 | | |
4385 | 3.79M | return str->l; |
4386 | | |
4387 | 7 | bad_aux: |
4388 | 7 | hts_log_error("Corrupted aux data for read %.*s flag %d", |
4389 | 7 | b->core.l_qname, bam_get_qname(b), b->core.flag); |
4390 | 7 | errno = EINVAL; |
4391 | 7 | return -1; |
4392 | | |
4393 | 0 | mem_err: |
4394 | 0 | hts_log_error("Out of memory"); |
4395 | 0 | errno = ENOMEM; |
4396 | 0 | return -1; |
4397 | 3.79M | } |
4398 | | |
4399 | | int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4400 | 3.79M | { |
4401 | 3.79M | str->l = 0; |
4402 | 3.79M | return sam_format1_append(h, b, str); |
4403 | 3.79M | } |
4404 | | |
4405 | | static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); |
4406 | | int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) |
4407 | 0 | { |
4408 | 0 | unsigned flag = b->core.flag; |
4409 | 0 | int i, e = 0, len = b->core.l_qseq; |
4410 | 0 | uint8_t *seq, *qual; |
4411 | |
|
4412 | 0 | str->l = 0; |
4413 | | |
4414 | | // Name |
4415 | 0 | if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) |
4416 | 0 | return -1; |
4417 | | |
4418 | | // UMI tag |
4419 | 0 | if (x && *x->UMI[0]) { |
4420 | | // Temporary copy of '#num' if present |
4421 | 0 | char plex[256]; |
4422 | 0 | size_t len = str->l; |
4423 | 0 | while (len && str->s[len] != ':' && str->s[len] != '#') |
4424 | 0 | len--; |
4425 | |
|
4426 | 0 | if (str->s[len] == '#' && str->l - len < 255) { |
4427 | 0 | memcpy(plex, &str->s[len], str->l - len); |
4428 | 0 | plex[str->l - len] = 0; |
4429 | 0 | str->l = len; |
4430 | 0 | } else { |
4431 | 0 | *plex = 0; |
4432 | 0 | } |
4433 | |
|
4434 | 0 | uint8_t *bc = NULL; |
4435 | 0 | int n; |
4436 | 0 | for (n = 0; !bc && n < UMI_TAGS; n++) |
4437 | 0 | bc = bam_aux_get(b, x->UMI[n]); |
4438 | 0 | if (bc && *bc == 'Z') { |
4439 | 0 | int err = kputc(':', str) < 0; |
4440 | | // Replace any non-alpha with '+' |
4441 | 0 | while (*++bc) |
4442 | 0 | err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0; |
4443 | 0 | if (err) |
4444 | 0 | return -1; |
4445 | 0 | } |
4446 | | |
4447 | 0 | if (*plex && kputs(plex, str) < 0) |
4448 | 0 | return -1; |
4449 | 0 | } |
4450 | | |
4451 | | // /1 or /2 suffix |
4452 | 0 | if (x && x->rnum && (flag & BAM_FPAIRED)) { |
4453 | 0 | int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); |
4454 | 0 | if (r12 == BAM_FREAD1) { |
4455 | 0 | if (kputs("/1", str) == EOF) |
4456 | 0 | return -1; |
4457 | 0 | } else if (r12 == BAM_FREAD2) { |
4458 | 0 | if (kputs("/2", str) == EOF) |
4459 | 0 | return -1; |
4460 | 0 | } |
4461 | 0 | } |
4462 | | |
4463 | | // Illumina CASAVA tag. |
4464 | | // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero> |
4465 | 0 | if (x && x->casava) { |
4466 | 0 | int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; |
4467 | 0 | char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; |
4468 | 0 | uint8_t *bc = bam_aux_get(b, x->BC); |
4469 | 0 | if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, |
4470 | 0 | bc ? (char *)bc+1 : "0") < 0) |
4471 | 0 | return -1; |
4472 | | |
4473 | 0 | if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { |
4474 | 0 | hts_log_warning("BC tag starts with non-sequence base; using '0'"); |
4475 | 0 | str->l -= strlen((char *)bc)-2; // limit to 1 char |
4476 | 0 | str->s[str->l-1] = '0'; |
4477 | 0 | str->s[str->l] = 0; |
4478 | 0 | bc = NULL; |
4479 | 0 | } |
4480 | | |
4481 | | // Replace any non-alpha with '+'. Ie seq-seq to seq+seq |
4482 | 0 | if (bc) { |
4483 | 0 | int l = strlen((char *)bc+1); |
4484 | 0 | char *c = (char *)str->s + str->l - l; |
4485 | 0 | for (i = 0; i < l; i++) { |
4486 | 0 | if (!isalpha_c(c[i])) |
4487 | 0 | c[i] = '+'; |
4488 | 0 | else if (islower_c(c[i])) |
4489 | 0 | c[i] = toupper_c(c[i]); |
4490 | 0 | } |
4491 | 0 | } |
4492 | 0 | } |
4493 | | |
4494 | | // Aux tags |
4495 | 0 | if (x && x->aux) { |
4496 | 0 | uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; |
4497 | 0 | while (s && end - s >= 4) { |
4498 | 0 | int tt = s[0]*256 + s[1]; |
4499 | 0 | if (x->tags == NULL || |
4500 | 0 | kh_get(tag, x->tags, tt) != kh_end(x->tags)) { |
4501 | 0 | e |= kputc_('\t', str) < 0; |
4502 | 0 | if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) |
4503 | 0 | return -1; |
4504 | 0 | } else { |
4505 | 0 | s = skip_aux(s+2, end); |
4506 | 0 | } |
4507 | 0 | } |
4508 | 0 | e |= kputsn("", 0, str) < 0; // nul terminate |
4509 | 0 | } |
4510 | | |
4511 | 0 | if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; |
4512 | 0 | e |= kputc_('\n', str) < 0; |
4513 | | |
4514 | | // Seq line |
4515 | 0 | seq = bam_get_seq(b); |
4516 | 0 | if (flag & BAM_FREVERSE) |
4517 | 0 | for (i = len-1; i >= 0; i--) |
4518 | 0 | e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; |
4519 | 0 | else |
4520 | 0 | for (i = 0; i < len; i++) |
4521 | 0 | e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; |
4522 | | |
4523 | | |
4524 | | // Qual line |
4525 | 0 | if (x->nprefix == '@') { |
4526 | 0 | kputsn("\n+\n", 3, str); |
4527 | 0 | qual = bam_get_qual(b); |
4528 | 0 | if (qual[0] == 0xff) |
4529 | 0 | for (i = 0; i < len; i++) |
4530 | 0 | e |= kputc_('B', str) < 0; |
4531 | 0 | else if (flag & BAM_FREVERSE) |
4532 | 0 | for (i = len-1; i >= 0; i--) |
4533 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4534 | 0 | else |
4535 | 0 | for (i = 0; i < len; i++) |
4536 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4537 | |
|
4538 | 0 | } |
4539 | 0 | e |= kputc('\n', str) < 0; |
4540 | |
|
4541 | 0 | return e ? -1 : str->l; |
4542 | 0 | } |
4543 | | |
4544 | | // Sadly we need to be able to modify the bam_hdr here so we can |
4545 | | // reference count the structure. |
4546 | | int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) |
4547 | 11.3M | { |
4548 | 11.3M | switch (fp->format.format) { |
4549 | 0 | case binary_format: |
4550 | 0 | fp->format.category = sequence_data; |
4551 | 0 | fp->format.format = bam; |
4552 | | /* fall-through */ |
4553 | 3.79M | case bam: |
4554 | 3.79M | return bam_write_idx1(fp, h, b); |
4555 | | |
4556 | 3.79M | case cram: |
4557 | 3.79M | return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); |
4558 | | |
4559 | 0 | case text_format: |
4560 | 0 | fp->format.category = sequence_data; |
4561 | 0 | fp->format.format = sam; |
4562 | | /* fall-through */ |
4563 | 3.79M | case sam: |
4564 | 3.79M | if (fp->state) { |
4565 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4566 | | |
4567 | | // Threaded output |
4568 | 0 | if (!fd->h) { |
4569 | | // NB: discard const. We don't actually modify sam_hdr_t here, |
4570 | | // just data pointed to by it (which is a bit weasely still), |
4571 | | // but out cached pointer must be non-const as we want to |
4572 | | // destroy it later on and sam_hdr_destroy takes non-const. |
4573 | | // |
4574 | | // We do this because some tools do sam_hdr_destroy; sam_close |
4575 | | // while others do sam_close; sam_hdr_destroy. The former is |
4576 | | // an issue as we need the header still when flushing. |
4577 | 0 | fd->h = (sam_hdr_t *)h; |
4578 | 0 | fd->h->ref_count++; |
4579 | |
|
4580 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, |
4581 | 0 | fp) != 0) |
4582 | 0 | return -2; |
4583 | 0 | fd->dispatcher_set = 1; |
4584 | 0 | } |
4585 | | |
4586 | 0 | if (fd->h != h) { |
4587 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4588 | 0 | return -2; |
4589 | 0 | } |
4590 | | |
4591 | | // Find a suitable BAM array to copy to |
4592 | 0 | sp_bams *gb = fd->curr_bam; |
4593 | 0 | if (!gb) { |
4594 | 0 | pthread_mutex_lock(&fd->lines_m); |
4595 | 0 | if (fd->bams) { |
4596 | 0 | fd->curr_bam = gb = fd->bams; |
4597 | 0 | fd->bams = gb->next; |
4598 | 0 | gb->next = NULL; |
4599 | 0 | gb->nbams = 0; |
4600 | 0 | gb->bam_mem = 0; |
4601 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4602 | 0 | } else { |
4603 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4604 | 0 | if (!(gb = calloc(1, sizeof(*gb)))) return -1; |
4605 | 0 | if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { |
4606 | 0 | free(gb); |
4607 | 0 | return -1; |
4608 | 0 | } |
4609 | 0 | gb->nbams = 0; |
4610 | 0 | gb->abams = SAM_NBAM; |
4611 | 0 | gb->bam_mem = 0; |
4612 | 0 | gb->fd = fd; |
4613 | 0 | fd->curr_idx = 0; |
4614 | 0 | fd->curr_bam = gb; |
4615 | 0 | } |
4616 | 0 | } |
4617 | | |
4618 | 0 | if (!bam_copy1(&gb->bams[gb->nbams++], b)) |
4619 | 0 | return -2; |
4620 | 0 | gb->bam_mem += b->l_data + sizeof(*b); |
4621 | | |
4622 | | // Dispatch if full |
4623 | 0 | if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { |
4624 | 0 | gb->serial = fd->serial++; |
4625 | 0 | pthread_mutex_lock(&fd->command_m); |
4626 | 0 | if (fd->errcode != 0) { |
4627 | 0 | pthread_mutex_unlock(&fd->command_m); |
4628 | 0 | return -fd->errcode; |
4629 | 0 | } |
4630 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb, |
4631 | 0 | cleanup_sp_bams, |
4632 | 0 | cleanup_sp_lines, 0) < 0) { |
4633 | 0 | pthread_mutex_unlock(&fd->command_m); |
4634 | 0 | return -1; |
4635 | 0 | } |
4636 | 0 | pthread_mutex_unlock(&fd->command_m); |
4637 | 0 | fd->curr_bam = NULL; |
4638 | 0 | } |
4639 | | |
4640 | | // Dummy value as we don't know how long it really is. |
4641 | | // We could track file sizes via a SAM_state field, but I don't think |
4642 | | // it is necessary. |
4643 | 0 | return 1; |
4644 | 3.79M | } else { |
4645 | 3.79M | if (sam_format1(h, b, &fp->line) < 0) return -1; |
4646 | 3.79M | kputc('\n', &fp->line); |
4647 | 3.79M | if (fp->is_bgzf) { |
4648 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4649 | 0 | return -1; |
4650 | 0 | if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4651 | 3.79M | } else { |
4652 | 3.79M | if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4653 | 3.79M | } |
4654 | | |
4655 | 3.79M | if (fp->idx) { |
4656 | 0 | if (fp->format.compression == bgzf) { |
4657 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4658 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4659 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4660 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4661 | 0 | return -1; |
4662 | 0 | } |
4663 | 0 | } else { |
4664 | 0 | if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4665 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4666 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4667 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4668 | 0 | return -1; |
4669 | 0 | } |
4670 | 0 | } |
4671 | 0 | } |
4672 | | |
4673 | 3.79M | return fp->line.l; |
4674 | 3.79M | } |
4675 | | |
4676 | | |
4677 | 0 | case fasta_format: |
4678 | 0 | case fastq_format: { |
4679 | 0 | fastq_state *x = (fastq_state *)fp->state; |
4680 | 0 | if (!x) { |
4681 | 0 | if (!(fp->state = fastq_state_init(fp->format.format |
4682 | 0 | == fastq_format ? '@' : '>'))) |
4683 | 0 | return -2; |
4684 | 0 | } |
4685 | | |
4686 | 0 | if (fastq_format1(fp->state, b, &fp->line) < 0) |
4687 | 0 | return -1; |
4688 | 0 | if (fp->is_bgzf) { |
4689 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4690 | 0 | return -1; |
4691 | 0 | if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) |
4692 | 0 | return -1; |
4693 | 0 | } else { |
4694 | 0 | if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) |
4695 | 0 | return -1; |
4696 | 0 | } |
4697 | 0 | return fp->line.l; |
4698 | 0 | } |
4699 | | |
4700 | 0 | default: |
4701 | 0 | errno = EBADF; |
4702 | 0 | return -1; |
4703 | 11.3M | } |
4704 | 11.3M | } |
4705 | | |
4706 | | /************************ |
4707 | | *** Auxiliary fields *** |
4708 | | ************************/ |
#ifndef HTS_LITTLE_ENDIAN
// Copies len bytes of aux value data of the given type from in to out,
// passing each multi-byte element through the matching uNN_to_le() helper.
// Single-byte and 'Z'/'H' data are copied verbatim.  'B' arrays have their
// sub-type byte and count rewritten first, then recurse on the elements.
//
// Only compiled when HTS_LITTLE_ENDIAN is not defined.
//
// Returns 0 on success; -1 if the type is unknown, if len is not a whole
// number of elements, or if a 'B' value is shorter than its 5 byte header.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    int tsz = aux_type2size(type);
    size_t off;

    // Length must be a multiple of the element size
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;

    switch (tsz) {
        case 1: case 'Z': case 'H':  // Trivial
            memcpy(out, in, len);
            return 0;

        case 2:
            for (off = 0; off < len; off += sizeof(uint16_t)) {
                uint16_t v;
                memcpy(&v, in + off, sizeof(uint16_t));
                u16_to_le(v, out + off);
            }
            return 0;

        case 4:
            for (off = 0; off < len; off += sizeof(uint32_t)) {
                uint32_t v;
                memcpy(&v, in + off, sizeof(uint32_t));
                u32_to_le(v, out + off);
            }
            return 0;

        case 8:
            for (off = 0; off < len; off += sizeof(uint64_t)) {
                uint64_t v;
                memcpy(&v, in + off, sizeof(uint64_t));
                u64_to_le(v, out + off);
            }
            return 0;

        case 'B': { // Recurse!
            uint32_t count;
            if (len < 5) return -1;
            memcpy(&count, in + 1, 4);
            out[0] = in[0];
            u32_to_le(count, out + 1);
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
        }

        default: // Unknown type code
            return -1;
    }
}
#endif
4753 | | |
4754 | | int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data) |
4755 | 0 | { |
4756 | 0 | uint32_t new_len; |
4757 | |
|
4758 | 0 | assert(b->l_data >= 0); |
4759 | 0 | new_len = b->l_data + 3 + len; |
4760 | 0 | if (new_len > INT32_MAX || new_len < b->l_data) goto nomem; |
4761 | | |
4762 | 0 | if (realloc_bam_data(b, new_len) < 0) return -1; |
4763 | | |
4764 | 0 | b->data[b->l_data] = tag[0]; |
4765 | 0 | b->data[b->l_data + 1] = tag[1]; |
4766 | 0 | b->data[b->l_data + 2] = type; |
4767 | |
|
4768 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4769 | 0 | memcpy(b->data + b->l_data + 3, data, len); |
4770 | | #else |
4771 | | if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) { |
4772 | | errno = EINVAL; |
4773 | | return -1; |
4774 | | } |
4775 | | #endif |
4776 | |
|
4777 | 0 | b->l_data = new_len; |
4778 | |
|
4779 | 0 | return 0; |
4780 | | |
4781 | 0 | nomem: |
4782 | 0 | errno = ENOMEM; |
4783 | 0 | return -1; |
4784 | 0 | } |
4785 | | |
// Steps over one aux value.  s points at the value's type byte and end is
// one past the last byte of aux data.
//
// Returns a pointer just past the value; end if s is already at or beyond
// end, or if a 'Z'/'H' string has no terminating NUL before end; NULL if
// the type is unknown or the value does not fit before end.
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    uint64_t bytes;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        s = memchr(s, 0, end-s);
        return s ? s+1 : end;
    case 'B':
        // Sub-type byte plus 32-bit element count
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        if (size == 0) return NULL;
        // Multiply in 64 bits: a 32-bit product can wrap for a large
        // count and make an oversized array look as if it fits.
        bytes = (uint64_t) size * n;
        if ((uint64_t) (end - s) < bytes) return NULL;
        return s + bytes;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4811 | | |
4812 | | uint8_t *bam_aux_first(const bam1_t *b) |
4813 | 3.84M | { |
4814 | 3.84M | uint8_t *s = bam_get_aux(b); |
4815 | 3.84M | uint8_t *end = b->data + b->l_data; |
4816 | 3.84M | if (end - s <= 2) { errno = ENOENT; return NULL; } |
4817 | 68.9k | return s+2; |
4818 | 3.84M | } |
4819 | | |
4820 | | uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) |
4821 | 1.39M | { |
4822 | 1.39M | uint8_t *end = b->data + b->l_data; |
4823 | 1.39M | uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; |
4824 | 1.39M | if (next == NULL) goto bad_aux; |
4825 | 1.39M | if (end - next <= 2) { errno = ENOENT; return NULL; } |
4826 | 1.33M | return next+2; |
4827 | | |
4828 | 47 | bad_aux: |
4829 | 47 | hts_log_error("Corrupted aux data for read %s flag %d", |
4830 | 47 | bam_get_qname(b), b->core.flag); |
4831 | 47 | errno = EINVAL; |
4832 | 47 | return NULL; |
4833 | 1.39M | } |
4834 | | |
4835 | | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) |
4836 | 3.84M | { |
4837 | 3.84M | uint8_t *s; |
4838 | 5.24M | for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) |
4839 | 1.40M | if (s[-2] == tag[0] && s[-1] == tag[1]) { |
4840 | | // Check the tag value is valid and complete |
4841 | 14.5k | uint8_t *e = skip_aux(s, b->data + b->l_data); |
4842 | 14.5k | if (e == NULL) goto bad_aux; |
4843 | 14.5k | if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; |
4844 | | |
4845 | 14.5k | return s; |
4846 | 14.5k | } |
4847 | | |
4848 | | // errno now as set by bam_aux_first()/bam_aux_next() |
4849 | 3.83M | return NULL; |
4850 | | |
4851 | 0 | bad_aux: |
4852 | 0 | hts_log_error("Corrupted aux data for read %s flag %d", |
4853 | 0 | bam_get_qname(b), b->core.flag); |
4854 | 0 | errno = EINVAL; |
4855 | 0 | return NULL; |
4856 | 3.84M | } |
4857 | | |
4858 | | int bam_aux_del(bam1_t *b, uint8_t *s) |
4859 | 0 | { |
4860 | 0 | s = bam_aux_remove(b, s); |
4861 | 0 | return (s || errno == ENOENT)? 0 : -1; |
4862 | 0 | } |
4863 | | |
4864 | | uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) |
4865 | 0 | { |
4866 | 0 | uint8_t *end = b->data + b->l_data; |
4867 | 0 | uint8_t *next = skip_aux(s, end); |
4868 | 0 | if (next == NULL) goto bad_aux; |
4869 | | |
4870 | 0 | b->l_data -= next - (s-2); |
4871 | 0 | if (next >= end) { errno = ENOENT; return NULL; } |
4872 | | |
4873 | 0 | memmove(s-2, next, end - next); |
4874 | 0 | return s; |
4875 | | |
4876 | 0 | bad_aux: |
4877 | 0 | hts_log_error("Corrupted aux data for read %s flag %d", |
4878 | 0 | bam_get_qname(b), b->core.flag); |
4879 | 0 | errno = EINVAL; |
4880 | 0 | return NULL; |
4881 | 0 | } |
4882 | | |
4883 | | int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) |
4884 | 0 | { |
4885 | | // FIXME: This is not at all efficient! |
4886 | 0 | size_t ln = len >= 0 ? len : strlen(data) + 1; |
4887 | 0 | size_t old_ln = 0; |
4888 | 0 | int need_nul = ln == 0 || data[ln - 1] != '\0'; |
4889 | 0 | int save_errno = errno; |
4890 | 0 | int new_tag = 0; |
4891 | 0 | uint8_t *s = bam_aux_get(b,tag), *e; |
4892 | |
|
4893 | 0 | if (s) { // Replacing existing tag |
4894 | 0 | char type = *s; |
4895 | 0 | if (type != 'Z') { |
4896 | 0 | hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type); |
4897 | 0 | errno = EINVAL; |
4898 | 0 | return -1; |
4899 | 0 | } |
4900 | 0 | s++; |
4901 | 0 | e = memchr(s, '\0', b->data + b->l_data - s); |
4902 | 0 | old_ln = (e ? e - s : b->data + b->l_data - s) + 1; |
4903 | 0 | s -= 3; |
4904 | 0 | } else { |
4905 | 0 | if (errno != ENOENT) { // Invalid aux data, give up |
4906 | 0 | return -1; |
4907 | 0 | } else { // Tag doesn't exist - put it on the end |
4908 | 0 | errno = save_errno; |
4909 | 0 | s = b->data + b->l_data; |
4910 | 0 | new_tag = 3; |
4911 | 0 | } |
4912 | 0 | } |
4913 | | |
4914 | 0 | if (old_ln < ln + need_nul + new_tag) { |
4915 | 0 | ptrdiff_t s_offset = s - b->data; |
4916 | 0 | if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0) |
4917 | 0 | return -1; |
4918 | 0 | s = b->data + s_offset; |
4919 | 0 | } |
4920 | 0 | if (!new_tag) { |
4921 | 0 | memmove(s + 3 + ln + need_nul, |
4922 | 0 | s + 3 + old_ln, |
4923 | 0 | b->l_data - (s + 3 - b->data) - old_ln); |
4924 | 0 | } |
4925 | 0 | b->l_data += new_tag + ln + need_nul - old_ln; |
4926 | |
|
4927 | 0 | s[0] = tag[0]; |
4928 | 0 | s[1] = tag[1]; |
4929 | 0 | s[2] = 'Z'; |
4930 | 0 | memmove(s+3,data,ln); |
4931 | 0 | if (need_nul) s[3 + ln] = '\0'; |
4932 | 0 | return 0; |
4933 | 0 | } |
4934 | | |
4935 | | int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val) |
4936 | 0 | { |
4937 | 0 | uint32_t sz, old_sz = 0, new = 0; |
4938 | 0 | uint8_t *s, type; |
4939 | |
|
4940 | 0 | if (val < INT32_MIN || val > UINT32_MAX) { |
4941 | 0 | errno = EOVERFLOW; |
4942 | 0 | return -1; |
4943 | 0 | } |
4944 | 0 | if (val < INT16_MIN) { type = 'i'; sz = 4; } |
4945 | 0 | else if (val < INT8_MIN) { type = 's'; sz = 2; } |
4946 | 0 | else if (val < 0) { type = 'c'; sz = 1; } |
4947 | 0 | else if (val < UINT8_MAX) { type = 'C'; sz = 1; } |
4948 | 0 | else if (val < UINT16_MAX) { type = 'S'; sz = 2; } |
4949 | 0 | else { type = 'I'; sz = 4; } |
4950 | |
|
4951 | 0 | s = bam_aux_get(b, tag); |
4952 | 0 | if (s) { // Tag present - how big was the old one? |
4953 | 0 | switch (*s) { |
4954 | 0 | case 'c': case 'C': old_sz = 1; break; |
4955 | 0 | case 's': case 'S': old_sz = 2; break; |
4956 | 0 | case 'i': case 'I': old_sz = 4; break; |
4957 | 0 | default: errno = EINVAL; return -1; // Not an integer |
4958 | 0 | } |
4959 | 0 | } else { |
4960 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
4961 | 0 | s = b->data + b->l_data; |
4962 | 0 | new = 1; |
4963 | 0 | } else { // Invalid aux data, give up. |
4964 | 0 | return -1; |
4965 | 0 | } |
4966 | 0 | } |
4967 | | |
4968 | 0 | if (new || old_sz < sz) { |
4969 | | // Make room for new tag |
4970 | 0 | ptrdiff_t s_offset = s - b->data; |
4971 | 0 | if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0) |
4972 | 0 | return -1; |
4973 | 0 | s = b->data + s_offset; |
4974 | 0 | if (new) { // Add tag id |
4975 | 0 | *s++ = tag[0]; |
4976 | 0 | *s++ = tag[1]; |
4977 | 0 | } else { // Shift following data so we have space |
4978 | 0 | memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz); |
4979 | 0 | } |
4980 | 0 | } else { |
4981 | | // Reuse old space. Data value may be bigger than necessary but |
4982 | | // we avoid having to move everything else |
4983 | 0 | sz = old_sz; |
4984 | 0 | type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz]; |
4985 | 0 | assert(type > 0); |
4986 | 0 | } |
4987 | 0 | *s++ = type; |
4988 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4989 | 0 | memcpy(s, &val, sz); |
4990 | | #else |
4991 | | switch (sz) { |
4992 | | case 4: u32_to_le(val, s); break; |
4993 | | case 2: u16_to_le(val, s); break; |
4994 | | default: *s = val; break; |
4995 | | } |
4996 | | #endif |
4997 | 0 | b->l_data += (new ? 3 : 0) + sz - old_sz; |
4998 | 0 | return 0; |
4999 | 0 | } |
5000 | | |
5001 | | int bam_aux_update_float(bam1_t *b, const char tag[2], float val) |
5002 | 0 | { |
5003 | 0 | uint8_t *s = bam_aux_get(b, tag); |
5004 | 0 | int shrink = 0, new = 0; |
5005 | |
|
5006 | 0 | if (s) { // Tag present - what was it? |
5007 | 0 | switch (*s) { |
5008 | 0 | case 'f': break; |
5009 | 0 | case 'd': shrink = 1; break; |
5010 | 0 | default: errno = EINVAL; return -1; // Not a float |
5011 | 0 | } |
5012 | 0 | } else { |
5013 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5014 | 0 | new = 1; |
5015 | 0 | } else { // Invalid aux data, give up. |
5016 | 0 | return -1; |
5017 | 0 | } |
5018 | 0 | } |
5019 | | |
5020 | 0 | if (new) { // Ensure there's room |
5021 | 0 | if (possibly_expand_bam_data(b, 3 + 4) < 0) |
5022 | 0 | return -1; |
5023 | 0 | s = b->data + b->l_data; |
5024 | 0 | *s++ = tag[0]; |
5025 | 0 | *s++ = tag[1]; |
5026 | 0 | } else if (shrink) { // Convert non-standard double tag to float |
5027 | 0 | memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data)); |
5028 | 0 | b->l_data -= 4; |
5029 | 0 | } |
5030 | 0 | *s++ = 'f'; |
5031 | 0 | float_to_le(val, s); |
5032 | 0 | if (new) b->l_data += 7; |
5033 | |
|
5034 | 0 | return 0; |
5035 | 0 | } |
5036 | | |
5037 | | int bam_aux_update_array(bam1_t *b, const char tag[2], |
5038 | | uint8_t type, uint32_t items, void *data) |
5039 | 0 | { |
5040 | 0 | uint8_t *s = bam_aux_get(b, tag); |
5041 | 0 | size_t old_sz = 0, new_sz; |
5042 | 0 | int new = 0; |
5043 | |
|
5044 | 0 | if (s) { // Tag present |
5045 | 0 | if (*s != 'B') { errno = EINVAL; return -1; } |
5046 | 0 | old_sz = aux_type2size(s[1]); |
5047 | 0 | if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; } |
5048 | 0 | old_sz *= le_to_u32(s + 2); |
5049 | 0 | } else { |
5050 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
5051 | 0 | s = b->data + b->l_data; |
5052 | 0 | new = 1; |
5053 | 0 | } else { // Invalid aux data, give up. |
5054 | 0 | return -1; |
5055 | 0 | } |
5056 | 0 | } |
5057 | | |
5058 | 0 | new_sz = aux_type2size(type); |
5059 | 0 | if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; } |
5060 | 0 | if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; } |
5061 | 0 | new_sz *= items; |
5062 | |
|
5063 | 0 | if (new || old_sz < new_sz) { |
5064 | | // Make room for new tag |
5065 | 0 | ptrdiff_t s_offset = s - b->data; |
5066 | 0 | if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0) |
5067 | 0 | return -1; |
5068 | 0 | s = b->data + s_offset; |
5069 | 0 | } |
5070 | 0 | if (new) { // Add tag id and type |
5071 | 0 | *s++ = tag[0]; |
5072 | 0 | *s++ = tag[1]; |
5073 | 0 | *s = 'B'; |
5074 | 0 | b->l_data += 8 + new_sz; |
5075 | 0 | } else if (old_sz != new_sz) { // shift following data if necessary |
5076 | 0 | memmove(s + 6 + new_sz, s + 6 + old_sz, |
5077 | 0 | b->l_data - ((s + 6 + old_sz) - b->data)); |
5078 | 0 | b->l_data -= old_sz; |
5079 | 0 | b->l_data += new_sz; |
5080 | 0 | } |
5081 | |
|
5082 | 0 | s[1] = type; |
5083 | 0 | u32_to_le(items, s + 2); |
5084 | 0 | if (new_sz > 0) { |
5085 | 0 | #ifdef HTS_LITTLE_ENDIAN |
5086 | 0 | memcpy(s + 6, data, new_sz); |
5087 | | #else |
5088 | | return aux_to_le(type, s + 6, data, new_sz); |
5089 | | #endif |
5090 | 0 | } |
5091 | 0 | return 0; |
5092 | 0 | } |
5093 | | |
// Decodes element idx of an integer array starting at s, where type is one
// of the integer type codes c/C/s/S/i/I.  For any other type code, sets
// errno to EINVAL and returns 0.
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    errno = EINVAL;
    return 0;
}
5109 | | |
// Returns the integer value of the aux field whose type byte is at s.
// Error handling is that of get_int_aux_val().
int64_t bam_aux2i(const uint8_t *s)
{
    int type = s[0];
    const uint8_t *value = s + 1;

    return get_int_aux_val(type, value, 0);
}
5116 | | |
// Returns the value of the aux field whose type byte is at s as a double.
// 'd' and 'f' fields are decoded directly; any other type is passed to
// get_int_aux_val() and its result converted.
double bam_aux2f(const uint8_t *s)
{
    int type = s[0];
    const uint8_t *value = s + 1;

    switch (type) {
    case 'd':
        return le_to_double(value);
    case 'f':
        return le_to_float(value);
    default:
        return get_int_aux_val(type, value, 0);
    }
}
5125 | | |
// Returns the character held by the 'A' aux field whose type byte is at s.
// For any other type, sets errno to EINVAL and returns 0.
char bam_aux2A(const uint8_t *s)
{
    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }

    return *(const char *)(s + 1);
}
5134 | | |
// Returns a pointer to the string data of the 'Z' or 'H' aux field whose
// type byte is at s (the byte immediately after the type).
// For any other type, sets errno to EINVAL and returns NULL.
char *bam_aux2Z(const uint8_t *s)
{
    const int type = s[0];

    if (type != 'Z' && type != 'H') {
        errno = EINVAL;
        return 0;
    }

    return (char *)(s + 1);
}
5143 | | |
// Returns the element count stored in the 'B' aux field whose type byte is
// at s.  For any other type, sets errno to EINVAL and returns 0.
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);    // count follows the element type byte

    errno = EINVAL;
    return 0;
}
5152 | | |
// Returns element idx of the 'B' aux array whose type byte is at s, decoded
// as an integer by get_int_aux_val().
// If idx is not below the array length, sets errno to ERANGE and returns 0.
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    uint32_t count = bam_auxB_len(s);

    if (idx < count)
        return get_int_aux_val(s[1], s + 6, idx);

    errno = ERANGE;
    return 0;
}
5162 | | |
// Returns element idx of the 'B' aux array whose type byte is at s as a
// double.  Elements of type 'f' are decoded directly; other element types
// go through get_int_aux_val().
// If idx is not below the array length, sets errno to ERANGE and returns 0.0.
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    uint32_t count = bam_auxB_len(s);

    if (idx >= count) {
        errno = ERANGE;
        return 0.0;
    }

    if (s[1] != 'f')
        return get_int_aux_val(s[1], s + 6, idx);

    return le_to_float(s + 6 + 4 * idx);
}
5173 | | |
5174 | | int sam_open_mode(char *mode, const char *fn, const char *format) |
5175 | 0 | { |
5176 | | // TODO Parse "bam5" etc for compression level |
5177 | 0 | if (format == NULL) { |
5178 | | // Try to pick a format based on the filename extension |
5179 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5180 | 0 | if (find_file_extension(fn, extension) < 0) return -1; |
5181 | 0 | return sam_open_mode(mode, fn, extension); |
5182 | 0 | } |
5183 | 0 | else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b"); |
5184 | 0 | else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c"); |
5185 | 0 | else if (strcasecmp(format, "sam") == 0) strcpy(mode, ""); |
5186 | 0 | else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); |
5187 | 0 | else if (strcasecmp(format, "fastq") == 0 || |
5188 | 0 | strcasecmp(format, "fq") == 0) strcpy(mode, "f"); |
5189 | 0 | else if (strcasecmp(format, "fastq.gz") == 0 || |
5190 | 0 | strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz"); |
5191 | 0 | else if (strcasecmp(format, "fasta") == 0 || |
5192 | 0 | strcasecmp(format, "fa") == 0) strcpy(mode, "F"); |
5193 | 0 | else if (strcasecmp(format, "fasta.gz") == 0 || |
5194 | 0 | strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz"); |
5195 | 0 | else return -1; |
5196 | | |
5197 | 0 | return 0; |
5198 | 0 | } |
5199 | | |
5200 | | // A version of sam_open_mode that can handle ,key=value options. |
5201 | | // The format string is allocated and returned, to be freed by the caller. |
5202 | | // Prefix should be "r" or "w", |
5203 | | char *sam_open_mode_opts(const char *fn, |
5204 | | const char *mode, |
5205 | | const char *format) |
5206 | 0 | { |
5207 | 0 | char *mode_opts = malloc((format ? strlen(format) : 1) + |
5208 | 0 | (mode ? strlen(mode) : 1) + 12); |
5209 | 0 | char *opts, *cp; |
5210 | 0 | int format_len; |
5211 | |
|
5212 | 0 | if (!mode_opts) |
5213 | 0 | return NULL; |
5214 | | |
5215 | 0 | strcpy(mode_opts, mode ? mode : "r"); |
5216 | 0 | cp = mode_opts + strlen(mode_opts); |
5217 | |
|
5218 | 0 | if (format == NULL) { |
5219 | | // Try to pick a format based on the filename extension |
5220 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5221 | 0 | if (find_file_extension(fn, extension) < 0) { |
5222 | 0 | free(mode_opts); |
5223 | 0 | return NULL; |
5224 | 0 | } |
5225 | 0 | if (sam_open_mode(cp, fn, extension) == 0) { |
5226 | 0 | return mode_opts; |
5227 | 0 | } else { |
5228 | 0 | free(mode_opts); |
5229 | 0 | return NULL; |
5230 | 0 | } |
5231 | 0 | } |
5232 | | |
5233 | 0 | if ((opts = strchr(format, ','))) { |
5234 | 0 | format_len = opts-format; |
5235 | 0 | } else { |
5236 | 0 | opts=""; |
5237 | 0 | format_len = strlen(format); |
5238 | 0 | } |
5239 | |
|
5240 | 0 | if (strncmp(format, "bam", format_len) == 0) { |
5241 | 0 | *cp++ = 'b'; |
5242 | 0 | } else if (strncmp(format, "cram", format_len) == 0) { |
5243 | 0 | *cp++ = 'c'; |
5244 | 0 | } else if (strncmp(format, "cram2", format_len) == 0) { |
5245 | 0 | *cp++ = 'c'; |
5246 | 0 | strcpy(cp, ",VERSION=2.1"); |
5247 | 0 | cp += 12; |
5248 | 0 | } else if (strncmp(format, "cram3", format_len) == 0) { |
5249 | 0 | *cp++ = 'c'; |
5250 | 0 | strcpy(cp, ",VERSION=3.0"); |
5251 | 0 | cp += 12; |
5252 | 0 | } else if (strncmp(format, "sam", format_len) == 0) { |
5253 | 0 | ; // format mode="" |
5254 | 0 | } else if (strncmp(format, "sam.gz", format_len) == 0) { |
5255 | 0 | *cp++ = 'z'; |
5256 | 0 | } else if (strncmp(format, "fastq", format_len) == 0 || |
5257 | 0 | strncmp(format, "fq", format_len) == 0) { |
5258 | 0 | *cp++ = 'f'; |
5259 | 0 | } else if (strncmp(format, "fastq.gz", format_len) == 0 || |
5260 | 0 | strncmp(format, "fq.gz", format_len) == 0) { |
5261 | 0 | *cp++ = 'f'; |
5262 | 0 | *cp++ = 'z'; |
5263 | 0 | } else if (strncmp(format, "fasta", format_len) == 0 || |
5264 | 0 | strncmp(format, "fa", format_len) == 0) { |
5265 | 0 | *cp++ = 'F'; |
5266 | 0 | } else if (strncmp(format, "fasta.gz", format_len) == 0 || |
5267 | 0 | strncmp(format, "fa", format_len) == 0) { |
5268 | 0 | *cp++ = 'F'; |
5269 | 0 | *cp++ = 'z'; |
5270 | 0 | } else { |
5271 | 0 | free(mode_opts); |
5272 | 0 | return NULL; |
5273 | 0 | } |
5274 | | |
5275 | 0 | strcpy(cp, opts); |
5276 | |
|
5277 | 0 | return mode_opts; |
5278 | 0 | } |
5279 | | |
5280 | 0 | #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) |
5281 | | int bam_str2flag(const char *str) |
5282 | 0 | { |
5283 | 0 | char *end, *beg = (char*) str; |
5284 | 0 | long int flag = strtol(str, &end, 0); |
5285 | 0 | if ( end!=str ) return flag; // the conversion was successful |
5286 | 0 | flag = 0; |
5287 | 0 | while ( *str ) |
5288 | 0 | { |
5289 | 0 | end = beg; |
5290 | 0 | while ( *end && *end!=',' ) end++; |
5291 | 0 | if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED; |
5292 | 0 | else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR; |
5293 | 0 | else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP; |
5294 | 0 | else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP; |
5295 | 0 | else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE; |
5296 | 0 | else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE; |
5297 | 0 | else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1; |
5298 | 0 | else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2; |
5299 | 0 | else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY; |
5300 | 0 | else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL; |
5301 | 0 | else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP; |
5302 | 0 | else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY; |
5303 | 0 | else return -1; |
5304 | 0 | if ( !*end ) break; |
5305 | 0 | beg = end + 1; |
5306 | 0 | } |
5307 | 0 | return flag; |
5308 | 0 | } |
5309 | | |
5310 | | char *bam_flag2str(int flag) |
5311 | 0 | { |
5312 | 0 | kstring_t str = {0,0,0}; |
5313 | 0 | if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED"); |
5314 | 0 | if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR"); |
5315 | 0 | if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP"); |
5316 | 0 | if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP"); |
5317 | 0 | if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE"); |
5318 | 0 | if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE"); |
5319 | 0 | if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1"); |
5320 | 0 | if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2"); |
5321 | 0 | if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY"); |
5322 | 0 | if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL"); |
5323 | 0 | if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP"); |
5324 | 0 | if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY"); |
5325 | 0 | if ( str.l == 0 ) kputsn("", 0, &str); |
5326 | 0 | return str.s; |
5327 | 0 | } |
5328 | | |
5329 | | |
5330 | | /************************** |
5331 | | *** Pileup and Mpileup *** |
5332 | | **************************/ |
5333 | | |
5334 | | #if !defined(BAM_NO_PILEUP) |
5335 | | |
5336 | | #include <assert.h> |
5337 | | |
5338 | | /******************* |
5339 | | *** Memory pool *** |
5340 | | *******************/ |
5341 | | |
// CIGAR traversal state of one read, updated by resolve_cigar2().
typedef struct {
    // k: index of the CIGAR operation currently being walked, or -1 when
    //    the read has not been processed yet.
    // y: query coordinate of the start of operation k.
    int k, y;
    // x:   reference coordinate of the start of operation k.
    // end: reference position at which resolve_cigar2() reports is_tail.
    hts_pos_t x, end;
} cstate_t;
5346 | | |
// Template for a fresh traversal state: k == -1 is the value
// resolve_cigar2() treats as "never processed".
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5348 | | |
// One node of the pileup iterator's singly linked list of buffered reads.
typedef struct __linkbuf_t {
    bam1_t b;                   // the alignment; b.data is freed by mp_destroy()
    hts_pos_t beg, end;         // a node joins the pileup while beg <= pos and is dropped once end <= pos
    cstate_t s;                 // CIGAR traversal state for this read
    struct __linkbuf_t *next;   // next node in the list; reset to 0 by mp_free()
    bam_pileup_cd cd;           // client data handed to the plp_construct / plp_destruct callbacks
} lbnode_t;
5356 | | |
// Free-list allocator for lbnode_t, see mp_alloc() / mp_free().
typedef struct {
    // cnt: nodes currently handed out (incremented by mp_alloc, decremented by mp_free)
    // n:   number of recycled nodes parked in buf
    // max: number of slots allocated for buf
    int cnt, n, max;
    lbnode_t **buf;     // stack of recycled nodes, buf[0..n-1]
} mempool_t;
5361 | | |
5362 | | static mempool_t *mp_init(void) |
5363 | 0 | { |
5364 | 0 | mempool_t *mp; |
5365 | 0 | mp = (mempool_t*)calloc(1, sizeof(mempool_t)); |
5366 | 0 | return mp; |
5367 | 0 | } |
5368 | | static void mp_destroy(mempool_t *mp) |
5369 | 0 | { |
5370 | 0 | int k; |
5371 | 0 | for (k = 0; k < mp->n; ++k) { |
5372 | 0 | free(mp->buf[k]->b.data); |
5373 | 0 | free(mp->buf[k]); |
5374 | 0 | } |
5375 | 0 | free(mp->buf); |
5376 | 0 | free(mp); |
5377 | 0 | } |
5378 | | static inline lbnode_t *mp_alloc(mempool_t *mp) |
5379 | 0 | { |
5380 | 0 | ++mp->cnt; |
5381 | 0 | if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); |
5382 | 0 | else return mp->buf[--mp->n]; |
5383 | 0 | } |
5384 | | static inline void mp_free(mempool_t *mp, lbnode_t *p) |
5385 | 0 | { |
5386 | 0 | --mp->cnt; p->next = 0; // clear lbnode_t::next here |
5387 | 0 | if (mp->n == mp->max) { |
5388 | 0 | mp->max = mp->max? mp->max<<1 : 256; |
5389 | 0 | mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); |
5390 | 0 | } |
5391 | 0 | mp->buf[mp->n++] = p; |
5392 | 0 | } |
5393 | | |
5394 | | /********************** |
5395 | | *** CIGAR resolver *** |
5396 | | **********************/ |
5397 | | |
5398 | | /* s->k: the index of the CIGAR operator that has just been processed. |
5399 | | s->x: the reference coordinate of the start of s->k |
5400 | | s->y: the query coordinate of the start of s->k |
5401 | | */ |
5402 | | static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) |
5403 | 0 | { |
5404 | 0 | #define _cop(c) ((c)&BAM_CIGAR_MASK) |
5405 | 0 | #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) |
5406 | |
|
5407 | 0 | bam1_t *b = p->b; |
5408 | 0 | bam1_core_t *c = &b->core; |
5409 | 0 | uint32_t *cigar = bam_get_cigar(b); |
5410 | 0 | int k; |
5411 | | // determine the current CIGAR operation |
5412 | | //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); |
5413 | 0 | if (s->k == -1) { // never processed |
5414 | 0 | p->qpos = 0; |
5415 | 0 | if (c->n_cigar == 1) { // just one operation, save a loop |
5416 | 0 | if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; |
5417 | 0 | } else { // find the first match or deletion |
5418 | 0 | for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { |
5419 | 0 | int op = _cop(cigar[k]); |
5420 | 0 | int l = _cln(cigar[k]); |
5421 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || |
5422 | 0 | op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5423 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5424 | 0 | } |
5425 | 0 | assert(k < c->n_cigar); |
5426 | 0 | s->k = k; |
5427 | 0 | } |
5428 | 0 | } else { // the read has been processed before |
5429 | 0 | int op, l = _cln(cigar[s->k]); |
5430 | 0 | if (pos - s->x >= l) { // jump to the next operation |
5431 | 0 | assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case |
5432 | 0 | op = _cop(cigar[s->k+1]); |
5433 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop |
5434 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5435 | 0 | s->x += l; |
5436 | 0 | ++s->k; |
5437 | 0 | } else { // find the next M/D/N/=/X |
5438 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5439 | 0 | s->x += l; |
5440 | 0 | for (k = s->k + 1; k < c->n_cigar; ++k) { |
5441 | 0 | op = _cop(cigar[k]), l = _cln(cigar[k]); |
5442 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5443 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5444 | 0 | } |
5445 | 0 | s->k = k; |
5446 | 0 | } |
5447 | 0 | assert(s->k < c->n_cigar); // otherwise a bug |
5448 | 0 | } // else, do nothing |
5449 | 0 | } |
5450 | 0 | { // collect pileup information |
5451 | 0 | int op, l; |
5452 | 0 | op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); |
5453 | 0 | p->is_del = p->indel = p->is_refskip = 0; |
5454 | 0 | if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation |
5455 | 0 | int op2 = _cop(cigar[s->k+1]); |
5456 | 0 | int l2 = _cln(cigar[s->k+1]); |
5457 | 0 | if (op2 == BAM_CDEL && op != BAM_CDEL) { |
5458 | | // At start of a new deletion, merge e.g. 1D2D to 3D. |
5459 | | // Within a deletion (the 2D in 1D2D) we keep p->indel=0 |
5460 | | // and rely on is_del=1 as we would for 3D. |
5461 | 0 | p->indel = -(int)l2; |
5462 | 0 | for (k = s->k+2; k < c->n_cigar; ++k) { |
5463 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5464 | 0 | if (op2 == BAM_CDEL) p->indel -= l2; |
5465 | 0 | else break; |
5466 | 0 | } |
5467 | 0 | } else if (op2 == BAM_CINS) { |
5468 | 0 | p->indel = l2; |
5469 | 0 | for (k = s->k+2; k < c->n_cigar; ++k) { |
5470 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5471 | 0 | if (op2 == BAM_CINS) p->indel += l2; |
5472 | 0 | else if (op2 != BAM_CPAD) break; |
5473 | 0 | } |
5474 | 0 | } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { |
5475 | 0 | int l3 = 0; |
5476 | 0 | for (k = s->k + 2; k < c->n_cigar; ++k) { |
5477 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5478 | 0 | if (op2 == BAM_CINS) l3 += l2; |
5479 | 0 | else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; |
5480 | 0 | } |
5481 | 0 | if (l3 > 0) p->indel = l3; |
5482 | 0 | } |
5483 | 0 | } |
5484 | 0 | if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { |
5485 | 0 | p->qpos = s->y + (pos - s->x); |
5486 | 0 | } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { |
5487 | 0 | p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! |
5488 | 0 | p->is_refskip = (op == BAM_CREF_SKIP); |
5489 | 0 | } // cannot be other operations; otherwise a bug |
5490 | 0 | p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); |
5491 | 0 | } |
5492 | 0 | p->cigar_ind = s->k; |
5493 | 0 | return 1; |
5494 | 0 | } |
5495 | | |
5496 | | /******************************* |
5497 | | *** Expansion of insertions *** |
5498 | | *******************************/ |
5499 | | |
/*
 * Fills out the kstring with the padded insertion sequence for the current
 * location in 'p'.  If this is not an insertion site, the string is blank.
 *
 * This variant handles base modifications, but only when "m" is non-NULL.
 * Modifications found on an inserted base are appended after that base
 * inside square brackets.
 *
 * If del_len is non-NULL and this is an insertion site, *del_len is set to
 * the length of a deletion directly following the insertion, or to 0 if
 * there is none.  At a non-insertion site *del_len is left untouched.
 *
 * Returns the number of inserted bases (pads included) on success, with
 *         the string length being accessible via ins->l;
 *        -1 on failure.
 */
int bam_plp_insertion_mod(const bam_pileup1_t *p,
                          hts_base_mod_state *m,
                          kstring_t *ins, int *del_len) {
    int j, k, indel, nb = 0;
    uint32_t *cigar;

    // Not an insertion site: return an empty (but allocated) string
    if (p->indel <= 0) {
        if (ks_resize(ins, 1) < 0)
            return -1;
        ins->l = 0;
        ins->s[0] = '\0';
        return 0;
    }

    if (del_len)
        *del_len = 0;

    // Measure indel length including pads
    indel = 0;
    k = p->cigar_ind+1;
    cigar = bam_get_cigar(p->b);
    while (k < p->b->core.n_cigar) {
        switch (cigar[k] & BAM_CIGAR_MASK) {
        case BAM_CPAD:
        case BAM_CINS:
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
            break;
        default:
            // Anything else ends the run; force the loop to terminate
            k = p->b->core.n_cigar;
            break;
        }
        k++;
    }
    // From here until the end of the function ins->l is used as the
    // running size requirement rather than the current string length.
    nb = ins->l = indel;

    // Produce sequence
    if (ks_resize(ins, indel+1) < 0)
        return -1;
    indel = 0;              // now the write position within ins->s
    k = p->cigar_ind+1;
    j = 1;                  // offset from p->qpos of the next inserted base
    while (k < p->b->core.n_cigar) {
        int l, c;
        switch (cigar[k] & BAM_CIGAR_MASK) {
        case BAM_CPAD:
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
                ins->s[indel++] = '*';
            break;
        case BAM_CINS:
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
                // 'N' is emitted if the CIGAR runs past the stored sequence
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
                                            p->qpos + j - p->is_del)]
                    : 'N';
                ins->s[indel++] = c;
                int nm;
                hts_base_mod mod[256];
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
                                                m, mod, 256)) > 0) {
                    int o_indel = indel;
                    // Reserve 16 bytes per modification plus "[", "]" and NUL
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
                        return -1;
                    ins->s[indel++] = '[';
                    // NB: this j deliberately shadows the outer j, which
                    // is only advanced by the enclosing for statement.
                    int j;
                    for (j = 0; j < nm; j++) {
                        char qual[20];
                        // A negative qual is printed as nothing at all
                        if (mod[j].qual >= 0)
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
                        else
                            *qual=0;
                        if (mod[j].modified_base < 0)
                            // ChEBI
                            indel += snprintf(&ins->s[indel], ins->m - indel,
                                              "%c(%d)%s",
                                              "+-"[mod[j].strand],
                                              -mod[j].modified_base,
                                              qual);
                        else
                            indel += snprintf(&ins->s[indel], ins->m - indel,
                                              "%c%c%s",
                                              "+-"[mod[j].strand],
                                              mod[j].modified_base,
                                              qual);
                    }
                    ins->s[indel++] = ']';
                    ins->l += indel - o_indel; // grow by amount we used
                }
            }
            break;
        case BAM_CDEL:
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
            if (del_len)
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
            // fall through
        default:
            k = p->b->core.n_cigar;
            break;
        }
        k++;
    }
    ins->s[indel] = '\0';
    ins->l = indel; // string length

    return nb; // base length
}
5615 | | |
5616 | | /* |
5617 | | * Fills out the kstring with the padded insertion sequence for the current |
5618 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5619 | | * |
5620 | | * This is the original interface with no capability for reporting base |
5621 | | * modifications. |
5622 | | * |
5623 | | * Returns the length of insertion string on success; |
5624 | | * -1 on failure. |
5625 | | */ |
5626 | 0 | int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { |
5627 | 0 | return bam_plp_insertion_mod(p, NULL, ins, del_len); |
5628 | 0 | } |
5629 | | |
5630 | | /*********************** |
5631 | | *** Pileup iterator *** |
5632 | | ***********************/ |
5633 | | |
// Dictionary of overlapping reads.
// String-keyed hash; overlap_push() keys it on the read name and stores
// the list node of the mate that was seen first.
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
typedef khash_t(olap_hash) olap_hash_t;

// State of a pileup iterator (the object a bam_plp_t points to).
struct bam_plp_s {
    mempool_t *mp;          // pool the list nodes are taken from and returned to
    // List of buffered reads.  tail is a spare node that holds no read:
    // traversals stop when they reach it and bam_plp_destroy() does not
    // call the destructor callback for it.
    lbnode_t *head, *tail;
    // tid/pos: position the pileup is currently at.
    // max_tid/max_pos: start as -1; NOTE(review): presumably the position
    // of the most recently pushed read - they are updated outside this view.
    int32_t tid, max_tid;
    hts_pos_t pos, max_pos;
    // error: once non-zero, bam_plp64_next() reports failure (*_n_plp = -1).
    // maxcnt: initialised to 8000 by bam_plp_init().
    int is_eof, max_plp, error, maxcnt;
    uint64_t id;
    bam_pileup1_t *plp;     // pileup array; freed by bam_plp_destroy()
    // for the "auto" interface only
    bam1_t *b;              // allocated by bam_plp_init() only when func is given
    bam_plp_auto_f func;
    void *data;             // also passed as first argument to plp_destruct
    olap_hash_t *overlaps;  // NULL unless bam_plp_init_overlaps() was called

    // For notification of creation and destruction events
    // and associated client-owned pointer.
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
};
5657 | | |
5658 | | bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) |
5659 | 0 | { |
5660 | 0 | bam_plp_t iter; |
5661 | 0 | iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s)); |
5662 | 0 | iter->mp = mp_init(); |
5663 | 0 | iter->head = iter->tail = mp_alloc(iter->mp); |
5664 | 0 | iter->max_tid = iter->max_pos = -1; |
5665 | 0 | iter->maxcnt = 8000; |
5666 | 0 | if (func) { |
5667 | 0 | iter->func = func; |
5668 | 0 | iter->data = data; |
5669 | 0 | iter->b = bam_init1(); |
5670 | 0 | } |
5671 | 0 | return iter; |
5672 | 0 | } |
5673 | | |
5674 | | int bam_plp_init_overlaps(bam_plp_t iter) |
5675 | 0 | { |
5676 | 0 | iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads |
5677 | 0 | return iter->overlaps ? 0 : -1; |
5678 | 0 | } |
5679 | | |
5680 | | void bam_plp_destroy(bam_plp_t iter) |
5681 | 0 | { |
5682 | 0 | lbnode_t *p, *pnext; |
5683 | 0 | if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps); |
5684 | 0 | for (p = iter->head; p != NULL; p = pnext) { |
5685 | 0 | if (iter->plp_destruct && p != iter->tail) |
5686 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
5687 | 0 | pnext = p->next; |
5688 | 0 | mp_free(iter->mp, p); |
5689 | 0 | } |
5690 | 0 | mp_destroy(iter->mp); |
5691 | 0 | if (iter->b) bam_destroy1(iter->b); |
5692 | 0 | free(iter->plp); |
5693 | 0 | free(iter); |
5694 | 0 | } |
5695 | | |
5696 | | void bam_plp_constructor(bam_plp_t plp, |
5697 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5698 | 0 | plp->plp_construct = func; |
5699 | 0 | } |
5700 | | |
5701 | | void bam_plp_destructor(bam_plp_t plp, |
5702 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5703 | 0 | plp->plp_destruct = func; |
5704 | 0 | } |
5705 | | |
5706 | | //--------------------------------- |
5707 | | //--- Tweak overlapping reads |
5708 | | //--------------------------------- |
5709 | | |
5710 | | /** |
5711 | | * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index |
5712 | | * cigar_iref2iseq_next() - get the next CMATCH base |
5713 | | * @cigar: pointer to current cigar block (rw) |
5714 | | * @cigar_max: pointer just beyond the last cigar block |
5715 | | * @icig: position within the current cigar block (rw) |
5716 | | * @iseq: position in the sequence (rw) |
5717 | | * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) |
5718 | | * |
5719 | | * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, |
5720 | | * or -2 on error. |
5721 | | */ |
5722 | | static inline int cigar_iref2iseq_set(const uint32_t **cigar, |
5723 | | const uint32_t *cigar_max, |
5724 | | hts_pos_t *icig, |
5725 | | hts_pos_t *iseq, |
5726 | | hts_pos_t *iref) |
5727 | 0 | { |
5728 | 0 | hts_pos_t pos = *iref; |
5729 | 0 | if ( pos < 0 ) return -1; |
5730 | 0 | *icig = 0; |
5731 | 0 | *iseq = 0; |
5732 | 0 | *iref = 0; |
5733 | 0 | while ( *cigar<cigar_max ) |
5734 | 0 | { |
5735 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5736 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5737 | |
|
5738 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5739 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } |
5740 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5741 | 0 | { |
5742 | 0 | pos -= ncig; |
5743 | 0 | if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; } |
5744 | 0 | (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig; |
5745 | 0 | continue; |
5746 | 0 | } |
5747 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5748 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) |
5749 | 0 | { |
5750 | 0 | pos -= ncig; |
5751 | 0 | if ( pos<0 ) pos = 0; |
5752 | 0 | (*cigar)++; *icig = 0; *iref += ncig; |
5753 | 0 | continue; |
5754 | 0 | } |
5755 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5756 | 0 | return -2; |
5757 | 0 | } |
5758 | 0 | *iseq = -1; |
5759 | 0 | return -1; |
5760 | 0 | } |
5761 | | static inline int cigar_iref2iseq_next(const uint32_t **cigar, |
5762 | | const uint32_t *cigar_max, |
5763 | | hts_pos_t *icig, |
5764 | | hts_pos_t *iseq, |
5765 | | hts_pos_t *iref) |
5766 | 0 | { |
5767 | 0 | while ( *cigar < cigar_max ) |
5768 | 0 | { |
5769 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5770 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5771 | |
|
5772 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5773 | 0 | { |
5774 | 0 | if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } |
5775 | 0 | (*iseq)++; (*icig)++; (*iref)++; |
5776 | 0 | return BAM_CMATCH; |
5777 | 0 | } |
5778 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } |
5779 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5780 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5781 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } |
5782 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5783 | 0 | return -2; |
5784 | 0 | } |
5785 | 0 | *iseq = -1; |
5786 | 0 | *iref = -1; |
5787 | 0 | return -1; |
5788 | 0 | } |
5789 | | |
5790 | | // Given overlapping read 'a' (left) and 'b' (right) on the same |
5791 | | // template, adjust quality values to zero for either a or b. |
5792 | | // Note versions 1.12 and earlier always removed quality from 'b' for |
5793 | | // matching bases. Now we select a or b semi-randomly based on name hash. |
5794 | | // Returns 0 on success, |
5795 | | // -1 on failure |
5796 | | static int tweak_overlap_quality(bam1_t *a, bam1_t *b) |
5797 | 0 | { |
5798 | 0 | const uint32_t *a_cigar = bam_get_cigar(a), |
5799 | 0 | *a_cigar_max = a_cigar + a->core.n_cigar; |
5800 | 0 | const uint32_t *b_cigar = bam_get_cigar(b), |
5801 | 0 | *b_cigar_max = b_cigar + b->core.n_cigar; |
5802 | 0 | hts_pos_t a_icig = 0, a_iseq = 0; |
5803 | 0 | hts_pos_t b_icig = 0, b_iseq = 0; |
5804 | 0 | uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); |
5805 | 0 | uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); |
5806 | |
|
5807 | 0 | hts_pos_t iref = b->core.pos; |
5808 | 0 | hts_pos_t a_iref = iref - a->core.pos; |
5809 | 0 | hts_pos_t b_iref = iref - b->core.pos; |
5810 | |
|
5811 | 0 | int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, |
5812 | 0 | &a_icig, &a_iseq, &a_iref); |
5813 | 0 | if ( a_ret<0 ) |
5814 | | // no overlap or error |
5815 | 0 | return a_ret<-1 ? -1:0; |
5816 | | |
5817 | 0 | int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, |
5818 | 0 | &b_icig, &b_iseq, &b_iref); |
5819 | 0 | if ( b_ret<0 ) |
5820 | | // no overlap or error |
5821 | 0 | return b_ret<-1 ? -1:0; |
5822 | | |
5823 | | // Determine which seq is the one getting modified qualities. |
5824 | 0 | uint8_t amul, bmul; |
5825 | 0 | if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { |
5826 | 0 | amul = 1; |
5827 | 0 | bmul = 0; |
5828 | 0 | } else { |
5829 | 0 | amul = 0; |
5830 | 0 | bmul = 1; |
5831 | 0 | } |
5832 | | |
5833 | | // Loop over the overlapping region nulling qualities in either |
5834 | | // seq a or b. |
5835 | 0 | int err = 0; |
5836 | 0 | while ( 1 ) { |
5837 | | // Step to next matching reference position in a and b |
5838 | 0 | while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) |
5839 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5840 | 0 | &a_icig, &a_iseq, &a_iref); |
5841 | 0 | if ( a_ret<0 ) { // done |
5842 | 0 | err = a_ret<-1?-1:0; |
5843 | 0 | break; |
5844 | 0 | } |
5845 | | |
5846 | 0 | while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) |
5847 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, |
5848 | 0 | &b_iseq, &b_iref); |
5849 | 0 | if ( b_ret<0 ) { // done |
5850 | 0 | err = b_ret<-1?-1:0; |
5851 | 0 | break; |
5852 | 0 | } |
5853 | | |
5854 | 0 | if ( iref < a_iref + a->core.pos ) |
5855 | 0 | iref = a_iref + a->core.pos; |
5856 | |
|
5857 | 0 | if ( iref < b_iref + b->core.pos ) |
5858 | 0 | iref = b_iref + b->core.pos; |
5859 | |
|
5860 | 0 | iref++; |
5861 | | |
5862 | | // If A or B has a deletion then we catch up the other to this point. |
5863 | | // We also amend quality values using the same rules for mismatch. |
5864 | 0 | if (a_iref+a->core.pos != b_iref+b->core.pos) { |
5865 | 0 | if (a_iref+a->core.pos < b_iref+b->core.pos |
5866 | 0 | && b_cigar > bam_get_cigar(b) |
5867 | 0 | && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) { |
5868 | | // Del in B means it's moved on further than A |
5869 | 0 | do { |
5870 | 0 | a_qual[a_iseq] = amul |
5871 | 0 | ? a_qual[a_iseq]*0.8 |
5872 | 0 | : 0; |
5873 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5874 | 0 | &a_icig, &a_iseq, &a_iref); |
5875 | 0 | if (a_ret < 0) |
5876 | 0 | return -(a_ret<-1); // 0 or -1 |
5877 | 0 | } while (a_iref + a->core.pos < b_iref+b->core.pos); |
5878 | 0 | } else if (a_cigar > bam_get_cigar(a) |
5879 | 0 | && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) { |
5880 | | // Del in A means it's moved on further than B |
5881 | 0 | do { |
5882 | 0 | b_qual[b_iseq] = bmul |
5883 | 0 | ? b_qual[b_iseq]*0.8 |
5884 | 0 | : 0; |
5885 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, |
5886 | 0 | &b_icig, &b_iseq, &b_iref); |
5887 | 0 | if (b_ret < 0) |
5888 | 0 | return -(b_ret<-1); // 0 or -1 |
5889 | 0 | } while (b_iref + b->core.pos < a_iref+a->core.pos); |
5890 | 0 | } else { |
5891 | | // Anything else, eg ref-skip, we don't support here |
5892 | 0 | continue; |
5893 | 0 | } |
5894 | 0 | } |
5895 | | |
5896 | | // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld " |
5897 | | // "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n", |
5898 | | // a_cigar-bam_get_cigar(a), a_icig, |
5899 | | // b_cigar-bam_get_cigar(b), b_icig, |
5900 | | // iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1, |
5901 | | // a_iseq, b_iseq); |
5902 | | |
5903 | 0 | if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) |
5904 | | // Fell off end of sequence, bad CIGAR? |
5905 | 0 | return -1; |
5906 | | |
5907 | | // We're finally at the same ref base in both a and b. |
5908 | | // Check if the bases match (confident) or mismatch |
5909 | | // (not so confident). |
5910 | 0 | if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { |
5911 | | // We are very confident about this base. Use sum of quals |
5912 | 0 | int qual = a_qual[a_iseq] + b_qual[b_iseq]; |
5913 | 0 | a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); |
5914 | 0 | b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; |
5915 | 0 | } else { |
5916 | | // Not so confident about anymore given the mismatch. |
5917 | | // Reduce qual for lowest quality base. |
5918 | 0 | if ( a_qual[a_iseq] > b_qual[b_iseq] ) { |
5919 | | // A highest qual base; keep |
5920 | 0 | a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; |
5921 | 0 | b_qual[b_iseq] = 0; |
5922 | 0 | } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { |
5923 | | // B highest qual base; keep |
5924 | 0 | b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; |
5925 | 0 | a_qual[a_iseq] = 0; |
5926 | 0 | } else { |
5927 | | // Both equal, so pick randomly |
5928 | 0 | a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; |
5929 | 0 | b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; |
5930 | 0 | } |
5931 | 0 | } |
5932 | 0 | } |
5933 | | |
5934 | 0 | return err; |
5935 | 0 | } |
5936 | | |
5937 | | // Fix overlapping reads. Simple soft-clipping did not give good results. |
5938 | | // Lowering qualities of unwanted bases is more selective and works better. |
5939 | | // |
5940 | | // Returns 0 on success, -1 on failure |
5941 | | static int overlap_push(bam_plp_t iter, lbnode_t *node) |
5942 | 0 | { |
5943 | 0 | if ( !iter->overlaps ) return 0; |
5944 | | |
5945 | | // mapped mates and paired reads only |
5946 | 0 | if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0; |
5947 | | |
5948 | | // no overlap possible, unless some wild cigar |
5949 | 0 | if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid) |
5950 | 0 | || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq |
5951 | 0 | && node->b.core.mpos >= node->end) // for those wild cigars |
5952 | 0 | ) return 0; |
5953 | | |
5954 | 0 | khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); |
5955 | 0 | if ( kitr==kh_end(iter->overlaps) ) |
5956 | 0 | { |
5957 | | // Only add reads where the mate is still to arrive |
5958 | 0 | if (node->b.core.mpos >= node->b.core.pos || |
5959 | 0 | ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) { |
5960 | 0 | int ret; |
5961 | 0 | kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); |
5962 | 0 | if (ret < 0) return -1; |
5963 | 0 | kh_value(iter->overlaps, kitr) = node; |
5964 | 0 | } |
5965 | 0 | } |
5966 | 0 | else |
5967 | 0 | { |
5968 | 0 | lbnode_t *a = kh_value(iter->overlaps, kitr); |
5969 | 0 | int err = tweak_overlap_quality(&a->b, &node->b); |
5970 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
5971 | 0 | assert(a->end-1 == a->s.end); |
5972 | 0 | return err; |
5973 | 0 | } |
5974 | 0 | return 0; |
5975 | 0 | } |
5976 | | |
5977 | | static void overlap_remove(bam_plp_t iter, const bam1_t *b) |
5978 | 0 | { |
5979 | 0 | if ( !iter->overlaps ) return; |
5980 | | |
5981 | 0 | khiter_t kitr; |
5982 | 0 | if ( b ) |
5983 | 0 | { |
5984 | 0 | kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b)); |
5985 | 0 | if ( kitr!=kh_end(iter->overlaps) ) |
5986 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
5987 | 0 | } |
5988 | 0 | else |
5989 | 0 | { |
5990 | | // remove all |
5991 | 0 | for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++) |
5992 | 0 | if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr); |
5993 | 0 | } |
5994 | 0 | } |
5995 | | |
5996 | | |
5997 | | |
5998 | | // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns |
5999 | | // pointer to the piled records if next position is ready or NULL if there is not enough records in the |
6000 | | // buffer yet (the current position is still the maximum position across all buffered reads). |
6001 | | const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
6002 | 0 | { |
6003 | 0 | if (iter->error) { *_n_plp = -1; return NULL; } |
6004 | 0 | *_n_plp = 0; |
6005 | 0 | if (iter->is_eof && iter->head == iter->tail) return NULL; |
6006 | 0 | while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { |
6007 | 0 | int n_plp = 0; |
6008 | | // write iter->plp at iter->pos |
6009 | 0 | lbnode_t **pptr = &iter->head; |
6010 | 0 | while (*pptr != iter->tail) { |
6011 | 0 | lbnode_t *p = *pptr; |
6012 | 0 | if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove |
6013 | 0 | overlap_remove(iter, &p->b); |
6014 | 0 | if (iter->plp_destruct) |
6015 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
6016 | 0 | *pptr = p->next; mp_free(iter->mp, p); |
6017 | 0 | } |
6018 | 0 | else { |
6019 | 0 | if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup |
6020 | 0 | if (n_plp == iter->max_plp) { // then double the capacity |
6021 | 0 | iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; |
6022 | 0 | iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); |
6023 | 0 | } |
6024 | 0 | iter->plp[n_plp].b = &p->b; |
6025 | 0 | iter->plp[n_plp].cd = p->cd; |
6026 | 0 | if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... |
6027 | 0 | } |
6028 | 0 | pptr = &(*pptr)->next; |
6029 | 0 | } |
6030 | 0 | } |
6031 | 0 | *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; |
6032 | | // update iter->tid and iter->pos |
6033 | 0 | if (iter->head != iter->tail) { |
6034 | 0 | if (iter->tid > iter->head->b.core.tid) { |
6035 | 0 | hts_log_error("Unsorted input. Pileup aborts"); |
6036 | 0 | iter->error = 1; |
6037 | 0 | *_n_plp = -1; |
6038 | 0 | return NULL; |
6039 | 0 | } |
6040 | 0 | } |
6041 | 0 | if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence |
6042 | 0 | iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference |
6043 | 0 | } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid |
6044 | 0 | iter->pos = iter->head->beg; // jump to the next position |
6045 | 0 | } else ++iter->pos; // scan contiguously |
6046 | | // return |
6047 | 0 | if (n_plp) return iter->plp; |
6048 | 0 | if (iter->is_eof && iter->head == iter->tail) break; |
6049 | 0 | } |
6050 | 0 | return NULL; |
6051 | 0 | } |
6052 | | |
6053 | | const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
6054 | 0 | { |
6055 | 0 | hts_pos_t pos64 = 0; |
6056 | 0 | const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); |
6057 | 0 | if (pos64 < INT_MAX) { |
6058 | 0 | *_pos = pos64; |
6059 | 0 | } else { |
6060 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6061 | 0 | *_pos = INT_MAX; |
6062 | 0 | iter->error = 1; |
6063 | 0 | *_n_plp = -1; |
6064 | 0 | return NULL; |
6065 | 0 | } |
6066 | 0 | return p; |
6067 | 0 | } |
6068 | | |
6069 | | int bam_plp_push(bam_plp_t iter, const bam1_t *b) |
6070 | 0 | { |
6071 | 0 | if (iter->error) return -1; |
6072 | 0 | if (b) { |
6073 | 0 | if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } |
6074 | | // Skip only unmapped reads here, any additional filtering must be done in iter->func |
6075 | 0 | if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } |
6076 | 0 | if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) |
6077 | 0 | { |
6078 | 0 | overlap_remove(iter, b); |
6079 | 0 | return 0; |
6080 | 0 | } |
6081 | 0 | if (bam_copy1(&iter->tail->b, b) == NULL) |
6082 | 0 | return -1; |
6083 | 0 | iter->tail->b.id = iter->id++; |
6084 | 0 | iter->tail->beg = b->core.pos; |
6085 | | // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1 |
6086 | 0 | iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
6087 | 0 | iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t |
6088 | 0 | if (b->core.tid < iter->max_tid) { |
6089 | 0 | hts_log_error("The input is not sorted (chromosomes out of order)"); |
6090 | 0 | iter->error = 1; |
6091 | 0 | return -1; |
6092 | 0 | } |
6093 | 0 | if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { |
6094 | 0 | hts_log_error("The input is not sorted (reads out of order)"); |
6095 | 0 | iter->error = 1; |
6096 | 0 | return -1; |
6097 | 0 | } |
6098 | 0 | iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; |
6099 | 0 | if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { |
6100 | 0 | lbnode_t *next = mp_alloc(iter->mp); |
6101 | 0 | if (!next) { |
6102 | 0 | iter->error = 1; |
6103 | 0 | return -1; |
6104 | 0 | } |
6105 | 0 | if (iter->plp_construct) { |
6106 | 0 | if (iter->plp_construct(iter->data, &iter->tail->b, |
6107 | 0 | &iter->tail->cd) < 0) { |
6108 | 0 | mp_free(iter->mp, next); |
6109 | 0 | iter->error = 1; |
6110 | 0 | return -1; |
6111 | 0 | } |
6112 | 0 | } |
6113 | 0 | if (overlap_push(iter, iter->tail) < 0) { |
6114 | 0 | mp_free(iter->mp, next); |
6115 | 0 | iter->error = 1; |
6116 | 0 | return -1; |
6117 | 0 | } |
6118 | 0 | iter->tail->next = next; |
6119 | 0 | iter->tail = iter->tail->next; |
6120 | 0 | } |
6121 | 0 | } else iter->is_eof = 1; |
6122 | 0 | return 0; |
6123 | 0 | } |
6124 | | |
6125 | | const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
6126 | 0 | { |
6127 | 0 | const bam_pileup1_t *plp; |
6128 | 0 | if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } |
6129 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6130 | 0 | else { // no pileup line can be obtained; read alignments |
6131 | 0 | *_n_plp = 0; |
6132 | 0 | if (iter->is_eof) return 0; |
6133 | 0 | int ret; |
6134 | 0 | while ( (ret=iter->func(iter->data, iter->b)) >= 0) { |
6135 | 0 | if (bam_plp_push(iter, iter->b) < 0) { |
6136 | 0 | *_n_plp = -1; |
6137 | 0 | return 0; |
6138 | 0 | } |
6139 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6140 | | // otherwise no pileup line can be returned; read the next alignment. |
6141 | 0 | } |
6142 | 0 | if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } |
6143 | 0 | if (bam_plp_push(iter, 0) < 0) { |
6144 | 0 | *_n_plp = -1; |
6145 | 0 | return 0; |
6146 | 0 | } |
6147 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
6148 | 0 | return 0; |
6149 | 0 | } |
6150 | 0 | } |
6151 | | |
6152 | | const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
6153 | 0 | { |
6154 | 0 | hts_pos_t pos64 = 0; |
6155 | 0 | const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); |
6156 | 0 | if (pos64 < INT_MAX) { |
6157 | 0 | *_pos = pos64; |
6158 | 0 | } else { |
6159 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6160 | 0 | *_pos = INT_MAX; |
6161 | 0 | iter->error = 1; |
6162 | 0 | *_n_plp = -1; |
6163 | 0 | return NULL; |
6164 | 0 | } |
6165 | 0 | return p; |
6166 | 0 | } |
6167 | | |
6168 | | void bam_plp_reset(bam_plp_t iter) |
6169 | 0 | { |
6170 | 0 | overlap_remove(iter, NULL); |
6171 | 0 | iter->max_tid = iter->max_pos = -1; |
6172 | 0 | iter->tid = iter->pos = 0; |
6173 | 0 | iter->is_eof = 0; |
6174 | 0 | while (iter->head != iter->tail) { |
6175 | 0 | lbnode_t *p = iter->head; |
6176 | 0 | iter->head = p->next; |
6177 | 0 | mp_free(iter->mp, p); |
6178 | 0 | } |
6179 | 0 | } |
6180 | | |
6181 | | void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) |
6182 | 0 | { |
6183 | 0 | iter->maxcnt = maxcnt; |
6184 | 0 | } |
6185 | | |
6186 | | /************************ |
6187 | | *** Mpileup iterator *** |
6188 | | ************************/ |
6189 | | |
6190 | | struct bam_mplp_s { |
6191 | | int n; |
6192 | | int32_t min_tid, *tid; |
6193 | | hts_pos_t min_pos, *pos; |
6194 | | bam_plp_t *iter; |
6195 | | int *n_plp; |
6196 | | const bam_pileup1_t **plp; |
6197 | | }; |
6198 | | |
6199 | | bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) |
6200 | 0 | { |
6201 | 0 | int i; |
6202 | 0 | bam_mplp_t iter; |
6203 | 0 | iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s)); |
6204 | 0 | iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); |
6205 | 0 | iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); |
6206 | 0 | iter->n_plp = (int*)calloc(n, sizeof(int)); |
6207 | 0 | iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); |
6208 | 0 | iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); |
6209 | 0 | iter->n = n; |
6210 | 0 | iter->min_pos = HTS_POS_MAX; |
6211 | 0 | iter->min_tid = (uint32_t)-1; |
6212 | 0 | for (i = 0; i < n; ++i) { |
6213 | 0 | iter->iter[i] = bam_plp_init(func, data[i]); |
6214 | 0 | iter->pos[i] = iter->min_pos; |
6215 | 0 | iter->tid[i] = iter->min_tid; |
6216 | 0 | } |
6217 | 0 | return iter; |
6218 | 0 | } |
6219 | | |
6220 | | int bam_mplp_init_overlaps(bam_mplp_t iter) |
6221 | 0 | { |
6222 | 0 | int i, r = 0; |
6223 | 0 | for (i = 0; i < iter->n; ++i) |
6224 | 0 | r |= bam_plp_init_overlaps(iter->iter[i]); |
6225 | 0 | return r == 0 ? 0 : -1; |
6226 | 0 | } |
6227 | | |
6228 | | void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) |
6229 | 0 | { |
6230 | 0 | int i; |
6231 | 0 | for (i = 0; i < iter->n; ++i) |
6232 | 0 | iter->iter[i]->maxcnt = maxcnt; |
6233 | 0 | } |
6234 | | |
6235 | | void bam_mplp_destroy(bam_mplp_t iter) |
6236 | 0 | { |
6237 | 0 | int i; |
6238 | 0 | for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); |
6239 | 0 | free(iter->iter); free(iter->pos); free(iter->tid); |
6240 | 0 | free(iter->n_plp); free(iter->plp); |
6241 | 0 | free(iter); |
6242 | 0 | } |
6243 | | |
6244 | | int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) |
6245 | 0 | { |
6246 | 0 | int i, ret = 0; |
6247 | 0 | hts_pos_t new_min_pos = HTS_POS_MAX; |
6248 | 0 | uint32_t new_min_tid = (uint32_t)-1; |
6249 | 0 | for (i = 0; i < iter->n; ++i) { |
6250 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6251 | 0 | int tid; |
6252 | 0 | hts_pos_t pos; |
6253 | 0 | iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); |
6254 | 0 | if ( iter->iter[i]->error ) return -1; |
6255 | 0 | if (iter->plp[i]) { |
6256 | 0 | iter->tid[i] = tid; |
6257 | 0 | iter->pos[i] = pos; |
6258 | 0 | } else { |
6259 | 0 | iter->tid[i] = 0; |
6260 | 0 | iter->pos[i] = 0; |
6261 | 0 | } |
6262 | 0 | } |
6263 | 0 | if (iter->plp[i]) { |
6264 | 0 | if (iter->tid[i] < new_min_tid) { |
6265 | 0 | new_min_tid = iter->tid[i]; |
6266 | 0 | new_min_pos = iter->pos[i]; |
6267 | 0 | } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) { |
6268 | 0 | new_min_pos = iter->pos[i]; |
6269 | 0 | } |
6270 | 0 | } |
6271 | 0 | } |
6272 | 0 | iter->min_pos = new_min_pos; |
6273 | 0 | iter->min_tid = new_min_tid; |
6274 | 0 | if (new_min_pos == HTS_POS_MAX) return 0; |
6275 | 0 | *_tid = new_min_tid; *_pos = new_min_pos; |
6276 | 0 | for (i = 0; i < iter->n; ++i) { |
6277 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6278 | 0 | n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; |
6279 | 0 | ++ret; |
6280 | 0 | } else n_plp[i] = 0, plp[i] = 0; |
6281 | 0 | } |
6282 | 0 | return ret; |
6283 | 0 | } |
6284 | | |
6285 | | int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) |
6286 | 0 | { |
6287 | 0 | hts_pos_t pos64 = 0; |
6288 | 0 | int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); |
6289 | 0 | if (ret >= 0) { |
6290 | 0 | if (pos64 < INT_MAX) { |
6291 | 0 | *_pos = pos64; |
6292 | 0 | } else { |
6293 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6294 | 0 | *_pos = INT_MAX; |
6295 | 0 | return -1; |
6296 | 0 | } |
6297 | 0 | } |
6298 | 0 | return ret; |
6299 | 0 | } |
6300 | | |
6301 | | void bam_mplp_reset(bam_mplp_t iter) |
6302 | 0 | { |
6303 | 0 | int i; |
6304 | 0 | iter->min_pos = HTS_POS_MAX; |
6305 | 0 | iter->min_tid = (uint32_t)-1; |
6306 | 0 | for (i = 0; i < iter->n; ++i) { |
6307 | 0 | bam_plp_reset(iter->iter[i]); |
6308 | 0 | iter->pos[i] = HTS_POS_MAX; |
6309 | 0 | iter->tid[i] = (uint32_t)-1; |
6310 | 0 | iter->n_plp[i] = 0; |
6311 | 0 | iter->plp[i] = NULL; |
6312 | 0 | } |
6313 | 0 | } |
6314 | | |
6315 | | void bam_mplp_constructor(bam_mplp_t iter, |
6316 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6317 | 0 | int i; |
6318 | 0 | for (i = 0; i < iter->n; ++i) |
6319 | 0 | bam_plp_constructor(iter->iter[i], func); |
6320 | 0 | } |
6321 | | |
6322 | | void bam_mplp_destructor(bam_mplp_t iter, |
6323 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6324 | 0 | int i; |
6325 | 0 | for (i = 0; i < iter->n; ++i) |
6326 | 0 | bam_plp_destructor(iter->iter[i], func); |
6327 | 0 | } |
6328 | | |
6329 | | #endif // ~!defined(BAM_NO_PILEUP) |