Line | Count | Source (jump to first uncovered line) |
1 | | /* sam.c -- SAM and BAM file I/O and manipulation. |
2 | | |
3 | | Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd. |
4 | | Copyright (C) 2010, 2012, 2013 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <strings.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <errno.h> |
34 | | #include <zlib.h> |
35 | | #include <assert.h> |
36 | | #include <signal.h> |
37 | | #include <inttypes.h> |
38 | | #include <unistd.h> |
39 | | |
40 | | // Suppress deprecation message for cigar_tab, which we initialise |
41 | | #include "htslib/hts_defs.h" |
42 | | #undef HTS_DEPRECATED |
43 | | #define HTS_DEPRECATED(message) |
44 | | |
45 | | #include "htslib/sam.h" |
46 | | #include "htslib/bgzf.h" |
47 | | #include "cram/cram.h" |
48 | | #include "hts_internal.h" |
49 | | #include "sam_internal.h" |
50 | | #include "htslib/hfile.h" |
51 | | #include "htslib/hts_endian.h" |
52 | | #include "htslib/hts_expr.h" |
53 | | #include "header.h" |
54 | | |
55 | | #include "htslib/khash.h" |
56 | | KHASH_DECLARE(s2i, kh_cstr_t, int64_t) |
57 | | KHASH_SET_INIT_INT(tag) |
58 | | |
59 | | #ifndef EFTYPE |
60 | 0 | #define EFTYPE ENOEXEC |
61 | | #endif |
62 | | #ifndef EOVERFLOW |
63 | | #define EOVERFLOW ERANGE |
64 | | #endif |
65 | | |
66 | | /********************** |
67 | | *** BAM header I/O *** |
68 | | **********************/ |
69 | | |
/*
 * Lookup table indexed by byte value (0..255).
 *
 * The entries for the characters '=', 'B', 'D', 'H', 'I', 'M', 'N', 'P',
 * 'S' and 'X' hold the corresponding BAM_C* constant; every other entry
 * is -1.  sam_hdr_init() in this file stores a pointer to this table in
 * sam_hdr_t::cigar_tab.
 */
70 | | HTSLIB_EXPORT |
71 | | const int8_t bam_cigar_table[256] = { |
72 | | // 0 .. 47 |
73 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
74 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
75 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
76 | | |
77 | | // 48 .. 63 (including =) |
78 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1, |
79 | | |
80 | | // 64 .. 79 (including MIDNHB) |
81 | | -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1, |
82 | | BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1, |
83 | | |
84 | | // 80 .. 95 (including SPX) |
85 | | BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1, |
86 | | BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1, |
87 | | |
88 | | // 96 .. 127 |
89 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
90 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
91 | | |
92 | | // 128 .. 255 |
93 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
94 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
95 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
96 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
97 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
98 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
99 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
100 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
101 | | }; |
102 | | |
103 | | sam_hdr_t *sam_hdr_init() |
104 | 1.16k | { |
105 | 1.16k | sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); |
106 | 1.16k | if (bh == NULL) return NULL; |
107 | | |
108 | 1.16k | bh->cigar_tab = bam_cigar_table; |
109 | 1.16k | return bh; |
110 | 1.16k | } |
111 | | |
112 | | void sam_hdr_destroy(sam_hdr_t *bh) |
113 | 4.69k | { |
114 | 4.69k | int32_t i; |
115 | | |
116 | 4.69k | if (bh == NULL) return; |
117 | | |
118 | 1.65k | if (bh->ref_count > 0) { |
119 | 485 | --bh->ref_count; |
120 | 485 | return; |
121 | 485 | } |
122 | | |
123 | 1.16k | if (bh->target_name) { |
124 | 2.54k | for (i = 0; i < bh->n_targets; ++i) |
125 | 2.22k | free(bh->target_name[i]); |
126 | 327 | free(bh->target_name); |
127 | 327 | free(bh->target_len); |
128 | 327 | } |
129 | 1.16k | free(bh->text); |
130 | 1.16k | if (bh->hrecs) |
131 | 1.01k | sam_hrecs_free(bh->hrecs); |
132 | 1.16k | if (bh->sdict) |
133 | 1.16k | kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); |
134 | 1.16k | free(bh); |
135 | 1.16k | } |
136 | | |
137 | | // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long |
138 | | // references before sam_hdr_t::hrecs is populated |
139 | | int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) |
140 | 0 | { |
141 | 0 | const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; |
142 | 0 | khash_t(s2i) *dest_long_refs = kh_init(s2i); |
143 | 0 | int i; |
144 | 0 | if (!dest_long_refs) return -1; |
145 | | |
146 | 0 | for (i = 0; i < h->n_targets; i++) { |
147 | 0 | int ret; |
148 | 0 | khiter_t ksrc, kdest; |
149 | 0 | if (h->target_len[i] < UINT32_MAX) continue; |
150 | 0 | ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); |
151 | 0 | if (ksrc == kh_end(src_long_refs)) continue; |
152 | 0 | kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret); |
153 | 0 | if (ret < 0) { |
154 | 0 | kh_destroy(s2i, dest_long_refs); |
155 | 0 | return -1; |
156 | 0 | } |
157 | 0 | kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); |
158 | 0 | } |
159 | | |
160 | 0 | h->sdict = dest_long_refs; |
161 | 0 | return 0; |
162 | 0 | } |
163 | | |
164 | | sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) |
165 | 245 | { |
166 | 245 | if (h0 == NULL) return NULL; |
167 | 245 | sam_hdr_t *h; |
168 | 245 | if ((h = sam_hdr_init()) == NULL) return NULL; |
169 | | // copy the simple data |
170 | 245 | h->n_targets = 0; |
171 | 245 | h->ignore_sam_err = h0->ignore_sam_err; |
172 | 245 | h->l_text = 0; |
173 | | |
174 | | // Then the pointery stuff |
175 | | |
176 | 245 | if (!h0->hrecs) { |
177 | 0 | h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t)); |
178 | 0 | if (!h->target_len) goto fail; |
179 | 0 | h->target_name = (char**)calloc(h0->n_targets, sizeof(char*)); |
180 | 0 | if (!h->target_name) goto fail; |
181 | | |
182 | 0 | int i; |
183 | 0 | for (i = 0; i < h0->n_targets; ++i) { |
184 | 0 | h->target_len[i] = h0->target_len[i]; |
185 | 0 | h->target_name[i] = strdup(h0->target_name[i]); |
186 | 0 | if (!h->target_name[i]) break; |
187 | 0 | } |
188 | 0 | h->n_targets = i; |
189 | 0 | if (i < h0->n_targets) goto fail; |
190 | | |
191 | 0 | if (h0->sdict) { |
192 | 0 | if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; |
193 | 0 | } |
194 | 0 | } |
195 | | |
196 | 245 | if (h0->hrecs) { |
197 | 245 | kstring_t tmp = { 0, 0, NULL }; |
198 | 245 | if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) { |
199 | 0 | free(ks_release(&tmp)); |
200 | 0 | goto fail; |
201 | 0 | } |
202 | | |
203 | 245 | h->l_text = tmp.l; |
204 | 245 | h->text = ks_release(&tmp); |
205 | | |
206 | 245 | if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0) |
207 | 0 | goto fail; |
208 | 245 | } else { |
209 | 0 | h->l_text = h0->l_text; |
210 | 0 | h->text = malloc(h->l_text + 1); |
211 | 0 | if (!h->text) goto fail; |
212 | 0 | memcpy(h->text, h0->text, h->l_text); |
213 | 0 | h->text[h->l_text] = '\0'; |
214 | 0 | } |
215 | | |
216 | 245 | return h; |
217 | | |
218 | 0 | fail: |
219 | 0 | sam_hdr_destroy(h); |
220 | 0 | return NULL; |
221 | 245 | } |
222 | | |
/*
 * Read a BAM header from fp.
 *
 * Steps, in the order coded below: probe for the BGZF EOF marker (a probe
 * error or a missing marker is only reported, never fatal), check the
 * 4-byte magic "BAM\1", read the 4-byte text length and the text, read the
 * reference count, then for each reference read its name length, its name
 * and its 32-bit length.  Multi-byte fields read directly into the header
 * are byte-swapped when fp->is_be is set.
 *
 * Returns the newly allocated header on success, or NULL after logging an
 * error.
 */
223 | | sam_hdr_t *bam_hdr_read(BGZF *fp) |
224 | 52 | { |
225 | 52 | sam_hdr_t *h; |
226 | 52 | uint8_t buf[4]; |
227 | 52 | int magic_len, has_EOF; |
228 | 52 | int32_t i, name_len, num_names = 0; |
229 | 52 | size_t bufsize; |
230 | 52 | ssize_t bytes; |
231 | | // check EOF |
232 | 52 | has_EOF = bgzf_check_EOF(fp); |
233 | 52 | if (has_EOF < 0) { |
234 | 0 | perror("[W::bam_hdr_read] bgzf_check_EOF"); |
235 | 52 | } else if (has_EOF == 0) { |
236 | 52 | hts_log_warning("EOF marker is absent. The input is probably truncated"); |
237 | 52 | } |
238 | | // read "BAM1" |
239 | 52 | magic_len = bgzf_read(fp, buf, 4); |
240 | 52 | if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) { |
241 | 0 | hts_log_error("Invalid BAM binary header"); |
242 | 0 | return 0; |
243 | 0 | } |
244 | 52 | h = sam_hdr_init(); |
245 | 52 | if (!h) goto nomem; |
246 | | |
247 | | // read plain text and the number of reference sequences |
248 | 52 | bytes = bgzf_read(fp, buf, 4); |
249 | 52 | if (bytes != 4) goto read_err; |
250 | 52 | h->l_text = le_to_u32(buf); |
251 | | |
252 | 52 | bufsize = h->l_text + 1; |
253 | 52 | if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed |
254 | 52 | h->text = (char*)malloc(bufsize); |
255 | 52 | if (!h->text) goto nomem; |
256 | 52 | h->text[h->l_text] = 0; // make sure it is NUL terminated |
257 | 52 | bytes = bgzf_read(fp, h->text, h->l_text); |
258 | 52 | if (bytes != h->l_text) goto read_err; |
259 | | |
260 | 45 | bytes = bgzf_read(fp, &h->n_targets, 4); |
261 | 45 | if (bytes != 4) goto read_err; |
262 | 45 | if (fp->is_be) ed_swap_4p(&h->n_targets); |
263 | | |
264 | 45 | if (h->n_targets < 0) goto invalid; |
265 | | |
266 | | // read reference sequence names and lengths |
267 | 44 | if (h->n_targets > 0) { |
268 | 14 | h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); |
269 | 14 | if (!h->target_name) goto nomem; |
270 | 14 | h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); |
271 | 14 | if (!h->target_len) goto nomem; |
272 | 14 | } |
273 | 30 | else { |
274 | 30 | h->target_name = NULL; |
275 | 30 | h->target_len = NULL; |
276 | 30 | } |
277 | | |
278 | 189 | for (i = 0; i != h->n_targets; ++i) { |
279 | 155 | bytes = bgzf_read(fp, &name_len, 4); |
280 | 155 | if (bytes != 4) goto read_err; |
281 | 155 | if (fp->is_be) ed_swap_4p(&name_len); |
282 | 155 | if (name_len <= 0) goto invalid; |
283 | | |
284 | 155 | h->target_name[i] = (char*)malloc(name_len); |
285 | 155 | if (!h->target_name[i]) goto nomem; |
/* Count the names actually allocated; the clean-up path uses this count. */
286 | 155 | num_names++; |
287 | | |
288 | 155 | bytes = bgzf_read(fp, h->target_name[i], name_len); |
289 | 155 | if (bytes != name_len) goto read_err; |
290 | | |
291 | 145 | if (h->target_name[i][name_len - 1] != '\0') { |
292 | | /* Fix missing NUL-termination. Is this being too nice? |
293 | | We could alternatively bail out with an error. */ |
294 | 44 | char *new_name; |
/* name_len + 1 below must stay representable in int32_t. */
295 | 44 | if (name_len == INT32_MAX) goto invalid; |
296 | 44 | new_name = realloc(h->target_name[i], name_len + 1); |
297 | 44 | if (new_name == NULL) goto nomem; |
298 | 44 | h->target_name[i] = new_name; |
299 | 44 | h->target_name[i][name_len] = '\0'; |
300 | 44 | } |
301 | | |
302 | 145 | bytes = bgzf_read(fp, &h->target_len[i], 4); |
303 | 145 | if (bytes != 4) goto read_err; |
304 | 145 | if (fp->is_be) ed_swap_4p(&h->target_len[i]); |
305 | 145 | } |
306 | 34 | return h; |
307 | | |
/* Error exits: each label logs its own message, then all share "clean". */
308 | 0 | nomem: |
309 | 0 | hts_log_error("Out of memory"); |
310 | 0 | goto clean; |
311 | | |
312 | 17 | read_err: |
313 | 17 | if (bytes < 0) { |
314 | 8 | hts_log_error("Error reading BGZF stream"); |
315 | 9 | } else { |
316 | 9 | hts_log_error("Truncated BAM header"); |
317 | 9 | } |
318 | 17 | goto clean; |
319 | | |
320 | 1 | invalid: |
321 | 1 | hts_log_error("Invalid BAM binary header"); |
322 | | |
323 | 18 | clean: |
324 | 18 | if (h != NULL) { |
325 | 18 | h->n_targets = num_names; // ensure we free only allocated target_names |
326 | 18 | sam_hdr_destroy(h); |
327 | 18 | } |
328 | 18 | return NULL; |
329 | 1 | } |
330 | | |
331 | | int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) |
332 | 0 | { |
333 | 0 | int32_t i, name_len, x; |
334 | 0 | kstring_t hdr_ks = { 0, 0, NULL }; |
335 | 0 | char *text; |
336 | 0 | uint32_t l_text; |
337 | |
|
338 | 0 | if (!h) return -1; |
339 | | |
340 | 0 | if (h->hrecs) { |
341 | 0 | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; |
342 | 0 | if (hdr_ks.l > UINT32_MAX) { |
343 | 0 | hts_log_error("Header too long for BAM format"); |
344 | 0 | free(hdr_ks.s); |
345 | 0 | return -1; |
346 | 0 | } else if (hdr_ks.l > INT32_MAX) { |
347 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
348 | 0 | hts_log_warning("Output file may not be portable"); |
349 | 0 | } |
350 | 0 | text = hdr_ks.s; |
351 | 0 | l_text = hdr_ks.l; |
352 | 0 | } else { |
353 | 0 | if (h->l_text > UINT32_MAX) { |
354 | 0 | hts_log_error("Header too long for BAM format"); |
355 | 0 | return -1; |
356 | 0 | } else if (h->l_text > INT32_MAX) { |
357 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
358 | 0 | hts_log_warning("Output file may not be portable"); |
359 | 0 | } |
360 | 0 | text = h->text; |
361 | 0 | l_text = h->l_text; |
362 | 0 | } |
363 | | // write "BAM1" |
364 | 0 | if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; } |
365 | | // write plain text and the number of reference sequences |
366 | 0 | if (fp->is_be) { |
367 | 0 | x = ed_swap_4(l_text); |
368 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
369 | 0 | if (l_text) { |
370 | 0 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
371 | 0 | } |
372 | 0 | x = ed_swap_4(h->n_targets); |
373 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
374 | 0 | } else { |
375 | 0 | if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; } |
376 | 0 | if (l_text) { |
377 | 0 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
378 | 0 | } |
379 | 0 | if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; } |
380 | 0 | } |
381 | 0 | free(hdr_ks.s); |
382 | | // write sequence names and lengths |
383 | 0 | for (i = 0; i != h->n_targets; ++i) { |
384 | 0 | char *p = h->target_name[i]; |
385 | 0 | name_len = strlen(p) + 1; |
386 | 0 | if (fp->is_be) { |
387 | 0 | x = ed_swap_4(name_len); |
388 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
389 | 0 | } else { |
390 | 0 | if (bgzf_write(fp, &name_len, 4) < 0) return -1; |
391 | 0 | } |
392 | 0 | if (bgzf_write(fp, p, name_len) < 0) return -1; |
393 | 0 | if (fp->is_be) { |
394 | 0 | x = ed_swap_4(h->target_len[i]); |
395 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
396 | 0 | } else { |
397 | 0 | if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1; |
398 | 0 | } |
399 | 0 | } |
400 | 0 | if (bgzf_flush(fp) < 0) return -1; |
401 | 0 | return 0; |
402 | 0 | } |
403 | | |
404 | | const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, |
405 | 0 | hts_pos_t *beg, hts_pos_t *end, int flags) { |
406 | 0 | return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); |
407 | 0 | } |
408 | | |
409 | | /************************* |
410 | | *** BAM alignment I/O *** |
411 | | *************************/ |
412 | | |
413 | | bam1_t *bam_init1() |
414 | 885 | { |
415 | 885 | return (bam1_t*)calloc(1, sizeof(bam1_t)); |
416 | 885 | } |
417 | | |
418 | | int sam_realloc_bam_data(bam1_t *b, size_t desired) |
419 | 2.41k | { |
420 | 2.41k | uint32_t new_m_data; |
421 | 2.41k | uint8_t *new_data; |
422 | 2.41k | new_m_data = desired; |
423 | 2.41k | kroundup32(new_m_data); |
424 | 2.41k | if (new_m_data < desired) { |
425 | 0 | errno = ENOMEM; // Not strictly true but we can't store the size |
426 | 0 | return -1; |
427 | 0 | } |
428 | 2.41k | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
429 | 2.41k | new_data = realloc(b->data, new_m_data); |
430 | 2.41k | } else { |
431 | 0 | if ((new_data = malloc(new_m_data)) != NULL) { |
432 | 0 | if (b->l_data > 0) |
433 | 0 | memcpy(new_data, b->data, |
434 | 0 | b->l_data < b->m_data ? b->l_data : b->m_data); |
435 | 0 | bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); |
436 | 0 | } |
437 | 0 | } |
438 | 2.41k | if (!new_data) return -1; |
439 | 2.41k | b->data = new_data; |
440 | 2.41k | b->m_data = new_m_data; |
441 | 2.41k | return 0; |
442 | 2.41k | } |
443 | | |
444 | | void bam_destroy1(bam1_t *b) |
445 | 885 | { |
446 | 885 | if (b == 0) return; |
447 | 885 | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
448 | 885 | free(b->data); |
449 | 885 | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) { |
450 | | // In case of reuse |
451 | 0 | b->data = NULL; |
452 | 0 | b->m_data = 0; |
453 | 0 | b->l_data = 0; |
454 | 0 | } |
455 | 885 | } |
456 | | |
457 | 885 | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0) |
458 | 885 | free(b); |
459 | 885 | } |
460 | | |
461 | | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) |
462 | 0 | { |
463 | 0 | if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL; |
464 | 0 | memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data |
465 | 0 | memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest |
466 | 0 | bdst->l_data = bsrc->l_data; |
467 | 0 | bdst->id = bsrc->id; |
468 | 0 | return bdst; |
469 | 0 | } |
470 | | |
471 | | bam1_t *bam_dup1(const bam1_t *bsrc) |
472 | 0 | { |
473 | 0 | if (bsrc == NULL) return NULL; |
474 | 0 | bam1_t *bdst = bam_init1(); |
475 | 0 | if (bdst == NULL) return NULL; |
476 | 0 | if (bam_copy1(bdst, bsrc) == NULL) { |
477 | 0 | bam_destroy1(bdst); |
478 | 0 | return NULL; |
479 | 0 | } |
480 | 0 | return bdst; |
481 | 0 | } |
482 | | |
483 | | static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, |
484 | | hts_pos_t *rlen, hts_pos_t *qlen) |
485 | 299 | { |
486 | 299 | int k; |
487 | 299 | *rlen = *qlen = 0; |
488 | 86.4k | for (k = 0; k < n_cigar; ++k) { |
489 | 86.1k | int type = bam_cigar_type(bam_cigar_op(cigar[k])); |
490 | 86.1k | int len = bam_cigar_oplen(cigar[k]); |
491 | 86.1k | if (type & 1) *qlen += len; |
492 | 86.1k | if (type & 2) *rlen += len; |
493 | 86.1k | } |
494 | 299 | } |
495 | | |
/*
 * Take `length` away from the remaining budget *limit.
 * Returns 0 and updates *limit when the budget is sufficient; returns -1
 * and leaves *limit untouched when subtracting would go below zero.
 */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
505 | | |
506 | | int bam_set1(bam1_t *bam, |
507 | | size_t l_qname, const char *qname, |
508 | | uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, |
509 | | size_t n_cigar, const uint32_t *cigar, |
510 | | int32_t mtid, hts_pos_t mpos, hts_pos_t isize, |
511 | | size_t l_seq, const char *seq, const char *qual, |
512 | | size_t l_aux) |
513 | 11.7k | { |
514 | | // use a default qname "*" if none is provided |
515 | 11.7k | if (l_qname == 0) { |
516 | 10.1k | l_qname = 1; |
517 | 10.1k | qname = "*"; |
518 | 10.1k | } |
519 | | |
520 | | // note: the qname is stored nul terminated and padded as described in the |
521 | | // documentation for the bam1_t struct. |
522 | 11.7k | size_t qname_nuls = 4 - l_qname % 4; |
523 | | |
524 | | // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). |
525 | | // can't use bam_endpos() directly as some fields not yet set up. |
526 | 11.7k | hts_pos_t rlen = 0, qlen = 0; |
527 | 11.7k | if (!(flag & BAM_FUNMAP)) { |
528 | 0 | bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); |
529 | 0 | } |
530 | 11.7k | if (rlen == 0) { |
531 | 11.7k | rlen = 1; |
532 | 11.7k | } |
533 | | |
534 | | // validate parameters |
535 | 11.7k | if (l_qname > 254) { |
536 | 8 | hts_log_error("Query name too long"); |
537 | 8 | errno = EINVAL; |
538 | 8 | return -1; |
539 | 8 | } |
540 | 11.7k | if (HTS_POS_MAX - rlen <= pos) { |
541 | 0 | hts_log_error("Read ends beyond highest supported position"); |
542 | 0 | errno = EINVAL; |
543 | 0 | return -1; |
544 | 0 | } |
545 | 11.7k | if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { |
546 | 0 | hts_log_error("Mapped query must have a CIGAR"); |
547 | 0 | errno = EINVAL; |
548 | 0 | return -1; |
549 | 0 | } |
550 | 11.7k | if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { |
551 | 0 | hts_log_error("CIGAR and query sequence are of different length"); |
552 | 0 | errno = EINVAL; |
553 | 0 | return -1; |
554 | 0 | } |
555 | | |
556 | 11.7k | size_t limit = INT32_MAX; |
557 | 11.7k | int u = subtract_check_underflow(l_qname + qname_nuls, &limit); |
558 | 11.7k | u += subtract_check_underflow(n_cigar * 4, &limit); |
559 | 11.7k | u += subtract_check_underflow((l_seq + 1) / 2, &limit); |
560 | 11.7k | u += subtract_check_underflow(l_seq, &limit); |
561 | 11.7k | u += subtract_check_underflow(l_aux, &limit); |
562 | 11.7k | if (u != 0) { |
563 | 0 | hts_log_error("Size overflow"); |
564 | 0 | errno = EINVAL; |
565 | 0 | return -1; |
566 | 0 | } |
567 | | |
568 | | // re-allocate the data buffer as needed. |
569 | 11.7k | size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; |
570 | 11.7k | if (realloc_bam_data(bam, data_len + l_aux) < 0) { |
571 | 0 | return -1; |
572 | 0 | } |
573 | | |
574 | 11.7k | bam->l_data = (int)data_len; |
575 | 11.7k | bam->core.pos = pos; |
576 | 11.7k | bam->core.tid = tid; |
577 | 11.7k | bam->core.bin = bam_reg2bin(pos, pos + rlen); |
578 | 11.7k | bam->core.qual = mapq; |
579 | 11.7k | bam->core.l_extranul = (uint8_t)(qname_nuls - 1); |
580 | 11.7k | bam->core.flag = flag; |
581 | 11.7k | bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); |
582 | 11.7k | bam->core.n_cigar = (uint32_t)n_cigar; |
583 | 11.7k | bam->core.l_qseq = (int32_t)l_seq; |
584 | 11.7k | bam->core.mtid = mtid; |
585 | 11.7k | bam->core.mpos = mpos; |
586 | 11.7k | bam->core.isize = isize; |
587 | | |
588 | 11.7k | uint8_t *cp = bam->data; |
589 | 11.7k | strncpy((char *)cp, qname, l_qname); |
590 | 11.7k | int i; |
591 | 46.2k | for (i = 0; i < qname_nuls; i++) { |
592 | 34.4k | cp[l_qname + i] = '\0'; |
593 | 34.4k | } |
594 | 11.7k | cp += l_qname + qname_nuls; |
595 | | |
596 | 11.7k | if (n_cigar > 0) { |
597 | 0 | memcpy(cp, cigar, n_cigar * 4); |
598 | 0 | } |
599 | 11.7k | cp += n_cigar * 4; |
600 | | |
601 | 99.0M | for (i = 0; i + 1 < l_seq; i += 2) { |
602 | 99.0M | *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; |
603 | 99.0M | } |
604 | 13.4k | for (; i < l_seq; i++) { |
605 | 1.62k | *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; |
606 | 1.62k | } |
607 | | |
608 | 11.7k | if (qual) { |
609 | 199 | memcpy(cp, qual, l_seq); |
610 | 199 | } |
611 | 11.5k | else { |
612 | 11.5k | memset(cp, '\xff', l_seq); |
613 | 11.5k | } |
614 | | |
615 | 11.7k | return (int)data_len; |
616 | 11.7k | } |
617 | | |
618 | | hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) |
619 | 4.68k | { |
620 | 4.68k | int k; |
621 | 4.68k | hts_pos_t l; |
622 | 4.94k | for (k = l = 0; k < n_cigar; ++k) |
623 | 253 | if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) |
624 | 253 | l += bam_cigar_oplen(cigar[k]); |
625 | 4.68k | return l; |
626 | 4.68k | } |
627 | | |
628 | | hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) |
629 | 836 | { |
630 | 836 | int k; |
631 | 836 | hts_pos_t l; |
632 | 84.4k | for (k = l = 0; k < n_cigar; ++k) |
633 | 83.6k | if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) |
634 | 82.4k | l += bam_cigar_oplen(cigar[k]); |
635 | 836 | return l; |
636 | 836 | } |
637 | | |
638 | | hts_pos_t bam_endpos(const bam1_t *b) |
639 | 350 | { |
640 | 350 | hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
641 | 350 | if (rlen == 0) rlen = 1; |
642 | 350 | return b->core.pos + rlen; |
643 | 350 | } |
644 | | |
/*
 * Replace a placeholder CIGAR with the real one held in a "CG" aux tag.
 *
 * The record is only changed when all of these hold: it has a CIGAR, tid
 * and pos are non-negative, the first CIGAR operation is a soft clip whose
 * length equals l_qseq, and a CG tag of type B with subtype I or i exists
 * whose element count is at least n_cigar and below 1<<29.  In that case
 * the tag's values are moved into the CIGAR position and the tag's bytes
 * are dropped from the record.
 *
 * recal_bin:    when non-zero, core.bin is recomputed after the move.
 * give_warning: when non-zero, a message naming the record is logged.
 *
 * Returns 1 if the CIGAR was replaced, 0 if the record was left untouched,
 * and -1 if bam_aux_get() failed with an errno other than ENOENT or the
 * data buffer could not be expanded.
 */
645 | | static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG |
646 | 5.98k | { |
647 | 5.98k | bam1_core_t *c = &b->core; |
648 | 5.98k | uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; |
649 | 5.98k | uint8_t *CG; |
650 | | |
651 | | // test where there is a real CIGAR in the CG tag to move |
652 | 5.98k | if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; |
653 | 1.13k | cigar0 = bam_get_cigar(b); |
654 | 1.13k | if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; |
655 | 970 | fake_bytes = c->n_cigar * 4; |
/* errno is saved so that a plain "tag not present" result leaves it unchanged. */
656 | 970 | int saved_errno = errno; |
657 | 970 | CG = bam_aux_get(b, "CG"); |
658 | 970 | if (!CG) { |
659 | 352 | if (errno != ENOENT) return -1; // Bad aux data |
660 | 352 | errno = saved_errno; // restore errno on expected no-CG-tag case |
661 | 352 | return 0; |
662 | 352 | } |
663 | 618 | if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) |
664 | 200 | return 0; // not of type B,I |
665 | 418 | CG_len = le_to_u32(CG + 2); |
666 | 418 | if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length |
667 | | |
668 | | // move from the CG tag to the right position |
/* All positions below are byte offsets from b->data, so they stay valid if the buffer is reallocated. */
669 | 350 | cigar_st = (uint8_t*)cigar0 - b->data; |
670 | 350 | c->n_cigar = CG_len; |
671 | 350 | n_cigar4 = c->n_cigar * 4; |
672 | 350 | CG_st = CG - b->data - 2; |
673 | 350 | CG_en = CG_st + 8 + n_cigar4; |
674 | 350 | if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1; |
675 | 350 | b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place |
676 | 350 | memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room |
677 | 350 | memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR |
678 | 350 | if (ori_len > CG_en) // move data after the CG tag |
679 | 26 | memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); |
680 | 350 | b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) |
681 | 350 | if (recal_bin) |
682 | 350 | b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5); |
683 | 350 | if (give_warning) |
684 | 350 | hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); |
685 | 350 | return 1; |
686 | 350 | } |
687 | | |
/*
 * Map an aux type code to a size.
 * Returns 1, 2, 4 or 8 for the fixed-width type codes.  For 'Z', 'H' and
 * 'B' the type code itself is returned, and 0 is returned for any other
 * code.
 */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'A' || type == 'c' || type == 'C')
        return 1;
    if (type == 's' || type == 'S')
        return 2;
    if (type == 'i' || type == 'I' || type == 'f')
        return 4;
    if (type == 'd')
        return 8;
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;
    return 0;
}
705 | | |
706 | | static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) |
707 | 0 | { |
708 | 0 | uint32_t *cigar = (uint32_t*)(data + c->l_qname); |
709 | 0 | uint32_t i; |
710 | 0 | for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); |
711 | 0 | } |
712 | | |
713 | | // Fix bad records where qname is not terminated correctly. |
714 | 229 | static int fixup_missing_qname_nul(bam1_t *b) { |
715 | 229 | bam1_core_t *c = &b->core; |
716 | | |
717 | | // Note this is called before c->l_extranul is added to c->l_qname |
718 | 229 | if (c->l_extranul > 0) { |
719 | 184 | b->data[c->l_qname++] = '\0'; |
720 | 184 | c->l_extranul--; |
721 | 184 | } else { |
722 | 45 | if (b->l_data > INT_MAX - 4) return -1; |
723 | 45 | if (realloc_bam_data(b, b->l_data + 4) < 0) return -1; |
724 | 45 | b->l_data += 4; |
725 | 45 | b->data[c->l_qname++] = '\0'; |
726 | 45 | c->l_extranul = 3; |
727 | 45 | } |
728 | 229 | return 0; |
729 | 229 | } |
730 | | |
731 | | /* |
732 | | * Note a second interface that returns a bam pointer instead would avoid bam_copy1 |
733 | | * in multi-threaded handling. This may be worth considering for htslib2. |
734 | | */ |
/*
 * Read one BAM alignment record from fp into b.
 *
 * Returns 4 + block_len (the length field plus the record body) on
 * success.  Negative return values, as coded below:
 *   -1  nothing could be read for the length field (normal end of file)
 *   -2  the 4-byte length field was incomplete or the read failed
 *   -3  the 32-byte fixed-size core could not be read
 *   -4  any other failure: block_len below 32, inconsistent sizes,
 *       allocation failure, short read of the variable-length data, a
 *       failure in bam_tag2cigar(), or CIGAR and sequence lengths that
 *       disagree
 */
735 | | int bam_read1(BGZF *fp, bam1_t *b) |
736 | 357 | { |
737 | 357 | bam1_core_t *c = &b->core; |
738 | 357 | int32_t block_len, ret, i; |
739 | 357 | uint32_t x[8], new_l_data; |
740 | | |
741 | 357 | b->l_data = 0; |
742 | | |
743 | 357 | if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { |
744 | 1 | if (ret == 0) return -1; // normal end-of-file |
745 | 1 | else return -2; // truncated |
746 | 1 | } |
747 | 356 | if (fp->is_be) |
748 | 0 | ed_swap_4p(&block_len); |
749 | 356 | if (block_len < 32) return -4; // block_len includes core data |
750 | 356 | if (bgzf_read(fp, x, 32) != 32) return -3; |
751 | 353 | if (fp->is_be) { |
752 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
753 | 0 | } |
/* Unpack the eight 32-bit core words into the core fields. */
754 | 353 | c->tid = x[0]; c->pos = (int32_t)x[1]; |
755 | 353 | c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; |
756 | 353 | c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0; |
757 | 353 | c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; |
758 | 353 | c->l_qseq = x[4]; |
759 | 353 | c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; |
760 | | |
/* In-memory size: on-disk body plus the padding NULs added after the qname. */
761 | 353 | new_l_data = block_len - 32 + c->l_extranul; |
762 | 353 | if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; |
/* The qname, CIGAR, packed sequence and qualities must all fit in the record. */
763 | 353 | if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul |
764 | 353 | + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data) |
765 | 1 | return -4; |
766 | 352 | if (realloc_bam_data(b, new_l_data) < 0) return -4; |
767 | 352 | b->l_data = new_l_data; |
768 | | |
769 | 352 | if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4; |
770 | 352 | if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination |
771 | 229 | if (fixup_missing_qname_nul(b) < 0) return -4; |
772 | 229 | } |
773 | 677 | for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0'; |
774 | 352 | c->l_qname += c->l_extranul; |
775 | 352 | if (b->l_data < c->l_qname || |
776 | 352 | bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) |
777 | 9 | return -4; |
778 | 343 | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
779 | 343 | if (bam_tag2cigar(b, 0, 0) < 0) |
780 | 0 | return -4; |
781 | | |
782 | 343 | if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency |
783 | 299 | hts_pos_t rlen, qlen; |
784 | 299 | bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen); |
785 | 299 | if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1; |
786 | 299 | b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5); |
787 | | // Sanity check for broken CIGAR alignments |
788 | 299 | if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) { |
789 | 0 | hts_log_error("CIGAR and query sequence lengths differ for %s", |
790 | 0 | bam_get_qname(b)); |
791 | 0 | return -4; |
792 | 0 | } |
793 | 299 | } |
794 | | |
795 | 343 | return 4 + block_len; |
796 | 343 | } |
797 | | |
798 | | int bam_write1(BGZF *fp, const bam1_t *b) |
799 | 0 | { |
800 | 0 | const bam1_core_t *c = &b->core; |
801 | 0 | uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; |
802 | 0 | int i, ok; |
803 | 0 | if (c->l_qname - c->l_extranul > 255) { |
804 | 0 | hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b)); |
805 | 0 | errno = EOVERFLOW; |
806 | 0 | return -1; |
807 | 0 | } |
808 | 0 | if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR |
809 | 0 | if (c->pos > INT_MAX || |
810 | 0 | c->mpos > INT_MAX || |
811 | 0 | c->isize < INT_MIN || c->isize > INT_MAX) { |
812 | 0 | hts_log_error("Positional data is too large for BAM format"); |
813 | 0 | return -1; |
814 | 0 | } |
815 | 0 | x[0] = c->tid; |
816 | 0 | x[1] = c->pos; |
817 | 0 | x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); |
818 | 0 | if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; |
819 | 0 | else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); |
820 | 0 | x[4] = c->l_qseq; |
821 | 0 | x[5] = c->mtid; |
822 | 0 | x[6] = c->mpos; |
823 | 0 | x[7] = c->isize; |
824 | 0 | ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); |
825 | 0 | if (fp->is_be) { |
826 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
827 | 0 | y = block_len; |
828 | 0 | if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); |
829 | 0 | swap_data(c, b->l_data, b->data, 1); |
830 | 0 | } else { |
831 | 0 | if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0); |
832 | 0 | } |
833 | 0 | if (ok) ok = (bgzf_write(fp, x, 32) >= 0); |
834 | 0 | if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); |
835 | 0 | if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally |
836 | 0 | if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); |
837 | 0 | } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag |
838 | 0 | uint8_t buf[8]; |
839 | 0 | uint32_t cigar_st, cigar_en, cigar[2]; |
840 | 0 | hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); |
841 | 0 | if (cigreflen >= (1<<28)) { |
842 | | // Length of reference covered is greater than the biggest |
843 | | // CIGAR operation currently allowed. |
844 | 0 | hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos |
845 | 0 | " cannot be written in BAM. Try writing SAM or CRAM instead.\n", |
846 | 0 | bam_get_qname(b), c->n_cigar, cigreflen); |
847 | 0 | return -1; |
848 | 0 | } |
849 | 0 | cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; |
850 | 0 | cigar_en = cigar_st + c->n_cigar * 4; |
851 | 0 | cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; |
852 | 0 | cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; |
853 | 0 | u32_to_le(cigar[0], buf); |
854 | 0 | u32_to_le(cigar[1], buf + 4); |
855 | 0 | if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N |
856 | 0 | if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR |
857 | 0 | if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I |
858 | 0 | u32_to_le(c->n_cigar, buf); |
859 | 0 | if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length |
860 | 0 | if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR |
861 | 0 | } |
862 | 0 | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
863 | 0 | return ok? 4 + block_len : -1; |
864 | 0 | } |
865 | | |
866 | | /* |
867 | | * Write a BAM file and append to the in-memory index simultaneously. |
868 | | */ |
869 | 0 | static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { |
870 | 0 | BGZF *bfp = fp->fp.bgzf; |
871 | |
|
872 | 0 | if (!fp->idx) |
873 | 0 | return bam_write1(bfp, b); |
874 | | |
875 | 0 | uint32_t block_len = b->l_data - b->core.l_extranul + 32; |
876 | 0 | if (bgzf_flush_try(bfp, 4 + block_len) < 0) |
877 | 0 | return -1; |
878 | 0 | if (!bfp->mt) |
879 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); |
880 | 0 | else |
881 | 0 | bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp)); |
882 | |
|
883 | 0 | int ret = bam_write1(bfp, b); |
884 | 0 | if (ret < 0) |
885 | 0 | return -1; |
886 | | |
887 | 0 | if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) { |
888 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
889 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
890 | 0 | ret = -1; |
891 | 0 | } |
892 | |
|
893 | 0 | return ret; |
894 | 0 | } |
895 | | |
896 | | /* |
897 | | * Set the qname in a BAM record |
898 | | */ |
899 | | int bam_set_qname(bam1_t *rec, const char *qname) |
900 | 0 | { |
901 | 0 | if (!rec) return -1; |
902 | 0 | if (!qname || !*qname) return -1; |
903 | | |
904 | 0 | size_t old_len = rec->core.l_qname; |
905 | 0 | size_t new_len = strlen(qname) + 1; |
906 | 0 | if (new_len < 1 || new_len > 255) return -1; |
907 | | |
908 | 0 | int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0; |
909 | |
|
910 | 0 | size_t new_data_len = rec->l_data - old_len + new_len + extranul; |
911 | 0 | if (realloc_bam_data(rec, new_data_len) < 0) return -1; |
912 | | |
913 | | // Make room |
914 | 0 | if (new_len + extranul != rec->core.l_qname) |
915 | 0 | memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname); |
916 | | // Copy in new name and pad if needed |
917 | 0 | memcpy(rec->data, qname, new_len); |
918 | 0 | int n; |
919 | 0 | for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0'; |
920 | |
|
921 | 0 | rec->l_data = new_data_len; |
922 | 0 | rec->core.l_qname = new_len + extranul; |
923 | 0 | rec->core.l_extranul = extranul; |
924 | |
|
925 | 0 | return 0; |
926 | 0 | } |
927 | | |
928 | | /******************** |
929 | | *** BAM indexing *** |
930 | | ********************/ |
931 | | |
932 | | static hts_idx_t *sam_index(htsFile *fp, int min_shift) |
933 | 0 | { |
934 | 0 | int n_lvls, i, fmt, ret; |
935 | 0 | bam1_t *b; |
936 | 0 | hts_idx_t *idx; |
937 | 0 | sam_hdr_t *h; |
938 | 0 | h = sam_hdr_read(fp); |
939 | 0 | if (h == NULL) return NULL; |
940 | 0 | if (min_shift > 0) { |
941 | 0 | hts_pos_t max_len = 0, s; |
942 | 0 | for (i = 0; i < h->n_targets; ++i) { |
943 | 0 | hts_pos_t len = sam_hdr_tid2len(h, i); |
944 | 0 | if (max_len < len) max_len = len; |
945 | 0 | } |
946 | 0 | max_len += 256; |
947 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
948 | 0 | fmt = HTS_FMT_CSI; |
949 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
950 | 0 | idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
951 | 0 | b = bam_init1(); |
952 | 0 | while ((ret = sam_read1(fp, h, b)) >= 0) { |
953 | 0 | ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)); |
954 | 0 | if (ret < 0) { // unsorted or doesn't fit |
955 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
956 | 0 | goto err; |
957 | 0 | } |
958 | 0 | } |
959 | 0 | if (ret < -1) goto err; // corrupted BAM file |
960 | | |
961 | 0 | hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); |
962 | 0 | sam_hdr_destroy(h); |
963 | 0 | bam_destroy1(b); |
964 | 0 | return idx; |
965 | | |
966 | 0 | err: |
967 | 0 | bam_destroy1(b); |
968 | 0 | hts_idx_destroy(idx); |
969 | 0 | return NULL; |
970 | 0 | } |
971 | | |
972 | | int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) |
973 | 0 | { |
974 | 0 | hts_idx_t *idx; |
975 | 0 | htsFile *fp; |
976 | 0 | int ret = 0; |
977 | |
|
978 | 0 | if ((fp = hts_open(fn, "r")) == 0) return -2; |
979 | 0 | if (nthreads) |
980 | 0 | hts_set_threads(fp, nthreads); |
981 | |
|
982 | 0 | switch (fp->format.format) { |
983 | 0 | case cram: |
984 | |
|
985 | 0 | ret = cram_index_build(fp->fp.cram, fn, fnidx); |
986 | 0 | break; |
987 | | |
988 | 0 | case bam: |
989 | 0 | case sam: |
990 | 0 | if (fp->format.compression != bgzf) { |
991 | 0 | hts_log_error("%s file \"%s\" not BGZF compressed", |
992 | 0 | fp->format.format == bam ? "BAM" : "SAM", fn); |
993 | 0 | ret = -1; |
994 | 0 | break; |
995 | 0 | } |
996 | 0 | idx = sam_index(fp, min_shift); |
997 | 0 | if (idx) { |
998 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); |
999 | 0 | if (ret < 0) ret = -4; |
1000 | 0 | hts_idx_destroy(idx); |
1001 | 0 | } |
1002 | 0 | else ret = -1; |
1003 | 0 | break; |
1004 | | |
1005 | 0 | default: |
1006 | 0 | ret = -3; |
1007 | 0 | break; |
1008 | 0 | } |
1009 | 0 | hts_close(fp); |
1010 | |
|
1011 | 0 | return ret; |
1012 | 0 | } |
1013 | | |
// Single-threaded form of sam_index_build3(): same arguments, nthreads = 0.
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int no_threads = 0;

    return sam_index_build3(fn, fnidx, min_shift, no_threads);
}
1018 | | |
// Build an index for fn with no explicit index file name (fnidx = NULL)
// and no extra threads.
int sam_index_build(const char *fn, int min_shift)
{
    const char *default_fnidx = NULL;

    return sam_index_build3(fn, default_fnidx, min_shift, 0);
}
1023 | | |
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib.
// Equivalent to sam_index_build(): no explicit index name, no extra threads.
#undef bam_index_build
int bam_index_build(const char *fn, int min_shift)
{
    return sam_index_build3(fn, NULL, min_shift, 0);
}
1030 | | |
1031 | | // Initialise fp->idx for the current format type. |
1032 | | // This must be called after the header has been written but no other data. |
1033 | 0 | int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) { |
1034 | 0 | fp->fnidx = fnidx; |
1035 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1036 | 0 | (fp->format.format == sam && fp->format.compression == bgzf)) { |
1037 | 0 | int n_lvls, fmt = HTS_FMT_CSI; |
1038 | 0 | if (min_shift > 0) { |
1039 | 0 | int64_t max_len = 0, s; |
1040 | 0 | int i; |
1041 | 0 | for (i = 0; i < h->n_targets; ++i) |
1042 | 0 | if (max_len < h->target_len[i]) max_len = h->target_len[i]; |
1043 | 0 | max_len += 256; |
1044 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
1045 | |
|
1046 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1047 | |
|
1048 | 0 | fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1049 | 0 | return fp->idx ? 0 : -1; |
1050 | 0 | } |
1051 | | |
1052 | 0 | if (fp->format.format == cram) { |
1053 | 0 | fp->fp.cram->idxfp = bgzf_open(fnidx, "wg"); |
1054 | 0 | return fp->fp.cram->idxfp ? 0 : -1; |
1055 | 0 | } |
1056 | | |
1057 | 0 | return -1; |
1058 | 0 | } |
1059 | | |
1060 | | // Finishes an index. Call after the last record has been written. |
1061 | | // Returns 0 on success, <0 on failure. |
1062 | 0 | int sam_idx_save(htsFile *fp) { |
1063 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1064 | 0 | fp->format.format == vcf || fp->format.format == sam) { |
1065 | 0 | int ret; |
1066 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
1067 | 0 | errno = -ret; |
1068 | 0 | return -1; |
1069 | 0 | } |
1070 | 0 | if (bgzf_flush(fp->fp.bgzf) < 0) |
1071 | 0 | return -1; |
1072 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); |
1073 | |
|
1074 | 0 | if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0) |
1075 | 0 | return -1; |
1076 | | |
1077 | 0 | return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx)); |
1078 | |
|
1079 | 0 | } else if (fp->format.format == cram) { |
1080 | | // flushed and closed by cram_close |
1081 | 0 | } |
1082 | | |
1083 | 0 | return 0; |
1084 | 0 | } |
1085 | | |
1086 | | static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1087 | 0 | { |
1088 | 0 | htsFile *fp = (htsFile *)fpv; |
1089 | 0 | bam1_t *b = bv; |
1090 | 0 | fp->line.l = 0; |
1091 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1092 | 0 | if (ret >= 0) { |
1093 | 0 | *tid = b->core.tid; |
1094 | 0 | *beg = b->core.pos; |
1095 | 0 | *end = bam_endpos(b); |
1096 | 0 | } |
1097 | 0 | return ret; |
1098 | 0 | } |
1099 | | |
1100 | | // This is used only with read_rest=1 iterators, so need not set tid/beg/end. |
1101 | | static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1102 | 0 | { |
1103 | 0 | htsFile *fp = (htsFile *)fpv; |
1104 | 0 | bam1_t *b = bv; |
1105 | 0 | fp->line.l = 0; |
1106 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1107 | 0 | return ret; |
1108 | 0 | } |
1109 | | |
1110 | | // Internal (for now) func used by bam_sym_lookup. This is copied from |
1111 | | // samtools/bam.c. |
1112 | | static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) |
1113 | 0 | { |
1114 | 0 | const char *rg; |
1115 | 0 | kstring_t lib = { 0, 0, NULL }; |
1116 | 0 | rg = (char *)bam_aux_get(b, "RG"); |
1117 | |
|
1118 | 0 | if (!rg) |
1119 | 0 | return NULL; |
1120 | 0 | else |
1121 | 0 | rg++; |
1122 | | |
1123 | 0 | if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) |
1124 | 0 | return NULL; |
1125 | | |
1126 | 0 | static char LB_text[1024]; |
1127 | 0 | int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; |
1128 | |
|
1129 | 0 | memcpy(LB_text, lib.s, len); |
1130 | 0 | LB_text[len] = 0; |
1131 | |
|
1132 | 0 | free(lib.s); |
1133 | |
|
1134 | 0 | return LB_text; |
1135 | 0 | } |
1136 | | |
1137 | | |
1138 | | // Bam record pointer and SAM header combined |
1139 | | typedef struct { |
1140 | | const sam_hdr_t *h; |
1141 | | const bam1_t *b; |
1142 | | } hb_pair; |
1143 | | |
1144 | | // Looks up variable names in str and replaces them with their value. |
1145 | | // Also supports aux tags. |
1146 | | // |
1147 | | // Note the expression parser deliberately overallocates str size so it |
1148 | | // is safe to use memcmp over strcmp. |
1149 | | static int bam_sym_lookup(void *data, char *str, char **end, |
1150 | 0 | hts_expr_val_t *res) { |
1151 | 0 | hb_pair *hb = (hb_pair *)data; |
1152 | 0 | const bam1_t *b = hb->b; |
1153 | |
|
1154 | 0 | res->is_str = 0; |
1155 | 0 | switch(*str) { |
1156 | 0 | case 'c': |
1157 | 0 | if (memcmp(str, "cigar", 5) == 0) { |
1158 | 0 | *end = str+5; |
1159 | 0 | res->is_str = 1; |
1160 | 0 | ks_clear(&res->s); |
1161 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1162 | 0 | int i, n = b->core.n_cigar, r = 0; |
1163 | 0 | if (n) { |
1164 | 0 | for (i = 0; i < n; i++) { |
1165 | 0 | r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; |
1166 | 0 | r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; |
1167 | 0 | } |
1168 | 0 | r |= kputs("", &res->s) < 0; |
1169 | 0 | } else { |
1170 | 0 | r |= kputs("*", &res->s) < 0; |
1171 | 0 | } |
1172 | 0 | return r ? -1 : 0; |
1173 | 0 | } |
1174 | 0 | break; |
1175 | | |
1176 | 0 | case 'e': |
1177 | 0 | if (memcmp(str, "endpos", 6) == 0) { |
1178 | 0 | *end = str+6; |
1179 | 0 | res->d = bam_endpos(b); |
1180 | 0 | return 0; |
1181 | 0 | } |
1182 | 0 | break; |
1183 | | |
1184 | 0 | case 'f': |
1185 | 0 | if (memcmp(str, "flag", 4) == 0) { |
1186 | 0 | str = *end = str+4; |
1187 | 0 | if (*str != '.') { |
1188 | 0 | res->d = b->core.flag; |
1189 | 0 | return 0; |
1190 | 0 | } else { |
1191 | 0 | str++; |
1192 | 0 | if (!memcmp(str, "paired", 6)) { |
1193 | 0 | *end = str+6; |
1194 | 0 | res->d = b->core.flag & BAM_FPAIRED; |
1195 | 0 | return 0; |
1196 | 0 | } else if (!memcmp(str, "proper_pair", 11)) { |
1197 | 0 | *end = str+11; |
1198 | 0 | res->d = b->core.flag & BAM_FPROPER_PAIR; |
1199 | 0 | return 0; |
1200 | 0 | } else if (!memcmp(str, "unmap", 5)) { |
1201 | 0 | *end = str+5; |
1202 | 0 | res->d = b->core.flag & BAM_FUNMAP; |
1203 | 0 | return 0; |
1204 | 0 | } else if (!memcmp(str, "munmap", 6)) { |
1205 | 0 | *end = str+6; |
1206 | 0 | res->d = b->core.flag & BAM_FMUNMAP; |
1207 | 0 | return 0; |
1208 | 0 | } else if (!memcmp(str, "reverse", 7)) { |
1209 | 0 | *end = str+7; |
1210 | 0 | res->d = b->core.flag & BAM_FREVERSE; |
1211 | 0 | return 0; |
1212 | 0 | } else if (!memcmp(str, "mreverse", 8)) { |
1213 | 0 | *end = str+8; |
1214 | 0 | res->d = b->core.flag & BAM_FMREVERSE; |
1215 | 0 | return 0; |
1216 | 0 | } else if (!memcmp(str, "read1", 5)) { |
1217 | 0 | *end = str+5; |
1218 | 0 | res->d = b->core.flag & BAM_FREAD1; |
1219 | 0 | return 0; |
1220 | 0 | } else if (!memcmp(str, "read2", 5)) { |
1221 | 0 | *end = str+5; |
1222 | 0 | res->d = b->core.flag & BAM_FREAD2; |
1223 | 0 | return 0; |
1224 | 0 | } else if (!memcmp(str, "secondary", 9)) { |
1225 | 0 | *end = str+9; |
1226 | 0 | res->d = b->core.flag & BAM_FSECONDARY; |
1227 | 0 | return 0; |
1228 | 0 | } else if (!memcmp(str, "qcfail", 6)) { |
1229 | 0 | *end = str+6; |
1230 | 0 | res->d = b->core.flag & BAM_FQCFAIL; |
1231 | 0 | return 0; |
1232 | 0 | } else if (!memcmp(str, "dup", 3)) { |
1233 | 0 | *end = str+3; |
1234 | 0 | res->d = b->core.flag & BAM_FDUP; |
1235 | 0 | return 0; |
1236 | 0 | } else if (!memcmp(str, "supplementary", 13)) { |
1237 | 0 | *end = str+13; |
1238 | 0 | res->d = b->core.flag & BAM_FSUPPLEMENTARY; |
1239 | 0 | return 0; |
1240 | 0 | } else { |
1241 | 0 | hts_log_error("Unrecognised flag string"); |
1242 | 0 | return -1; |
1243 | 0 | } |
1244 | 0 | } |
1245 | 0 | } |
1246 | 0 | break; |
1247 | | |
1248 | 0 | case 'l': |
1249 | 0 | if (memcmp(str, "library", 7) == 0) { |
1250 | 0 | *end = str+7; |
1251 | 0 | res->is_str = 1; |
1252 | 0 | const char *lib = bam_get_library(hb->h, b); |
1253 | 0 | kputs(lib ? lib : "", ks_clear(&res->s)); |
1254 | 0 | return 0; |
1255 | 0 | } |
1256 | 0 | break; |
1257 | | |
1258 | 0 | case 'm': |
1259 | 0 | if (memcmp(str, "mapq", 4) == 0) { |
1260 | 0 | *end = str+4; |
1261 | 0 | res->d = b->core.qual; |
1262 | 0 | return 0; |
1263 | 0 | } else if (memcmp(str, "mpos", 4) == 0) { |
1264 | 0 | *end = str+4; |
1265 | 0 | res->d = b->core.mpos+1; |
1266 | 0 | return 0; |
1267 | 0 | } else if (memcmp(str, "mrname", 6) == 0) { |
1268 | 0 | *end = str+6; |
1269 | 0 | res->is_str = 1; |
1270 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1271 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1272 | 0 | return 0; |
1273 | 0 | } else if (memcmp(str, "mrefid", 6) == 0) { |
1274 | 0 | *end = str+6; |
1275 | 0 | res->d = b->core.mtid; |
1276 | 0 | return 0; |
1277 | 0 | } |
1278 | 0 | break; |
1279 | | |
1280 | 0 | case 'n': |
1281 | 0 | if (memcmp(str, "ncigar", 6) == 0) { |
1282 | 0 | *end = str+6; |
1283 | 0 | res->d = b->core.n_cigar; |
1284 | 0 | return 0; |
1285 | 0 | } |
1286 | 0 | break; |
1287 | | |
1288 | 0 | case 'p': |
1289 | 0 | if (memcmp(str, "pos", 3) == 0) { |
1290 | 0 | *end = str+3; |
1291 | 0 | res->d = b->core.pos+1; |
1292 | 0 | return 0; |
1293 | 0 | } else if (memcmp(str, "pnext", 5) == 0) { |
1294 | 0 | *end = str+5; |
1295 | 0 | res->d = b->core.mpos+1; |
1296 | 0 | return 0; |
1297 | 0 | } |
1298 | 0 | break; |
1299 | | |
1300 | 0 | case 'q': |
1301 | 0 | if (memcmp(str, "qlen", 4) == 0) { |
1302 | 0 | *end = str+4; |
1303 | 0 | res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); |
1304 | 0 | return 0; |
1305 | 0 | } else if (memcmp(str, "qname", 5) == 0) { |
1306 | 0 | *end = str+5; |
1307 | 0 | res->is_str = 1; |
1308 | 0 | kputs(bam_get_qname(b), ks_clear(&res->s)); |
1309 | 0 | return 0; |
1310 | 0 | } else if (memcmp(str, "qual", 4) == 0) { |
1311 | 0 | *end = str+4; |
1312 | 0 | ks_clear(&res->s); |
1313 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1314 | 0 | return -1; |
1315 | 0 | memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); |
1316 | 0 | res->s.l = b->core.l_qseq; |
1317 | 0 | res->is_str = 1; |
1318 | 0 | return 0; |
1319 | 0 | } |
1320 | 0 | break; |
1321 | | |
1322 | 0 | case 'r': |
1323 | 0 | if (memcmp(str, "rlen", 4) == 0) { |
1324 | 0 | *end = str+4; |
1325 | 0 | res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
1326 | 0 | return 0; |
1327 | 0 | } else if (memcmp(str, "rname", 5) == 0) { |
1328 | 0 | *end = str+5; |
1329 | 0 | res->is_str = 1; |
1330 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); |
1331 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1332 | 0 | return 0; |
1333 | 0 | } else if (memcmp(str, "rnext", 5) == 0) { |
1334 | 0 | *end = str+5; |
1335 | 0 | res->is_str = 1; |
1336 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1337 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1338 | 0 | return 0; |
1339 | 0 | } else if (memcmp(str, "refid", 5) == 0) { |
1340 | 0 | *end = str+5; |
1341 | 0 | res->d = b->core.tid; |
1342 | 0 | return 0; |
1343 | 0 | } |
1344 | 0 | break; |
1345 | | |
1346 | 0 | case 's': |
1347 | 0 | if (memcmp(str, "seq", 3) == 0) { |
1348 | 0 | *end = str+3; |
1349 | 0 | ks_clear(&res->s); |
1350 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1351 | 0 | return -1; |
1352 | 0 | nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); |
1353 | 0 | res->s.s[b->core.l_qseq] = 0; |
1354 | 0 | res->s.l = b->core.l_qseq; |
1355 | 0 | res->is_str = 1; |
1356 | 0 | return 0; |
1357 | 0 | } else if (memcmp(str, "sclen", 5) == 0) { |
1358 | 0 | int sclen = 0; |
1359 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1360 | 0 | int ncigar = b->core.n_cigar; |
1361 | 0 | int left = 0; |
1362 | | |
1363 | | // left |
1364 | 0 | if (ncigar > 0 |
1365 | 0 | && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) |
1366 | 0 | left = 0, sclen += bam_cigar_oplen(cigar[0]); |
1367 | 0 | else if (ncigar > 1 |
1368 | 0 | && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP |
1369 | 0 | && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) |
1370 | 0 | left = 1, sclen += bam_cigar_oplen(cigar[1]); |
1371 | | |
1372 | | // right |
1373 | 0 | if (ncigar-1 > left |
1374 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) |
1375 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-1]); |
1376 | 0 | else if (ncigar-2 > left |
1377 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP |
1378 | 0 | && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) |
1379 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-2]); |
1380 | |
|
1381 | 0 | *end = str+5; |
1382 | 0 | res->d = sclen; |
1383 | 0 | return 0; |
1384 | 0 | } |
1385 | 0 | break; |
1386 | | |
1387 | 0 | case 't': |
1388 | 0 | if (memcmp(str, "tlen", 4) == 0) { |
1389 | 0 | *end = str+4; |
1390 | 0 | res->d = b->core.isize; |
1391 | 0 | return 0; |
1392 | 0 | } |
1393 | 0 | break; |
1394 | | |
1395 | 0 | case '[': |
1396 | 0 | if (*str == '[' && str[1] && str[2] && str[3] == ']') { |
1397 | | /* aux tags */ |
1398 | 0 | *end = str+4; |
1399 | |
|
1400 | 0 | uint8_t *aux = bam_aux_get(b, str+1); |
1401 | 0 | if (aux) { |
1402 | | // we define the truth of a tag to be its presence, even if 0. |
1403 | 0 | res->is_true = 1; |
1404 | 0 | switch (*aux) { |
1405 | 0 | case 'Z': |
1406 | 0 | case 'H': |
1407 | 0 | res->is_str = 1; |
1408 | 0 | kputs((char *)aux+1, ks_clear(&res->s)); |
1409 | 0 | break; |
1410 | | |
1411 | 0 | case 'A': |
1412 | 0 | res->is_str = 1; |
1413 | 0 | kputsn((char *)aux+1, 1, ks_clear(&res->s)); |
1414 | 0 | break; |
1415 | | |
1416 | 0 | case 'i': case 'I': |
1417 | 0 | case 's': case 'S': |
1418 | 0 | case 'c': case 'C': |
1419 | 0 | res->is_str = 0; |
1420 | 0 | res->d = bam_aux2i(aux); |
1421 | 0 | break; |
1422 | | |
1423 | 0 | case 'f': |
1424 | 0 | case 'd': |
1425 | 0 | res->is_str = 0; |
1426 | 0 | res->d = bam_aux2f(aux); |
1427 | 0 | break; |
1428 | | |
1429 | 0 | default: |
1430 | 0 | hts_log_error("Aux type '%c not yet supported by filters", |
1431 | 0 | *aux); |
1432 | 0 | return -1; |
1433 | 0 | } |
1434 | 0 | return 0; |
1435 | |
|
1436 | 0 | } else { |
1437 | | // hence absent tags are always false (and strings) |
1438 | 0 | res->is_str = 1; |
1439 | 0 | res->s.l = 0; |
1440 | 0 | res->d = 0; |
1441 | 0 | res->is_true = 0; |
1442 | 0 | return 0; |
1443 | 0 | } |
1444 | 0 | } |
1445 | 0 | break; |
1446 | 0 | } |
1447 | | |
1448 | | // All successful matches in switch should return 0. |
1449 | | // So if we didn't match, it's a parse error. |
1450 | 0 | return -1; |
1451 | 0 | } |
1452 | | |
1453 | | // Returns 1 when accepted by the filter, 0 if not, -1 on error. |
1454 | | int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) |
1455 | 0 | { |
1456 | 0 | hb_pair hb = {h, b}; |
1457 | 0 | hts_expr_val_t res = HTS_EXPR_VAL_INIT; |
1458 | 0 | if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { |
1459 | 0 | hts_log_error("Couldn't process filter expression"); |
1460 | 0 | hts_expr_val_free(&res); |
1461 | 0 | return -1; |
1462 | 0 | } |
1463 | | |
1464 | 0 | int t = res.is_true; |
1465 | 0 | hts_expr_val_free(&res); |
1466 | |
|
1467 | 0 | return t; |
1468 | 0 | } |
1469 | | |
1470 | | static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1471 | 0 | { |
1472 | 0 | htsFile *fp = fpv; |
1473 | 0 | bam1_t *b = bv; |
1474 | 0 | int pass_filter, ret; |
1475 | |
|
1476 | 0 | do { |
1477 | 0 | ret = cram_get_bam_seq(fp->fp.cram, &b); |
1478 | 0 | if (ret < 0) |
1479 | 0 | return cram_eof(fp->fp.cram) ? -1 : -2; |
1480 | | |
1481 | 0 | if (bam_tag2cigar(b, 1, 1) < 0) |
1482 | 0 | return -2; |
1483 | | |
1484 | 0 | *tid = b->core.tid; |
1485 | 0 | *beg = b->core.pos; |
1486 | 0 | *end = bam_endpos(b); |
1487 | |
|
1488 | 0 | if (fp->filter) { |
1489 | 0 | pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); |
1490 | 0 | if (pass_filter < 0) |
1491 | 0 | return -2; |
1492 | 0 | } else { |
1493 | 0 | pass_filter = 1; |
1494 | 0 | } |
1495 | 0 | } while (pass_filter == 0); |
1496 | | |
1497 | 0 | return ret; |
1498 | 0 | } |
1499 | | |
1500 | | static int cram_pseek(void *fp, int64_t offset, int whence) |
1501 | 0 | { |
1502 | 0 | cram_fd *fd = (cram_fd *)fp; |
1503 | |
|
1504 | 0 | if ((0 != cram_seek(fd, offset, SEEK_SET)) |
1505 | 0 | && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) |
1506 | 0 | return -1; |
1507 | | |
1508 | 0 | fd->curr_position = offset; |
1509 | |
|
1510 | 0 | if (fd->ctr) { |
1511 | 0 | cram_free_container(fd->ctr); |
1512 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
1513 | 0 | cram_free_container(fd->ctr_mt); |
1514 | |
|
1515 | 0 | fd->ctr = NULL; |
1516 | 0 | fd->ctr_mt = NULL; |
1517 | 0 | fd->ooc = 0; |
1518 | 0 | } |
1519 | |
|
1520 | 0 | return 0; |
1521 | 0 | } |
1522 | | |
1523 | | /* |
1524 | | * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only |
1525 | | * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered |
1526 | | * container previously fetched. It was designed like this to integrate with the functionality |
1527 | | * of the iterator stepping logic. |
1528 | | */ |
1529 | | |
1530 | | static int64_t cram_ptell(void *fp) |
1531 | 0 | { |
1532 | 0 | cram_fd *fd = (cram_fd *)fp; |
1533 | 0 | cram_container *c; |
1534 | 0 | cram_slice *s; |
1535 | 0 | int64_t ret = -1L; |
1536 | |
|
1537 | 0 | if (fd) { |
1538 | 0 | if ((c = fd->ctr) != NULL) { |
1539 | 0 | if ((s = c->slice) != NULL && s->max_rec) { |
1540 | 0 | if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1)) |
1541 | 0 | fd->curr_position += c->offset + c->length; |
1542 | 0 | } |
1543 | 0 | } |
1544 | 0 | ret = fd->curr_position; |
1545 | 0 | } |
1546 | |
|
1547 | 0 | return ret; |
1548 | 0 | } |
1549 | | |
1550 | | static int bam_pseek(void *fp, int64_t offset, int whence) |
1551 | 0 | { |
1552 | 0 | BGZF *fd = (BGZF *)fp; |
1553 | |
|
1554 | 0 | return bgzf_seek(fd, offset, whence); |
1555 | 0 | } |
1556 | | |
1557 | | static int64_t bam_ptell(void *fp) |
1558 | 0 | { |
1559 | 0 | BGZF *fd = (BGZF *)fp; |
1560 | 0 | if (!fd) |
1561 | 0 | return -1L; |
1562 | | |
1563 | 0 | return bgzf_tell(fd); |
1564 | 0 | } |
1565 | | |
1566 | | |
1567 | | |
1568 | | static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1569 | 0 | { |
1570 | 0 | switch (fp->format.format) { |
1571 | 0 | case bam: |
1572 | 0 | case sam: |
1573 | 0 | return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags); |
1574 | | |
1575 | 0 | case cram: { |
1576 | 0 | if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL; |
1577 | | |
1578 | | // Cons up a fake "index" just pointing at the associated cram_fd: |
1579 | 0 | hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); |
1580 | 0 | if (idx == NULL) return NULL; |
1581 | 0 | idx->fmt = HTS_FMT_CRAI; |
1582 | 0 | idx->cram = fp->fp.cram; |
1583 | 0 | return (hts_idx_t *) idx; |
1584 | 0 | } |
1585 | | |
1586 | 0 | default: |
1587 | 0 | return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t |
1588 | 0 | } |
1589 | 0 | } |
1590 | | |
1591 | | hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1592 | 0 | { |
1593 | 0 | return index_load(fp, fn, fnidx, flags); |
1594 | 0 | } |
1595 | | |
1596 | 0 | hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) { |
1597 | 0 | return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); |
1598 | 0 | } |
1599 | | |
1600 | | hts_idx_t *sam_index_load(htsFile *fp, const char *fn) |
1601 | 0 | { |
1602 | 0 | return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); |
1603 | 0 | } |
1604 | | |
1605 | | static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) |
1606 | 0 | { |
1607 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1608 | 0 | hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); |
1609 | 0 | if (iter == NULL) return NULL; |
1610 | | |
1611 | | // Cons up a dummy iterator for which hts_itr_next() will simply invoke |
1612 | | // the readrec function: |
1613 | 0 | iter->is_cram = 1; |
1614 | 0 | iter->read_rest = 1; |
1615 | 0 | iter->off = NULL; |
1616 | 0 | iter->bins.a = NULL; |
1617 | 0 | iter->readrec = readrec; |
1618 | |
|
1619 | 0 | if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) { |
1620 | 0 | cram_range r = { tid, beg+1, end }; |
1621 | 0 | int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r); |
1622 | |
|
1623 | 0 | iter->curr_off = 0; |
1624 | | // The following fields are not required by hts_itr_next(), but are |
1625 | | // filled in in case user code wants to look at them. |
1626 | 0 | iter->tid = tid; |
1627 | 0 | iter->beg = beg; |
1628 | 0 | iter->end = end; |
1629 | |
|
1630 | 0 | switch (ret) { |
1631 | 0 | case 0: |
1632 | 0 | break; |
1633 | | |
1634 | 0 | case -2: |
1635 | | // No data vs this ref, so mark iterator as completed. |
1636 | | // Same as HTS_IDX_NONE. |
1637 | 0 | iter->finished = 1; |
1638 | 0 | break; |
1639 | | |
1640 | 0 | default: |
1641 | 0 | free(iter); |
1642 | 0 | return NULL; |
1643 | 0 | } |
1644 | 0 | } |
1645 | 0 | else switch (tid) { |
1646 | 0 | case HTS_IDX_REST: |
1647 | 0 | iter->curr_off = 0; |
1648 | 0 | break; |
1649 | 0 | case HTS_IDX_NONE: |
1650 | 0 | iter->curr_off = 0; |
1651 | 0 | iter->finished = 1; |
1652 | 0 | break; |
1653 | 0 | default: |
1654 | 0 | hts_log_error("Query with tid=%d not implemented for CRAM files", tid); |
1655 | 0 | abort(); |
1656 | 0 | break; |
1657 | 0 | } |
1658 | | |
1659 | 0 | return iter; |
1660 | 0 | } |
1661 | | |
1662 | | hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) |
1663 | 0 | { |
1664 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1665 | 0 | if (idx == NULL) |
1666 | 0 | return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest); |
1667 | 0 | else if (cidx->fmt == HTS_FMT_CRAI) |
1668 | 0 | return cram_itr_query(idx, tid, beg, end, sam_readrec); |
1669 | 0 | else |
1670 | 0 | return hts_itr_query(idx, tid, beg, end, sam_readrec); |
1671 | 0 | } |
1672 | | |
1673 | | static int cram_name2id(void *fdv, const char *ref) |
1674 | 0 | { |
1675 | 0 | cram_fd *fd = (cram_fd *) fdv; |
1676 | 0 | return sam_hdr_name2tid(fd->header, ref); |
1677 | 0 | } |
1678 | | |
1679 | | hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) |
1680 | 0 | { |
1681 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1682 | 0 | return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, |
1683 | 0 | cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query, |
1684 | 0 | sam_readrec); |
1685 | 0 | } |
1686 | | |
1687 | | hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) |
1688 | 0 | { |
1689 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1690 | 0 | hts_reglist_t *r_list = NULL; |
1691 | 0 | int r_count = 0; |
1692 | |
|
1693 | 0 | if (!cidx || !hdr) |
1694 | 0 | return NULL; |
1695 | | |
1696 | 0 | hts_itr_t *itr = NULL; |
1697 | 0 | if (cidx->fmt == HTS_FMT_CRAI) { |
1698 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id); |
1699 | 0 | if (!r_list) |
1700 | 0 | return NULL; |
1701 | 0 | itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram, |
1702 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1703 | 0 | } else { |
1704 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id)); |
1705 | 0 | if (!r_list) |
1706 | 0 | return NULL; |
1707 | 0 | itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr, |
1708 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1709 | 0 | } |
1710 | | |
1711 | 0 | if (!itr) |
1712 | 0 | hts_reglist_free(r_list, r_count); |
1713 | |
|
1714 | 0 | return itr; |
1715 | 0 | } |
1716 | | |
1717 | | hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) |
1718 | 0 | { |
1719 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1720 | |
|
1721 | 0 | if(!cidx || !hdr || !reglist) |
1722 | 0 | return NULL; |
1723 | | |
1724 | 0 | if (cidx->fmt == HTS_FMT_CRAI) |
1725 | 0 | return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, |
1726 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1727 | 0 | else |
1728 | 0 | return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, |
1729 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1730 | 0 | } |
1731 | | |
1732 | | /********************** |
1733 | | *** SAM header I/O *** |
1734 | | **********************/ |
1735 | | |
1736 | | #include "htslib/kseq.h" |
1737 | | #include "htslib/kstring.h" |
1738 | | |
1739 | | sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) |
1740 | 0 | { |
1741 | 0 | sam_hdr_t *bh = sam_hdr_init(); |
1742 | 0 | if (!bh) return NULL; |
1743 | | |
1744 | 0 | if (sam_hdr_add_lines(bh, text, l_text) != 0) { |
1745 | 0 | sam_hdr_destroy(bh); |
1746 | 0 | return NULL; |
1747 | 0 | } |
1748 | | |
1749 | 0 | return bh; |
1750 | 0 | } |
1751 | | |
/*
 * Check whether a header line starts with a recognised record type.
 *
 * Accepts "@HD", "@SQ", "@RG" and "@PG" when followed by a tab, and "@CO"
 * with no requirement on the following character.
 * Returns 1 for a recognised prefix, 0 otherwise.
 */
static int valid_sam_header_type(const char *s) {
    char second_letter;
    int tab_required = 1;

    if (s[0] != '@')
        return 0;

    switch (s[1]) {
    case 'H':
        second_letter = 'D';
        break;
    case 'S':
        second_letter = 'Q';
        break;
    case 'R':
    case 'P':
        second_letter = 'G';
        break;
    case 'C':
        second_letter = 'O';
        tab_required = 0;
        break;
    default:
        return 0;
    }

    // Test s[2] before s[3] so that short strings are never over-read.
    if (s[2] != second_letter)
        return 0;

    return !tab_required || s[3] == '\t';
}
1767 | | |
1768 | | // Minimal sanitisation of a header to ensure. |
1769 | | // - null terminated string. |
1770 | | // - all lines start with @ (also implies no blank lines). |
1771 | | // |
1772 | | // Much more could be done, but currently is not, including: |
1773 | | // - checking header types are known (HD, SQ, etc). |
1774 | | // - syntax (eg checking tab separated fields). |
1775 | | // - validating n_targets matches @SQ records. |
1776 | | // - validating target lengths against @SQ records. |
1777 | 782 | static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { |
1778 | 782 | if (!h) |
1779 | 18 | return NULL; |
1780 | | |
1781 | | // Special case for empty headers. |
1782 | 764 | if (h->l_text == 0) |
1783 | 252 | return h; |
1784 | | |
1785 | 512 | size_t i; |
1786 | 512 | unsigned int lnum = 0; |
1787 | 512 | char *cp = h->text, last = '\n'; |
1788 | 118M | for (i = 0; i < h->l_text; i++) { |
1789 | | // NB: l_text excludes terminating nul. This finds early ones. |
1790 | 118M | if (cp[i] == 0) |
1791 | 256 | break; |
1792 | | |
1793 | | // Error on \n[^@], including duplicate newlines |
1794 | 118M | if (last == '\n') { |
1795 | 2.71M | lnum++; |
1796 | 2.71M | if (cp[i] != '@') { |
1797 | 0 | hts_log_error("Malformed SAM header at line %u", lnum); |
1798 | 0 | sam_hdr_destroy(h); |
1799 | 0 | return NULL; |
1800 | 0 | } |
1801 | 2.71M | } |
1802 | | |
1803 | 118M | last = cp[i]; |
1804 | 118M | } |
1805 | | |
1806 | 512 | if (i < h->l_text) { // Early nul found. Complain if not just padding. |
1807 | 256 | size_t j = i; |
1808 | 1.14k | while (j < h->l_text && cp[j] == '\0') j++; |
1809 | 256 | if (j < h->l_text) |
1810 | 256 | hts_log_warning("Unexpected NUL character in header. Possibly truncated"); |
1811 | 256 | } |
1812 | | |
1813 | | // Add trailing newline and/or trailing nul if required. |
1814 | 512 | if (last != '\n') { |
1815 | 250 | hts_log_warning("Missing trailing newline on SAM header. Possibly truncated"); |
1816 | | |
1817 | 250 | if (h->l_text < 2 || i >= h->l_text - 2) { |
1818 | 31 | if (h->l_text >= SIZE_MAX - 2) { |
1819 | 0 | hts_log_error("No room for extra newline"); |
1820 | 0 | sam_hdr_destroy(h); |
1821 | 0 | return NULL; |
1822 | 0 | } |
1823 | | |
1824 | 31 | cp = realloc(h->text, (size_t) h->l_text+2); |
1825 | 31 | if (!cp) { |
1826 | 0 | sam_hdr_destroy(h); |
1827 | 0 | return NULL; |
1828 | 0 | } |
1829 | 31 | h->text = cp; |
1830 | 31 | } |
1831 | 250 | cp[i++] = '\n'; |
1832 | | |
1833 | | // l_text may be larger already due to multiple nul padding |
1834 | 250 | if (h->l_text < i) |
1835 | 0 | h->l_text = i; |
1836 | 250 | cp[h->l_text] = '\0'; |
1837 | 250 | } |
1838 | | |
1839 | 512 | return h; |
1840 | 512 | } |
1841 | | |
/*
 * Emit the pair of warnings used when a SAM file turns out to contain a
 * log message from a known tool: first which tool it was, then the advice
 * on how to avoid the problem.
 */
static void known_stderr(const char *tool, const char *advice) {
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);

    // advice is passed as an argument, never as the format string.
    hts_log_warning("%s", advice);
}
1846 | | |
/*
 * Look for fragments of well-known tool log messages in a line and, for
 * the first one found, warn the user via known_stderr().  Lines matching
 * none of the fragments produce no output.
 */
static void warn_if_known_stderr(const char *line) {
    static const struct {
        const char *fragment;  // text searched for in the line
        const char *tool;      // tool name reported to the user
        const char *advice;    // how to avoid the problem
    } known[] = {
        { "M::bwa_idx_load_from_disk", "bwa",
          "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`" },
        { "M::mem_pestat", "bwa",
          "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`" },
        { "loaded/built the index", "minimap2",
          "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`" },
    };
    size_t i;

    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
        if (strstr(line, known[i].fragment) != NULL) {
            known_stderr(known[i].tool, known[i].advice);
            return;
        }
    }
}
1855 | | |
1856 | 501 | static sam_hdr_t *sam_hdr_create(htsFile* fp) { |
1857 | 501 | kstring_t str = { 0, 0, NULL }; |
1858 | 501 | khint_t k; |
1859 | 501 | sam_hdr_t* h = sam_hdr_init(); |
1860 | 501 | const char *q, *r; |
1861 | 501 | char* sn = NULL; |
1862 | 501 | khash_t(s2i) *d = kh_init(s2i); |
1863 | 501 | khash_t(s2i) *long_refs = NULL; |
1864 | 501 | if (!h || !d) |
1865 | 0 | goto error; |
1866 | | |
1867 | 501 | int ret, has_SQ = 0; |
1868 | 501 | int next_c = '@'; |
1869 | 3.07M | while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) { |
1870 | 3.07M | if (fp->line.s[0] != '@') |
1871 | 0 | break; |
1872 | | |
1873 | 3.07M | if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) { |
1874 | 5.51k | has_SQ = 1; |
1875 | 5.51k | hts_pos_t ln = -1; |
1876 | 27.9k | for (q = fp->line.s + 4;; ++q) { |
1877 | 27.9k | if (strncmp(q, "SN:", 3) == 0) { |
1878 | 7.59k | q += 3; |
1879 | 151M | for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r); |
1880 | | |
1881 | 7.59k | if (sn) { |
1882 | 3.65k | hts_log_warning("SQ header line has more than one SN: tag"); |
1883 | 3.65k | free(sn); |
1884 | 3.65k | } |
1885 | 7.59k | sn = (char*)calloc(r - q + 1, 1); |
1886 | 7.59k | if (!sn) |
1887 | 0 | goto error; |
1888 | | |
1889 | 7.59k | strncpy(sn, q, r - q); |
1890 | 7.59k | q = r; |
1891 | 20.3k | } else { |
1892 | 20.3k | if (strncmp(q, "LN:", 3) == 0) |
1893 | 7.44k | ln = strtoll(q + 3, (char**)&q, 10); |
1894 | 20.3k | } |
1895 | | |
1896 | 1.29M | while (*q != '\t' && *q != '\n' && *q != '\0') |
1897 | 1.27M | ++q; |
1898 | 27.9k | if (*q == '\0' || *q == '\n') |
1899 | 5.51k | break; |
1900 | 27.9k | } |
1901 | 5.51k | if (sn) { |
1902 | 3.94k | if (ln >= 0) { |
1903 | 3.69k | int absent; |
1904 | 3.69k | k = kh_put(s2i, d, sn, &absent); |
1905 | 3.69k | if (absent < 0) |
1906 | 0 | goto error; |
1907 | | |
1908 | 3.69k | if (!absent) { |
1909 | 1.32k | hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn); |
1910 | 1.32k | free(sn); |
1911 | 2.36k | } else { |
1912 | 2.36k | sn = NULL; |
1913 | 2.36k | if (ln >= UINT32_MAX) { |
1914 | | // Stash away ref length that |
1915 | | // doesn't fit in target_len array |
1916 | 1.08k | int k2; |
1917 | 1.08k | if (!long_refs) { |
1918 | 54 | long_refs = kh_init(s2i); |
1919 | 54 | if (!long_refs) |
1920 | 0 | goto error; |
1921 | 54 | } |
1922 | 1.08k | k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); |
1923 | 1.08k | if (absent < 0) |
1924 | 0 | goto error; |
1925 | 1.08k | kh_val(long_refs, k2) = ln; |
1926 | 1.08k | kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 |
1927 | 1.08k | | UINT32_MAX); |
1928 | 1.28k | } else { |
1929 | 1.28k | kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; |
1930 | 1.28k | } |
1931 | 2.36k | } |
1932 | 3.69k | } else { |
1933 | 255 | hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); |
1934 | 255 | warn_if_known_stderr(fp->line.s); |
1935 | 255 | free(sn); |
1936 | 255 | } |
1937 | 3.94k | } else { |
1938 | 1.57k | hts_log_warning("Ignored @SQ line with missing SN: tag"); |
1939 | 1.57k | warn_if_known_stderr(fp->line.s); |
1940 | 1.57k | } |
1941 | 5.51k | sn = NULL; |
1942 | 5.51k | } |
1943 | 3.07M | else if (!valid_sam_header_type(fp->line.s)) { |
1944 | 6 | hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO"); |
1945 | 6 | warn_if_known_stderr(fp->line.s); |
1946 | 6 | goto error; |
1947 | 6 | } |
1948 | | |
1949 | 3.07M | if (kputsn(fp->line.s, fp->line.l, &str) < 0) |
1950 | 0 | goto error; |
1951 | | |
1952 | 3.07M | if (kputc('\n', &str) < 0) |
1953 | 0 | goto error; |
1954 | | |
1955 | 3.07M | if (fp->is_bgzf) { |
1956 | 3.07M | next_c = bgzf_peek(fp->fp.bgzf); |
1957 | 3.07M | } else { |
1958 | 5.50k | unsigned char nc; |
1959 | 5.50k | ssize_t pret = hpeek(fp->fp.hfile, &nc, 1); |
1960 | 5.50k | next_c = pret > 0 ? nc : pret - 1; |
1961 | 5.50k | } |
1962 | 3.07M | if (next_c < -1) |
1963 | 1 | goto error; |
1964 | 3.07M | } |
1965 | 494 | if (next_c != '@') |
1966 | 485 | fp->line.l = 0; |
1967 | | |
1968 | 494 | if (ret < -1) |
1969 | 9 | goto error; |
1970 | | |
1971 | 485 | if (!has_SQ && fp->fn_aux) { |
1972 | 0 | kstring_t line = { 0, 0, NULL }; |
1973 | | |
1974 | | /* The reference index (.fai) is actually needed here */ |
1975 | 0 | char *fai_fn = fp->fn_aux; |
1976 | 0 | char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM); |
1977 | 0 | if (fn_delim) |
1978 | 0 | fai_fn = fn_delim + strlen(HTS_IDX_DELIM); |
1979 | |
|
1980 | 0 | hFILE* f = hopen(fai_fn, "r"); |
1981 | 0 | int e = 0, absent; |
1982 | 0 | if (f == NULL) |
1983 | 0 | goto error; |
1984 | | |
1985 | 0 | while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) { |
1986 | 0 | char* tab = strchr(line.s, '\t'); |
1987 | 0 | hts_pos_t ln; |
1988 | |
|
1989 | 0 | if (tab == NULL) |
1990 | 0 | continue; |
1991 | | |
1992 | 0 | sn = (char*)calloc(tab-line.s+1, 1); |
1993 | 0 | if (!sn) { |
1994 | 0 | e = 1; |
1995 | 0 | break; |
1996 | 0 | } |
1997 | 0 | memcpy(sn, line.s, tab-line.s); |
1998 | 0 | k = kh_put(s2i, d, sn, &absent); |
1999 | 0 | if (absent < 0) { |
2000 | 0 | e = 1; |
2001 | 0 | break; |
2002 | 0 | } |
2003 | | |
2004 | 0 | ln = strtoll(tab, NULL, 10); |
2005 | |
|
2006 | 0 | if (!absent) { |
2007 | 0 | hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn); |
2008 | 0 | free(sn); |
2009 | 0 | sn = NULL; |
2010 | 0 | } else { |
2011 | 0 | sn = NULL; |
2012 | 0 | if (ln >= UINT32_MAX) { |
2013 | | // Stash away ref length that |
2014 | | // doesn't fit in target_len array |
2015 | 0 | khint_t k2; |
2016 | 0 | int absent = -1; |
2017 | 0 | if (!long_refs) { |
2018 | 0 | long_refs = kh_init(s2i); |
2019 | 0 | if (!long_refs) { |
2020 | 0 | e = 1; |
2021 | 0 | break; |
2022 | 0 | } |
2023 | 0 | } |
2024 | 0 | k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); |
2025 | 0 | if (absent < 0) { |
2026 | 0 | e = 1; |
2027 | 0 | break; |
2028 | 0 | } |
2029 | 0 | kh_val(long_refs, k2) = ln; |
2030 | 0 | kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 |
2031 | 0 | | UINT32_MAX); |
2032 | 0 | } else { |
2033 | 0 | kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; |
2034 | 0 | } |
2035 | 0 | has_SQ = 1; |
2036 | 0 | } |
2037 | | |
2038 | 0 | e |= kputs("@SQ\tSN:", &str) < 0; |
2039 | 0 | e |= kputsn(line.s, tab - line.s, &str) < 0; |
2040 | 0 | e |= kputs("\tLN:", &str) < 0; |
2041 | 0 | e |= kputll(ln, &str) < 0; |
2042 | 0 | e |= kputc('\n', &str) < 0; |
2043 | 0 | if (e) |
2044 | 0 | break; |
2045 | 0 | } |
2046 | |
|
2047 | 0 | ks_free(&line); |
2048 | 0 | if (hclose(f) != 0) { |
2049 | 0 | hts_log_error("Error on closing %s", fai_fn); |
2050 | 0 | e = 1; |
2051 | 0 | } |
2052 | 0 | if (e) |
2053 | 0 | goto error; |
2054 | 0 | } |
2055 | | |
2056 | 485 | if (has_SQ) { |
2057 | | // Populate the targets array |
2058 | 277 | h->n_targets = kh_size(d); |
2059 | | |
2060 | 277 | h->target_name = (char**) malloc(sizeof(char*) * h->n_targets); |
2061 | 277 | if (!h->target_name) { |
2062 | 0 | h->n_targets = 0; |
2063 | 0 | goto error; |
2064 | 0 | } |
2065 | | |
2066 | 277 | h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets); |
2067 | 277 | if (!h->target_len) { |
2068 | 0 | h->n_targets = 0; |
2069 | 0 | goto error; |
2070 | 0 | } |
2071 | | |
2072 | 3.80k | for (k = kh_begin(d); k != kh_end(d); ++k) { |
2073 | 3.52k | if (!kh_exist(d, k)) |
2074 | 1.89k | continue; |
2075 | | |
2076 | 1.62k | h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k); |
2077 | 1.62k | h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL; |
2078 | 1.62k | kh_val(d, k) >>= 32; |
2079 | 1.62k | } |
2080 | 277 | } |
2081 | | |
2082 | | // Repurpose sdict to hold any references longer than UINT32_MAX |
2083 | 485 | h->sdict = long_refs; |
2084 | | |
2085 | 485 | kh_destroy(s2i, d); |
2086 | | |
2087 | 485 | if (str.l == 0) |
2088 | 0 | kputsn("", 0, &str); |
2089 | 485 | h->l_text = str.l; |
2090 | 485 | h->text = ks_release(&str); |
2091 | 485 | fp->bam_header = sam_hdr_sanitise(h); |
2092 | 485 | fp->bam_header->ref_count = 1; |
2093 | | |
2094 | 485 | return fp->bam_header; |
2095 | | |
2096 | 16 | error: |
2097 | 16 | if (h && d && (!h->target_name || !h->target_len)) { |
2098 | 1.58k | for (k = kh_begin(d); k != kh_end(d); ++k) |
2099 | 1.56k | if (kh_exist(d, k)) free((void *)kh_key(d, k)); |
2100 | 16 | } |
2101 | 16 | sam_hdr_destroy(h); |
2102 | 16 | ks_free(&str); |
2103 | 16 | kh_destroy(s2i, d); |
2104 | 16 | kh_destroy(s2i, long_refs); |
2105 | 16 | if (sn) free(sn); |
2106 | 16 | return NULL; |
2107 | 485 | } |
2108 | | |
2109 | | sam_hdr_t *sam_hdr_read(htsFile *fp) |
2110 | 919 | { |
2111 | 919 | if (!fp) { |
2112 | 0 | errno = EINVAL; |
2113 | 0 | return NULL; |
2114 | 0 | } |
2115 | | |
2116 | 919 | switch (fp->format.format) { |
2117 | 52 | case bam: |
2118 | 52 | return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); |
2119 | | |
2120 | 245 | case cram: |
2121 | 245 | return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header)); |
2122 | | |
2123 | 501 | case sam: |
2124 | 501 | return sam_hdr_create(fp); |
2125 | | |
2126 | 28 | case fastq_format: |
2127 | 121 | case fasta_format: |
2128 | 121 | return sam_hdr_init(); |
2129 | | |
2130 | 0 | case empty_format: |
2131 | 0 | errno = EPIPE; |
2132 | 0 | return NULL; |
2133 | | |
2134 | 0 | default: |
2135 | 0 | errno = EFTYPE; |
2136 | 0 | return NULL; |
2137 | 919 | } |
2138 | 919 | } |
2139 | | |
2140 | | int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) |
2141 | 885 | { |
2142 | 885 | if (!fp || !h) { |
2143 | 0 | errno = EINVAL; |
2144 | 0 | return -1; |
2145 | 0 | } |
2146 | | |
2147 | 885 | switch (fp->format.format) { |
2148 | 0 | case binary_format: |
2149 | 0 | fp->format.category = sequence_data; |
2150 | 0 | fp->format.format = bam; |
2151 | | /* fall-through */ |
2152 | 0 | case bam: |
2153 | 0 | if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1; |
2154 | 0 | break; |
2155 | | |
2156 | 0 | case cram: { |
2157 | 0 | cram_fd *fd = fp->fp.cram; |
2158 | 0 | if (cram_set_header2(fd, h) < 0) return -1; |
2159 | 0 | if (fp->fn_aux) |
2160 | 0 | cram_load_reference(fd, fp->fn_aux); |
2161 | 0 | if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; |
2162 | 0 | } |
2163 | 0 | break; |
2164 | | |
2165 | 885 | case text_format: |
2166 | 885 | fp->format.category = sequence_data; |
2167 | 885 | fp->format.format = sam; |
2168 | | /* fall-through */ |
2169 | 885 | case sam: { |
2170 | 885 | if (!h->hrecs && !h->text) |
2171 | 0 | return 0; |
2172 | 885 | char *text; |
2173 | 885 | kstring_t hdr_ks = { 0, 0, NULL }; |
2174 | 885 | size_t l_text; |
2175 | 885 | ssize_t bytes; |
2176 | 885 | int r = 0, no_sq = 0; |
2177 | | |
2178 | 885 | if (h->hrecs) { |
2179 | 766 | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) |
2180 | 0 | return -1; |
2181 | 766 | text = hdr_ks.s; |
2182 | 766 | l_text = hdr_ks.l; |
2183 | 766 | } else { |
2184 | 119 | const char *p = NULL; |
2185 | 151 | do { |
2186 | 151 | const char *q = p == NULL ? h->text : p + 4; |
2187 | 151 | p = strstr(q, "@SQ\t"); |
2188 | 151 | } while (!(p == NULL || p == h->text || *(p - 1) == '\n')); |
2189 | 119 | no_sq = p == NULL; |
2190 | 119 | text = h->text; |
2191 | 119 | l_text = h->l_text; |
2192 | 119 | } |
2193 | | |
2194 | 885 | if (fp->is_bgzf) { |
2195 | 0 | bytes = bgzf_write(fp->fp.bgzf, text, l_text); |
2196 | 885 | } else { |
2197 | 885 | bytes = hwrite(fp->fp.hfile, text, l_text); |
2198 | 885 | } |
2199 | 885 | free(hdr_ks.s); |
2200 | 885 | if (bytes != l_text) |
2201 | 0 | return -1; |
2202 | | |
2203 | 885 | if (no_sq) { |
2204 | 53 | int i; |
2205 | 383 | for (i = 0; i < h->n_targets; ++i) { |
2206 | 330 | fp->line.l = 0; |
2207 | 330 | r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0; |
2208 | 330 | r |= kputs(h->target_name[i], &fp->line) < 0; |
2209 | 330 | r |= kputsn("\tLN:", 4, &fp->line) < 0; |
2210 | 330 | r |= kputw(h->target_len[i], &fp->line) < 0; |
2211 | 330 | r |= kputc('\n', &fp->line) < 0; |
2212 | 330 | if (r != 0) |
2213 | 0 | return -1; |
2214 | | |
2215 | 330 | if (fp->is_bgzf) { |
2216 | 0 | bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); |
2217 | 330 | } else { |
2218 | 330 | bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); |
2219 | 330 | } |
2220 | 330 | if (bytes != fp->line.l) |
2221 | 0 | return -1; |
2222 | 330 | } |
2223 | 53 | } |
2224 | 885 | if (fp->is_bgzf) { |
2225 | 0 | if (bgzf_flush(fp->fp.bgzf) != 0) return -1; |
2226 | 885 | } else { |
2227 | 885 | if (hflush(fp->fp.hfile) != 0) return -1; |
2228 | 885 | } |
2229 | 885 | } |
2230 | 885 | break; |
2231 | | |
2232 | 885 | case fastq_format: |
2233 | 0 | case fasta_format: |
2234 | | // Nothing to output; FASTQ has no file headers. |
2235 | 0 | break; |
2236 | | |
2237 | 0 | default: |
2238 | 0 | errno = EBADF; |
2239 | 0 | return -1; |
2240 | 885 | } |
2241 | 885 | return 0; |
2242 | 885 | } |
2243 | | |
2244 | | static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2245 | 0 | { |
2246 | 0 | char *p, *q, *beg = NULL, *end = NULL, *newtext; |
2247 | 0 | size_t new_l_text; |
2248 | 0 | if (!h || !key) |
2249 | 0 | return -1; |
2250 | | |
2251 | 0 | if (h->l_text > 3) { |
2252 | 0 | if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists |
2253 | 0 | if ((p = strchr(h->text, '\n')) == 0) return -1; |
2254 | 0 | *p = '\0'; // for strstr call |
2255 | |
|
2256 | 0 | char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' }; |
2257 | |
|
2258 | 0 | if ((q = strstr(h->text, tmp)) != 0) { // key exists |
2259 | 0 | *p = '\n'; // change back |
2260 | | |
2261 | | // mark the key:val |
2262 | 0 | beg = q; |
2263 | 0 | for (q += 4; *q != '\n' && *q != '\t'; ++q); |
2264 | 0 | end = q; |
2265 | |
|
2266 | 0 | if (val && (strncmp(beg + 4, val, end - beg - 4) == 0) |
2267 | 0 | && strlen(val) == end - beg - 4) |
2268 | 0 | return 0; // val is the same, no need to change |
2269 | |
|
2270 | 0 | } else { |
2271 | 0 | beg = end = p; |
2272 | 0 | *p = '\n'; |
2273 | 0 | } |
2274 | 0 | } |
2275 | 0 | } |
2276 | 0 | if (beg == NULL) { // no @HD |
2277 | 0 | new_l_text = h->l_text; |
2278 | 0 | if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9) |
2279 | 0 | return -1; |
2280 | 0 | new_l_text += strlen(SAM_FORMAT_VERSION) + 8; |
2281 | 0 | if (val) { |
2282 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2283 | 0 | return -1; |
2284 | 0 | new_l_text += strlen(val) + 4; |
2285 | 0 | } |
2286 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2287 | 0 | if (!newtext) return -1; |
2288 | | |
2289 | 0 | if (val) |
2290 | 0 | snprintf(newtext, new_l_text + 1, |
2291 | 0 | "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); |
2292 | 0 | else |
2293 | 0 | snprintf(newtext, new_l_text + 1, |
2294 | 0 | "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); |
2295 | 0 | } else { // has @HD but different or no key |
2296 | 0 | new_l_text = (beg - h->text) + (h->text + h->l_text - end); |
2297 | 0 | if (val) { |
2298 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2299 | 0 | return -1; |
2300 | 0 | new_l_text += strlen(val) + 4; |
2301 | 0 | } |
2302 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2303 | 0 | if (!newtext) return -1; |
2304 | | |
2305 | 0 | if (val) { |
2306 | 0 | snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s", |
2307 | 0 | (int) (beg - h->text), h->text, key, val, end); |
2308 | 0 | } else { //delete key |
2309 | 0 | snprintf(newtext, new_l_text + 1, "%.*s%s", |
2310 | 0 | (int) (beg - h->text), h->text, end); |
2311 | 0 | } |
2312 | 0 | } |
2313 | 0 | free(h->text); |
2314 | 0 | h->text = newtext; |
2315 | 0 | h->l_text = new_l_text; |
2316 | 0 | return 0; |
2317 | 0 | } |
2318 | | |
2319 | | |
2320 | | int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2321 | 0 | { |
2322 | 0 | if (!h || !key) |
2323 | 0 | return -1; |
2324 | | |
2325 | 0 | if (!h->hrecs) |
2326 | 0 | return old_sam_hdr_change_HD(h, key, val); |
2327 | | |
2328 | 0 | if (val) { |
2329 | 0 | if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0) |
2330 | 0 | return -1; |
2331 | 0 | } else { |
2332 | 0 | if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0) |
2333 | 0 | return -1; |
2334 | 0 | } |
2335 | 0 | return sam_hdr_rebuild(h); |
2336 | 0 | } |
2337 | | /********************** |
2338 | | *** SAM record I/O *** |
2339 | | **********************/ |
2340 | | |
2341 | | static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end, |
2342 | | char *r, bam1_t *b) |
2343 | 1.20M | { |
2344 | 1.20M | int orig_l = b->l_data; |
2345 | 1.20M | char *q = in; |
2346 | 1.20M | int32_t size; |
2347 | 1.20M | size_t bytes; |
2348 | 1.20M | int overflow = 0; |
2349 | | |
2350 | 1.20M | size = aux_type2size(type); |
2351 | 1.20M | if (size <= 0 || size > 4) { |
2352 | 2 | hts_log_error("Unrecognized type B:%c", type); |
2353 | 2 | return -1; |
2354 | 2 | } |
2355 | | |
2356 | | // Ensure space for type + values |
2357 | 1.20M | bytes = (size_t) n * (size_t) size; |
2358 | 1.20M | if (bytes / size != n |
2359 | 1.20M | || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) { |
2360 | 0 | hts_log_error("Out of memory"); |
2361 | 0 | return -1; |
2362 | 0 | } |
2363 | | |
2364 | 1.20M | b->data[b->l_data++] = 'B'; |
2365 | 1.20M | b->data[b->l_data++] = type; |
2366 | 1.20M | i32_to_le(n, b->data + b->l_data); |
2367 | 1.20M | b->l_data += sizeof(uint32_t); |
2368 | | // This ensures that q always ends up at the next comma after |
2369 | | // reading a number even if it's followed by junk. It |
2370 | | // prevents the possibility of trying to read more than n items. |
2371 | 328M | #define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0) |
2372 | 1.20M | if (type == 'c') { |
2373 | 70.2M | while (q < r) { |
2374 | 69.9M | *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, &overflow); |
2375 | 69.9M | b->l_data++; |
2376 | 69.9M | skip_to_comma_(q); |
2377 | 69.9M | } |
2378 | 903k | } else if (type == 'C') { |
2379 | 21.2M | while (q < r) { |
2380 | 20.7M | if (*q != '-') { |
2381 | 20.7M | *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, &overflow); |
2382 | 20.7M | b->l_data++; |
2383 | 20.7M | } else { |
2384 | 0 | overflow = 1; |
2385 | 0 | } |
2386 | 20.7M | skip_to_comma_(q); |
2387 | 20.7M | } |
2388 | 453k | } else if (type == 's') { |
2389 | 2.96M | while (q < r) { |
2390 | 2.82M | i16_to_le(hts_str2int(q + 1, &q, 16, &overflow), b->data + b->l_data); |
2391 | 2.82M | b->l_data += 2; |
2392 | 2.82M | skip_to_comma_(q); |
2393 | 2.82M | } |
2394 | 311k | } else if (type == 'S') { |
2395 | 522k | while (q < r) { |
2396 | 521k | if (*q != '-') { |
2397 | 521k | u16_to_le(hts_str2uint(q + 1, &q, 16, &overflow), b->data + b->l_data); |
2398 | 521k | b->l_data += 2; |
2399 | 521k | } else { |
2400 | 0 | overflow = 1; |
2401 | 0 | } |
2402 | 521k | skip_to_comma_(q); |
2403 | 521k | } |
2404 | 310k | } else if (type == 'i') { |
2405 | 68.6M | while (q < r) { |
2406 | 68.4M | i32_to_le(hts_str2int(q + 1, &q, 32, &overflow), b->data + b->l_data); |
2407 | 68.4M | b->l_data += 4; |
2408 | 68.4M | skip_to_comma_(q); |
2409 | 68.4M | } |
2410 | 247k | } else if (type == 'I') { |
2411 | 18.1M | while (q < r) { |
2412 | 18.1M | if (*q != '-') { |
2413 | 18.1M | u32_to_le(hts_str2uint(q + 1, &q, 32, &overflow), b->data + b->l_data); |
2414 | 18.1M | b->l_data += 4; |
2415 | 18.1M | } else { |
2416 | 0 | overflow = 1; |
2417 | 0 | } |
2418 | 18.1M | skip_to_comma_(q); |
2419 | 18.1M | } |
2420 | 62.6k | } else if (type == 'f') { |
2421 | 348k | while (q < r) { |
2422 | 347k | float_to_le(strtod(q + 1, &q), b->data + b->l_data); |
2423 | 347k | b->l_data += 4; |
2424 | 347k | skip_to_comma_(q); |
2425 | 347k | } |
2426 | 199 | } else { |
2427 | 0 | hts_log_error("Unrecognized type B:%c", type); |
2428 | 0 | return -1; |
2429 | 0 | } |
2430 | | |
2431 | 1.20M | if (!overflow) { |
2432 | 757k | *end = q; |
2433 | 757k | return 0; |
2434 | 757k | } else { |
2435 | 446k | int64_t max = 0, min = 0, val; |
2436 | | // Given type was incorrect. Try to rescue the situation. |
2437 | 446k | q = in; |
2438 | 446k | overflow = 0; |
2439 | 446k | b->l_data = orig_l; |
2440 | | // Find out what range of values is present |
2441 | 89.7M | while (q < r) { |
2442 | 89.3M | val = hts_str2int(q + 1, &q, 64, &overflow); |
2443 | 89.3M | if (max < val) max = val; |
2444 | 89.3M | if (min > val) min = val; |
2445 | 89.3M | skip_to_comma_(q); |
2446 | 89.3M | } |
2447 | | // Retry with appropriate type |
2448 | 446k | if (!overflow) { |
2449 | 446k | if (min < 0) { |
2450 | 383k | if (min >= INT8_MIN && max <= INT8_MAX) { |
2451 | 0 | return sam_parse_B_vals('c', n, in, end, r, b); |
2452 | 383k | } else if (min >= INT16_MIN && max <= INT16_MAX) { |
2453 | 136k | return sam_parse_B_vals('s', n, in, end, r, b); |
2454 | 246k | } else if (min >= INT32_MIN && max <= INT32_MAX) { |
2455 | 246k | return sam_parse_B_vals('i', n, in, end, r, b); |
2456 | 246k | } |
2457 | 383k | } else { |
2458 | 63.0k | if (max < UINT8_MAX) { |
2459 | 9 | return sam_parse_B_vals('C', n, in, end, r, b); |
2460 | 63.0k | } else if (max <= UINT16_MAX) { |
2461 | 583 | return sam_parse_B_vals('S', n, in, end, r, b); |
2462 | 62.4k | } else if (max <= UINT32_MAX) { |
2463 | 62.4k | return sam_parse_B_vals('I', n, in, end, r, b); |
2464 | 62.4k | } |
2465 | 63.0k | } |
2466 | 446k | } |
2467 | | // If here then at least one of the values is too big to store |
2468 | 14 | hts_log_error("Numeric value in B array out of allowed range"); |
2469 | 14 | return -1; |
2470 | 446k | } |
2471 | 1.20M | #undef skip_to_comma_ |
2472 | 1.20M | } |
2473 | | |
/*
 * Parse the FLAG column of a SAM record.
 *
 * v         start of the field
 * rv        set to the first character after the parsed value
 * overflow  set to 1 when the value does not fit in 16 bits
 *
 * A leading '1'-'9' is parsed as a 16-bit decimal number.  A leading '0'
 * is either the single digit zero (when followed by a tab) or is handed to
 * strtoul() with base 0, which accepts octal and hexadecimal.  Any other
 * first character yields 0 with *rv left at v.
 */
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char first = *v;

    if (first == '0') {
        // handle single-digit "0" directly; otherwise it's hex or octal
        if (v[1] == '\t') {
            *rv = v + 1;
            return 0;
        }

        unsigned long parsed = strtoul(v, rv, 0);
        if (parsed > 65535) {
            *overflow = 1;
            return 65535;
        }
        return parsed;
    }

    if (first >= '1' && first <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    // TODO implement symbolic flag letters
    *rv = v;
    return 0;
}
2493 | | |
2494 | | // Parse tag line and append to bam object b. |
2495 | | // Shared by both SAM and FASTQ parsers. |
2496 | | // |
2497 | | // The difference between the two is how lenient we are to recognising |
2498 | | // non-compliant strings. The FASTQ parser glosses over arbitrary |
2499 | | // non-SAM looking strings. |
2500 | | static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, |
2501 | 5.70k | khash_t(tag) *tag_whitelist) { |
2502 | 5.70k | int overflow = 0; |
2503 | 5.70k | int checkpoint; |
2504 | 5.70k | char logbuf[40]; |
2505 | 5.70k | char *q = start, *p = end; |
2506 | | |
2507 | 5.70k | #define _parse_err(cond, ...) \ |
2508 | 5.98M | do { \ |
2509 | 14.0M | if (cond) { \ |
2510 | 39 | if (lenient) { \ |
2511 | 0 | while (q < p && !isspace_c(*q)) \ |
2512 | 0 | q++; \ |
2513 | 0 | while (q < p && isspace_c(*q)) \ |
2514 | 0 | q++; \ |
2515 | 0 | b->l_data = checkpoint; \ |
2516 | 0 | goto loop; \ |
2517 | 39 | } else { \ |
2518 | 39 | hts_log_error(__VA_ARGS__); \ |
2519 | 39 | goto err_ret; \ |
2520 | 39 | } \ |
2521 | 39 | } \ |
2522 | 5.98M | } while (0) |
2523 | | |
2524 | 5.22M | while (q < p) loop: { |
2525 | 5.22M | char type; |
2526 | 5.22M | checkpoint = b->l_data; |
2527 | 5.22M | if (p - q < 5) { |
2528 | 6 | if (lenient) { |
2529 | 0 | break; |
2530 | 6 | } else { |
2531 | 6 | hts_log_error("Incomplete aux field"); |
2532 | 6 | goto err_ret; |
2533 | 6 | } |
2534 | 6 | } |
2535 | 2.61M | _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id"); |
2536 | | |
2537 | 2.61M | if (lenient && (q[2] | q[4]) != ':') { |
2538 | 0 | while (q < p && !isspace_c(*q)) |
2539 | 0 | q++; |
2540 | 0 | while (q < p && isspace_c(*q)) |
2541 | 0 | q++; |
2542 | 0 | continue; |
2543 | 0 | } |
2544 | | |
2545 | 2.61M | if (tag_whitelist) { |
2546 | 0 | int tt = q[0]*256 + q[1]; |
2547 | 0 | if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) { |
2548 | 0 | while (q < p && *q != '\t') |
2549 | 0 | q++; |
2550 | 0 | continue; |
2551 | 0 | } |
2552 | 0 | } |
2553 | | |
2554 | | // Copy over id |
2555 | 2.61M | if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; |
2556 | 2.61M | memcpy(b->data + b->l_data, q, 2); b->l_data += 2; |
2557 | 2.61M | q += 3; type = *q++; ++q; // q points to value |
2558 | 2.61M | if (type != 'Z' && type != 'H') // the only zero length acceptable fields |
2559 | 2.05M | _parse_err(*q <= '\t', "incomplete aux field"); |
2560 | | |
2561 | | // Ensure enough space for a double + type allocated. |
2562 | 2.61M | if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; |
2563 | | |
2564 | 2.61M | if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { |
2565 | 258k | b->data[b->l_data++] = 'A'; |
2566 | 258k | b->data[b->l_data++] = *q++; |
2567 | 2.35M | } else if (type == 'i' || type == 'I') { |
2568 | 1.02M | if (*q == '-') { |
2569 | 686k | int32_t x = hts_str2int(q, &q, 32, &overflow); |
2570 | 686k | if (x >= INT8_MIN) { |
2571 | 298k | b->data[b->l_data++] = 'c'; |
2572 | 298k | b->data[b->l_data++] = x; |
2573 | 387k | } else if (x >= INT16_MIN) { |
2574 | 135k | b->data[b->l_data++] = 's'; |
2575 | 135k | i16_to_le(x, b->data + b->l_data); |
2576 | 135k | b->l_data += 2; |
2577 | 252k | } else { |
2578 | 252k | b->data[b->l_data++] = 'i'; |
2579 | 252k | i32_to_le(x, b->data + b->l_data); |
2580 | 252k | b->l_data += 4; |
2581 | 252k | } |
2582 | 686k | } else { |
2583 | 338k | uint32_t x = hts_str2uint(q, &q, 32, &overflow); |
2584 | 338k | if (x <= UINT8_MAX) { |
2585 | 68.2k | b->data[b->l_data++] = 'C'; |
2586 | 68.2k | b->data[b->l_data++] = x; |
2587 | 270k | } else if (x <= UINT16_MAX) { |
2588 | 176k | b->data[b->l_data++] = 'S'; |
2589 | 176k | u16_to_le(x, b->data + b->l_data); |
2590 | 176k | b->l_data += 2; |
2591 | 176k | } else { |
2592 | 93.9k | b->data[b->l_data++] = 'I'; |
2593 | 93.9k | u32_to_le(x, b->data + b->l_data); |
2594 | 93.9k | b->l_data += 4; |
2595 | 93.9k | } |
2596 | 338k | } |
2597 | 1.32M | } else if (type == 'f') { |
2598 | 335 | b->data[b->l_data++] = 'f'; |
2599 | 335 | float_to_le(strtod(q, &q), b->data + b->l_data); |
2600 | 335 | b->l_data += sizeof(float); |
2601 | 1.32M | } else if (type == 'd') { |
2602 | 13.5k | b->data[b->l_data++] = 'd'; |
2603 | 13.5k | double_to_le(strtod(q, &q), b->data + b->l_data); |
2604 | 13.5k | b->l_data += sizeof(double); |
2605 | 1.31M | } else if (type == 'Z' || type == 'H') { |
2606 | 555k | char *end = strchr(q, '\t'); |
2607 | 555k | if (!end) end = q + strlen(q); |
2608 | 555k | _parse_err(type == 'H' && ((end-q)&1) != 0, |
2609 | 555k | "hex field does not have an even number of digits"); |
2610 | 555k | b->data[b->l_data++] = type; |
2611 | 555k | if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; |
2612 | 555k | memcpy(b->data + b->l_data, q, end - q); |
2613 | 555k | b->l_data += end - q; |
2614 | 555k | b->data[b->l_data++] = '\0'; |
2615 | 555k | q = end; |
2616 | 757k | } else if (type == 'B') { |
2617 | 757k | uint32_t n; |
2618 | 757k | char *r; |
2619 | 757k | type = *q++; // q points to the first ',' following the typing byte |
2620 | 757k | _parse_err(*q && *q != ',' && *q != '\t', |
2621 | 757k | "B aux field type not followed by ','"); |
2622 | | |
2623 | 117M | for (r = q, n = 0; *r > '\t'; ++r) |
2624 | 116M | if (*r == ',') ++n; |
2625 | | |
2626 | 757k | if (sam_parse_B_vals(type, n, q, &q, r, b) < 0) |
2627 | 16 | goto err_ret; |
2628 | 757k | } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); |
2629 | | |
2630 | 24.9M | while (*q > '\t') { q++; } // Skip any junk to next tab |
2631 | 2.61M | q++; |
2632 | 2.61M | } |
2633 | | |
2634 | 5.64k | _parse_err(!lenient && overflow != 0, "numeric value out of allowed range"); |
2635 | 5.64k | #undef _parse_err |
2636 | | |
2637 | 5.64k | return 0; |
2638 | | |
2639 | 61 | err_ret: |
2640 | 61 | return -2; |
2641 | 5.64k | } |
2642 | | |
2643 | | int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) |
2644 | 5.87k | { |
2645 | 27.5k | #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) |
2646 | | |
2647 | 5.87k | #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff |
2648 | | |
2649 | | // Macro that operates on 64-bits at a time. |
2650 | 5.87k | #define COPY_MINUS_N(to,from,n,l,failed) \ |
2651 | 5.87k | do { \ |
2652 | 5.69k | uint64_u *from8 = (uint64_u *)(from); \ |
2653 | 5.69k | uint64_u *to8 = (uint64_u *)(to); \ |
2654 | 5.69k | uint64_t uflow = 0; \ |
2655 | 5.69k | size_t l8 = (l)>>3, i; \ |
2656 | 5.90k | for (i = 0; i < l8; i++) { \ |
2657 | 219 | to8[i] = from8[i] - (n)*0x0101010101010101UL; \ |
2658 | 219 | uflow |= to8[i]; \ |
2659 | 219 | } \ |
2660 | 6.10k | for (i<<=3; i < (l); ++i) { \ |
2661 | 412 | to[i] = from[i] - (n); \ |
2662 | 412 | uflow |= to[i]; \ |
2663 | 412 | } \ |
2664 | 5.69k | failed = (uflow & 0x8080808080808080UL) > 0; \ |
2665 | 5.69k | } while (0) |
2666 | | |
2667 | | #else |
2668 | | |
2669 | | // Basic version which operates a byte at a time |
2670 | | #define COPY_MINUS_N(to,from,n,l,failed) do { \ |
2671 | | uint8_t uflow = 0; \ |
2672 | | for (i = 0; i < (l); ++i) { \ |
2673 | | (to)[i] = (from)[i] - (n); \ |
2674 | | uflow |= (uint8_t) (to)[i]; \ |
2675 | | } \ |
2676 | | failed = (uflow & 0x80) > 0; \ |
2677 | | } while (0) |
2678 | | |
2679 | | #endif |
2680 | | |
2681 | 10.3k | #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l) |
2682 | 66.7k | #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0) |
2683 | 17.9k | #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0) |
2684 | | |
2685 | 5.87k | uint8_t *t; |
2686 | | |
2687 | 5.87k | char *p = s->s, *q; |
2688 | 5.87k | int i, overflow = 0; |
2689 | 5.87k | char logbuf[40]; |
2690 | 5.87k | hts_pos_t cigreflen; |
2691 | 5.87k | bam1_core_t *c = &b->core; |
2692 | | |
2693 | 5.87k | b->l_data = 0; |
2694 | 5.87k | memset(c, 0, 32); |
2695 | | |
2696 | | // qname |
2697 | 5.87k | q = _read_token(p); |
2698 | | |
2699 | 5.81k | _parse_warn(p - q <= 1, "empty query name"); |
2700 | 5.81k | _parse_err(p - q > 255, "query name too long"); |
2701 | | // resize large enough for name + extranul |
2702 | 5.80k | if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret; |
2703 | 5.80k | memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q; |
2704 | | |
2705 | 5.80k | c->l_extranul = (4 - (b->l_data & 3)) & 3; |
2706 | 5.80k | memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul); |
2707 | 5.80k | b->l_data += c->l_extranul; |
2708 | | |
2709 | 5.80k | c->l_qname = p - q + c->l_extranul; |
2710 | | |
2711 | | // flag |
2712 | 5.80k | c->flag = parse_sam_flag(p, &p, &overflow); |
2713 | 5.80k | if (*p++ != '\t') goto err_ret; // malformated flag |
2714 | | |
2715 | | // chr |
2716 | 5.78k | q = _read_token(p); |
2717 | 5.78k | if (strcmp(q, "*")) { |
2718 | 1.55k | _parse_err(h->n_targets == 0, "no SQ lines present in the header"); |
2719 | 1.55k | c->tid = bam_name2id(h, q); |
2720 | 1.55k | _parse_err(c->tid < -1, "failed to parse header"); |
2721 | 1.52k | _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2722 | 4.22k | } else c->tid = -1; |
2723 | | |
2724 | | // pos |
2725 | 5.75k | c->pos = hts_str2uint(p, &p, 63, &overflow) - 1; |
2726 | 5.75k | if (*p++ != '\t') goto err_ret; |
2727 | 5.75k | if (c->pos < 0 && c->tid >= 0) { |
2728 | 105 | _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); |
2729 | 105 | c->tid = -1; |
2730 | 105 | } |
2731 | 5.75k | if (c->tid < 0) c->flag |= BAM_FUNMAP; |
2732 | | |
2733 | | // mapq |
2734 | 5.75k | c->qual = hts_str2uint(p, &p, 8, &overflow); |
2735 | 5.75k | if (*p++ != '\t') goto err_ret; |
2736 | | // cigar |
2737 | 5.75k | if (*p != '*') { |
2738 | 1.28k | uint32_t *cigar = NULL; |
2739 | 1.28k | int old_l_data = b->l_data; |
2740 | 1.28k | int n_cigar = bam_parse_cigar(p, &p, b); |
2741 | 1.28k | if (n_cigar < 1 || *p++ != '\t') goto err_ret; |
2742 | 1.26k | cigar = (uint32_t *)(b->data + old_l_data); |
2743 | 1.26k | c->n_cigar = n_cigar; |
2744 | | |
2745 | | // can't use bam_endpos() directly as some fields not yet set up |
2746 | 1.26k | cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; |
2747 | 1.26k | if (cigreflen == 0) cigreflen = 1; |
2748 | 4.46k | } else { |
2749 | 4.46k | _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); |
2750 | 4.46k | c->flag |= BAM_FUNMAP; |
2751 | 4.46k | q = _read_token(p); |
2752 | 4.46k | cigreflen = 1; |
2753 | 4.46k | } |
2754 | 5.72k | _parse_err(HTS_POS_MAX - cigreflen <= c->pos, |
2755 | 5.72k | "read ends beyond highest supported position"); |
2756 | 5.72k | c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5); |
2757 | | // mate chr |
2758 | 5.72k | q = _read_token(p); |
2759 | 5.72k | if (strcmp(q, "=") == 0) { |
2760 | 0 | c->mtid = c->tid; |
2761 | 5.72k | } else if (strcmp(q, "*") == 0) { |
2762 | 163 | c->mtid = -1; |
2763 | 5.56k | } else { |
2764 | 5.56k | c->mtid = bam_name2id(h, q); |
2765 | 5.56k | _parse_err(c->mtid < -1, "failed to parse header"); |
2766 | 5.55k | _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2767 | 5.55k | } |
2768 | | // mpos |
2769 | 5.71k | c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1; |
2770 | 5.71k | if (*p++ != '\t') goto err_ret; |
2771 | 5.70k | if (c->mpos < 0 && c->mtid >= 0) { |
2772 | 525 | _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); |
2773 | 525 | c->mtid = -1; |
2774 | 525 | } |
2775 | | // tlen |
2776 | 5.70k | c->isize = hts_str2int(p, &p, 64, &overflow); |
2777 | 5.70k | if (*p++ != '\t') goto err_ret; |
2778 | | // seq |
2779 | 5.70k | q = _read_token(p); |
2780 | 5.70k | if (strcmp(q, "*")) { |
2781 | 4.68k | _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long"); |
2782 | 4.68k | c->l_qseq = p - q - 1; |
2783 | 4.68k | hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname)); |
2784 | 4.68k | _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length"); |
2785 | 4.68k | i = (c->l_qseq + 1) >> 1; |
2786 | 4.68k | _get_mem(uint8_t, &t, b, i); |
2787 | | |
2788 | 4.68k | unsigned int lqs2 = c->l_qseq&~1, i; |
2789 | 45.4k | for (i = 0; i < lqs2; i+=2) |
2790 | 40.7k | t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]]; |
2791 | 4.91k | for (; i < c->l_qseq; ++i) |
2792 | 230 | t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2); |
2793 | 4.68k | } else c->l_qseq = 0; |
2794 | | // qual |
2795 | 11.4k | _get_mem(uint8_t, &t, b, c->l_qseq); |
2796 | 11.4k | if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) { |
2797 | 16 | memset(t, 0xff, c->l_qseq); |
2798 | 16 | p += 2; |
2799 | 5.69k | } else { |
2800 | 5.69k | int failed = 0; |
2801 | 5.69k | _parse_err(s->l - (p - s->s) < c->l_qseq |
2802 | 5.69k | || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'), |
2803 | 5.69k | "SEQ and QUAL are of different length"); |
2804 | 5.69k | COPY_MINUS_N(t, p, 33, c->l_qseq, failed); |
2805 | 5.69k | _parse_err(failed, "invalid QUAL character"); |
2806 | 5.68k | p += c->l_qseq + 1; |
2807 | 5.68k | } |
2808 | | |
2809 | | // aux |
2810 | 5.70k | if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0) |
2811 | 61 | goto err_ret; |
2812 | | |
2813 | 5.64k | if (bam_tag2cigar(b, 1, 1) < 0) |
2814 | 0 | return -2; |
2815 | 5.64k | return 0; |
2816 | | |
2817 | 0 | #undef _parse_warn |
2818 | 0 | #undef _parse_err |
2819 | 0 | #undef _get_mem |
2820 | 0 | #undef _read_token |
2821 | 233 | err_ret: |
2822 | 233 | return -2; |
2823 | 5.64k | } |
2824 | | |
/*
 * Count the CIGAR operations in the text starting at q.  Every character
 * that is not a digit, up to the first tab or NUL, counts as one operation.
 *
 * Returns the count, or 0 (after logging an error) when there are no
 * operations or when the count reaches 2147483647.
 */
static uint32_t read_ncigar(const char *q) {
    const char *cp = q;
    uint32_t n_ops = 0;

    while (*cp != '\0' && *cp != '\t') {
        if (!isdigit_c(*cp))
            n_ops++;
        cp++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }
    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
2840 | | |
2841 | | /*! @function |
2842 | | @abstract Parse a CIGAR string into preallocated a uint32_t array |
2843 | | @param in [in] pointer to the source string |
2844 | | @param a_cigar [out] address of the destination uint32_t buffer |
2845 | | @return number of processed input characters; 0 on error |
2846 | | */ |
2847 | 1.28k | static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { |
2848 | 1.28k | int i, overflow = 0; |
2849 | 1.28k | const char *p = in; |
2850 | 3.19k | for (i = 0; i < n_cigar; i++) { |
2851 | 1.92k | uint32_t len; |
2852 | 1.92k | int op; |
2853 | 1.92k | char *q; |
2854 | 1.92k | len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT; |
2855 | 1.92k | if (q == p) { |
2856 | 12 | hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p); |
2857 | 12 | return 0; |
2858 | 12 | } |
2859 | 1.91k | if (overflow) { |
2860 | 0 | hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p); |
2861 | 0 | return 0; |
2862 | 0 | } |
2863 | 1.91k | p = q; |
2864 | 1.91k | op = bam_cigar_table[(unsigned char)*p++]; |
2865 | 1.91k | if (op < 0) { |
2866 | 1 | hts_log_error("Unrecognized CIGAR operator"); |
2867 | 1 | return 0; |
2868 | 1 | } |
2869 | 1.91k | a_cigar[i] = len; |
2870 | 1.91k | a_cigar[i] |= op; |
2871 | 1.91k | } |
2872 | | |
2873 | 1.26k | return p-in; |
2874 | 1.28k | } |
2875 | | |
/*
 * Parse the CIGAR text at "in" into the caller-owned array *a_cigar, whose
 * capacity in elements is *a_mem.  The array is grown with realloc() when it
 * is too small, and *a_cigar / *a_mem are updated to match.
 *
 * When "end" is non-NULL it is first set to "in" and, on success, to the
 * first character after the parsed text (after the '*' for an absent CIGAR).
 *
 * Returns the number of CIGAR operations parsed; 0 when the text is "*" or
 * read_ncigar() reports no usable operations; -1 on NULL arguments,
 * allocation failure or a parse error.
 */
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    size_t n_ops;
    int consumed;

    if (in == NULL || a_cigar == NULL || a_mem == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }

    if (end != NULL)
        *end = (char *)in;

    if (in[0] == '*') {
        if (end != NULL)
            *end += 1;
        return 0;
    }

    n_ops = read_ncigar(in);
    if (n_ops == 0)
        return 0;

    if (*a_mem < n_ops) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (grown == NULL) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    consumed = parse_cigar(in, *a_cigar, n_ops);
    if (consumed == 0)
        return -1;

    if (end != NULL)
        *end = (char *)in + consumed;

    return n_ops;
}
2908 | | |
2909 | 1.28k | ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { |
2910 | 1.28k | size_t n_cigar = 0; |
2911 | 1.28k | int diff; |
2912 | | |
2913 | 1.28k | if (!in || !b) { |
2914 | 0 | hts_log_error("NULL pointer arguments"); |
2915 | 0 | return -1; |
2916 | 0 | } |
2917 | 1.28k | if (end) *end = (char *)in; |
2918 | | |
2919 | 1.28k | if (*in == '*') { |
2920 | 0 | if (end) (*end)++; |
2921 | 0 | return 0; |
2922 | 0 | } |
2923 | 1.28k | n_cigar = read_ncigar(in); |
2924 | 1.28k | if (!n_cigar) return 0; |
2925 | 1.28k | if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { |
2926 | 0 | hts_log_error("Memory allocation error"); |
2927 | 0 | return -1; |
2928 | 0 | } |
2929 | | |
2930 | 1.28k | if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1; |
2931 | 1.26k | b->l_data += (n_cigar * sizeof(uint32_t)); |
2932 | 1.26k | if (end) *end = (char *)in+diff; |
2933 | | |
2934 | 1.26k | return n_cigar; |
2935 | 1.28k | } |
2936 | | |
2937 | | /* |
2938 | | * ----------------------------------------------------------------------------- |
2939 | | * SAM threading |
2940 | | */ |
// Size of SAM text block (reading).  sam_dispatcher_read() uses this as the
// initial capacity, in bytes, of each sp_lines buffer.
#define SAM_NBYTES 240000

// Number of BAM records (writing, up to NB_mem in size)
#define SAM_NBAM 1000
2946 | | |
2947 | | struct SAM_state; |
2948 | | |
// Output job - a block of BAM records
typedef struct sp_bams {
    struct sp_bams *next;   // link used while the block sits on a list (e.g. SAM_state.bams)
    int serial;             // copied from the sp_lines block it was parsed from

    bam1_t *bams;           // array of records; each record's data is freed by sam_free_sp_bams()
    int nbams, abams; // used and alloc for bams[] array
    size_t bam_mem;   // very approximate total size

    struct SAM_state *fd;   // NOTE(review): presumably the owning state - not set in the code visible here
} sp_bams;
2960 | | |
// Input job - a block of SAM text
typedef struct sp_lines {
    struct sp_lines *next;  // link used while the block sits on SAM_state.lines; NULL while a job
    int serial;             // sequence number taken from SAM_state.serial at dispatch time

    char *data;             // text buffer; the reader allocates alloc + 8 bytes
    int data_size;          // number of bytes of data[] currently in use
    int alloc;              // nominal capacity of data[] (excludes the 8 spare bytes)

    struct SAM_state *fd;   // state this block belongs to
    sp_bams *bams;          // records associated with this text; freed by cleanup_sp_lines()
} sp_lines;
2973 | | |
// Commands passed to the dispatcher thread through SAM_state.command.
enum sam_cmd {
    SAM_NONE = 0,       // nothing requested
    SAM_CLOSE,          // set by sam_state_destroy() to ask the dispatcher to stop
    SAM_CLOSE_DONE,     // set by sam_dispatcher_read() once it is finishing
};
2979 | | |
// Per-file state for threaded SAM reading and writing; stored in fp->state.
typedef struct SAM_state {
    sam_hdr_t *h;               // header used when parsing; released in sam_state_destroy()

    hts_tpool *p;               // thread pool running the worker jobs
    int own_pool;               // non-zero if sam_state_destroy() may destroy p
    pthread_mutex_t lines_m;    // protects the lines and bams lists below
    hts_tpool_process *q;       // job / result queue on the pool
    pthread_t dispatcher;       // dispatcher thread handle
    int dispatcher_set;         // non-zero once dispatcher is valid and must be joined

    sp_lines *lines;            // spare text blocks available for reuse
    sp_bams *bams;              // spare BAM blocks available for reuse

    sp_bams *curr_bam;          // BAM block currently in use by the caller-facing code
    int curr_idx;               // NOTE(review): presumably the index within curr_bam - confirm
    int serial;                 // next serial number handed to a dispatched text block

    // Be warned: moving these mutexes around in this struct can reduce
    // threading performance by up to 70%!
    pthread_mutex_t command_m;  // protects command and errcode
    pthread_cond_t command_c;   // signalled whenever command changes
    enum sam_cmd command;

    // One of the E* errno codes
    int errcode;

    htsFile *fp;                // back-pointer to the owning file
} SAM_state;
3008 | | |
3009 | | // Returns a SAM_state struct from a generic hFILE. |
3010 | | // |
3011 | | // Returns NULL on failure. |
3012 | 0 | static SAM_state *sam_state_create(htsFile *fp) { |
3013 | | // Ideally sam_open wouldn't be a #define to hts_open but instead would |
3014 | | // be a redirect call with an additional 'S' mode. This in turn would |
3015 | | // correctly set the designed format to sam instead of a generic |
3016 | | // text_format. |
3017 | 0 | if (fp->format.format != sam && fp->format.format != text_format) |
3018 | 0 | return NULL; |
3019 | | |
3020 | 0 | SAM_state *fd = calloc(1, sizeof(*fd)); |
3021 | 0 | if (!fd) |
3022 | 0 | return NULL; |
3023 | | |
3024 | 0 | fp->state = fd; |
3025 | 0 | fd->fp = fp; |
3026 | |
|
3027 | 0 | return fd; |
3028 | 0 | } |
3029 | | |
3030 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); |
3031 | | static void *sam_format_worker(void *arg); |
3032 | | |
3033 | 0 | static void sam_state_err(SAM_state *fd, int errcode) { |
3034 | 0 | pthread_mutex_lock(&fd->command_m); |
3035 | 0 | if (!fd->errcode) |
3036 | 0 | fd->errcode = errcode; |
3037 | 0 | pthread_mutex_unlock(&fd->command_m); |
3038 | 0 | } |
3039 | | |
3040 | 0 | static void sam_free_sp_bams(sp_bams *b) { |
3041 | 0 | if (!b) |
3042 | 0 | return; |
3043 | | |
3044 | 0 | if (b->bams) { |
3045 | 0 | int i; |
3046 | 0 | for (i = 0; i < b->abams; i++) { |
3047 | 0 | if (b->bams[i].data) |
3048 | 0 | free(b->bams[i].data); |
3049 | 0 | } |
3050 | 0 | free(b->bams); |
3051 | 0 | } |
3052 | 0 | free(b); |
3053 | 0 | } |
3054 | | |
// Destroys the state produced by sam_state_create.
//
// When a thread pool and header are attached this asks the dispatcher
// thread to stop, (for writing) dispatches and drains the remaining output,
// joins the dispatcher, then frees the queue, the optional private pool,
// the spare text and BAM blocks and the header reference.
//
// Returns 0 when fp has no state or nothing went wrong, otherwise a
// non-zero value derived from fd->errcode or from hts_tpool_dispatch().
int sam_state_destroy(htsFile *fp) {
    int ret = 0;

    if (!fp->state)
        return 0;

    SAM_state *fd = fp->state;
    if (fd->p) {
        if (fd->h) {
            // Notify sam_dispatcher we're closing
            pthread_mutex_lock(&fd->command_m);
            if (fd->command != SAM_CLOSE_DONE)
                fd->command = SAM_CLOSE;
            pthread_cond_signal(&fd->command_c);
            ret = -fd->errcode;
            if (fd->q)
                hts_tpool_wake_dispatch(fd->q); // unstick the reader

            if (!fp->is_write && fd->q && fd->dispatcher_set) {
                // Poll (command_m is released while sleeping) until the
                // reader has acknowledged with SAM_CLOSE_DONE.
                for (;;) {
                    // Avoid deadlocks with dispatcher
                    if (fd->command == SAM_CLOSE_DONE)
                        break;
                    hts_tpool_wake_dispatch(fd->q);
                    pthread_mutex_unlock(&fd->command_m);
                    usleep(10000);
                    pthread_mutex_lock(&fd->command_m);
                }
            }
            pthread_mutex_unlock(&fd->command_m);

            if (fp->is_write) {
                // Dispatch the last partial block.
                sp_bams *gb = fd->curr_bam;
                if (!ret && gb && gb->nbams > 0 && fd->q)
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);

                // Flush and drain output
                if (fd->q)
                    hts_tpool_process_flush(fd->q);
                pthread_mutex_lock(&fd->command_m);
                if (!ret) ret = -fd->errcode;
                pthread_mutex_unlock(&fd->command_m);

                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
                    usleep(10000);
                    pthread_mutex_lock(&fd->command_m);
                    ret = -fd->errcode;
                    // not empty but shutdown implies error
                    // NOTE(review): this stores a positive EIO while every
                    // other assignment uses the negated errcode - confirm
                    // that callers only test for non-zero.
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
                        ret = EIO;
                    pthread_mutex_unlock(&fd->command_m);
                }
                if (fd->q)
                    hts_tpool_process_shutdown(fd->q);
            }

            // Wait for it to acknowledge
            if (fd->dispatcher_set)
                pthread_join(fd->dispatcher, NULL);
            if (!ret) ret = -fd->errcode;
        }

        // Tidy up memory
        if (fd->q)
            hts_tpool_process_destroy(fd->q);

        // The pool is only destroyed here when this state owns it and the
        // file is not compressed.
        if (fd->own_pool && fp->format.compression == no_compression) {
            hts_tpool_destroy(fd->p);
            fd->p = NULL;
        }
        pthread_mutex_destroy(&fd->lines_m);
        pthread_mutex_destroy(&fd->command_m);
        pthread_cond_destroy(&fd->command_c);

        // Free the list of spare text blocks
        sp_lines *l = fd->lines;
        while (l) {
            sp_lines *n = l->next;
            free(l->data);
            free(l);
            l = n;
        }

        // Free the list of spare BAM blocks.  curr_bam is cleared if it is
        // found on the list so that it is not freed a second time below.
        sp_bams *b = fd->bams;
        while (b) {
            if (fd->curr_bam == b)
                fd->curr_bam = NULL;
            sp_bams *n = b->next;
            sam_free_sp_bams(b);
            b = n;
        }

        if (fd->curr_bam)
            sam_free_sp_bams(fd->curr_bam);

        // Decrement counter by one, maybe destroying too.
        // This is to permit the caller using bam_hdr_destroy
        // before sam_close without triggering decode errors
        // in the background threads.
        bam_hdr_destroy(fd->h);
    }

    free(fp->state);
    fp->state = NULL;
    return ret;
}
3162 | | |
3163 | | // Cleanup function - job for sam_parse_worker; result for sam_format_worker |
3164 | 0 | static void cleanup_sp_lines(void *arg) { |
3165 | 0 | sp_lines *gl = (sp_lines *)arg; |
3166 | 0 | if (!gl) return; |
3167 | | |
3168 | | // Should always be true for lines passed to / from thread workers. |
3169 | 0 | assert(gl->next == NULL); |
3170 | | |
3171 | 0 | free(gl->data); |
3172 | 0 | sam_free_sp_bams(gl->bams); |
3173 | 0 | free(gl); |
3174 | 0 | } |
3175 | | |
3176 | | // Run from one of the worker threads. |
3177 | | // Convert a passed in array of lines to array of BAMs, returning |
3178 | | // the result back to the thread queue. |
3179 | 0 | static void *sam_parse_worker(void *arg) { |
3180 | 0 | sp_lines *gl = (sp_lines *)arg; |
3181 | 0 | sp_bams *gb = NULL; |
3182 | 0 | char *lines = gl->data; |
3183 | 0 | int i; |
3184 | 0 | bam1_t *b; |
3185 | 0 | SAM_state *fd = gl->fd; |
3186 | | |
3187 | | // Use a block of BAM structs we had earlier if available. |
3188 | 0 | pthread_mutex_lock(&fd->lines_m); |
3189 | 0 | if (fd->bams) { |
3190 | 0 | gb = fd->bams; |
3191 | 0 | fd->bams = gb->next; |
3192 | 0 | } |
3193 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3194 | |
|
3195 | 0 | if (gb == NULL) { |
3196 | 0 | gb = calloc(1, sizeof(*gb)); |
3197 | 0 | if (!gb) { |
3198 | 0 | return NULL; |
3199 | 0 | } |
3200 | 0 | gb->abams = 100; |
3201 | 0 | gb->bams = b = calloc(gb->abams, sizeof(*b)); |
3202 | 0 | if (!gb->bams) { |
3203 | 0 | sam_state_err(fd, ENOMEM); |
3204 | 0 | goto err; |
3205 | 0 | } |
3206 | 0 | gb->nbams = 0; |
3207 | 0 | gb->bam_mem = 0; |
3208 | 0 | } |
3209 | 0 | gb->serial = gl->serial; |
3210 | 0 | gb->next = NULL; |
3211 | |
|
3212 | 0 | b = (bam1_t *)gb->bams; |
3213 | 0 | if (!b) { |
3214 | 0 | sam_state_err(fd, ENOMEM); |
3215 | 0 | goto err; |
3216 | 0 | } |
3217 | | |
3218 | 0 | i = 0; |
3219 | 0 | char *cp = lines, *cp_end = lines + gl->data_size; |
3220 | 0 | while (cp < cp_end) { |
3221 | 0 | if (i >= gb->abams) { |
3222 | 0 | int old_abams = gb->abams; |
3223 | 0 | gb->abams *= 2; |
3224 | 0 | b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t)); |
3225 | 0 | if (!b) { |
3226 | 0 | gb->abams /= 2; |
3227 | 0 | sam_state_err(fd, ENOMEM); |
3228 | 0 | goto err; |
3229 | 0 | } |
3230 | 0 | memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b)); |
3231 | 0 | gb->bams = b; |
3232 | 0 | } |
3233 | | |
3234 | | // Ideally we'd get sam_parse1 to return the number of |
3235 | | // bytes decoded and to be able to stop on newline as |
3236 | | // well as \0. |
3237 | | // |
3238 | | // We can then avoid the additional strchr loop. |
3239 | | // It's around 6% of our CPU cost, albeit threadable. |
3240 | | // |
3241 | | // However this is an API change so for now we copy. |
3242 | | |
3243 | 0 | char *nl = strchr(cp, '\n'); |
3244 | 0 | char *line_end; |
3245 | 0 | if (nl) { |
3246 | 0 | line_end = nl; |
3247 | 0 | if (line_end > cp && *(line_end - 1) == '\r') |
3248 | 0 | line_end--; |
3249 | 0 | nl++; |
3250 | 0 | } else { |
3251 | 0 | nl = line_end = cp_end; |
3252 | 0 | } |
3253 | 0 | *line_end = '\0'; |
3254 | 0 | kstring_t ks = { line_end - cp, gl->alloc, cp }; |
3255 | 0 | if (sam_parse1(&ks, fd->h, &b[i]) < 0) { |
3256 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3257 | 0 | cleanup_sp_lines(gl); |
3258 | 0 | goto err; |
3259 | 0 | } |
3260 | | |
3261 | 0 | cp = nl; |
3262 | 0 | i++; |
3263 | 0 | } |
3264 | 0 | gb->nbams = i; |
3265 | |
|
3266 | 0 | pthread_mutex_lock(&fd->lines_m); |
3267 | 0 | gl->next = fd->lines; |
3268 | 0 | fd->lines = gl; |
3269 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3270 | 0 | return gb; |
3271 | | |
3272 | 0 | err: |
3273 | 0 | sam_free_sp_bams(gb); |
3274 | 0 | return NULL; |
3275 | 0 | } |
3276 | | |
// Thread pool job that does no work and yields a NULL result.
// sam_dispatcher_read() queues it once the input has reached EOF.
static void *sam_parse_eof(void *arg) {
    (void) arg;     // unused
    return NULL;
}
3280 | | |
3281 | | // Cleanup function - result for sam_parse_worker; job for sam_format_worker |
3282 | 0 | static void cleanup_sp_bams(void *arg) { |
3283 | 0 | sam_free_sp_bams((sp_bams *) arg); |
3284 | 0 | } |
3285 | | |
// Runs in its own thread.
// Reads a block of text (SAM) and sends a new job to the thread queue to
// translate this to BAM.
//
// vp is the htsFile being read.  Each block is cut back to its last
// newline; the trailing partial line is kept in "line" and copied to the
// front of the next block.  After EOF a sam_parse_eof job is queued and the
// thread waits for SAM_CLOSE.  Before returning it always sets
// fd->command to SAM_CLOSE_DONE.  Always returns NULL; errors are reported
// through sam_state_err().
static void *sam_dispatcher_read(void *vp) {
    htsFile *fp = vp;
    kstring_t line = {0};
    int line_frag = 0;      // bytes of partial line carried over in "line"
    SAM_state *fd = fp->state;
    sp_lines *l = NULL;     // block being filled; NULL once handed to a worker

    // Pre-allocate buffer for left-over bits of line (exact size doesn't
    // matter as it will grow if necessary).
    if (ks_resize(&line, 1000) < 0)
        goto err;

    for (;;) {
        // Check for command
        pthread_mutex_lock(&fd->command_m);
        switch (fd->command) {

        case SAM_CLOSE:
            pthread_cond_signal(&fd->command_c);
            pthread_mutex_unlock(&fd->command_m);
            hts_tpool_process_shutdown(fd->q);
            goto tidyup;

        default:
            break;
        }
        pthread_mutex_unlock(&fd->command_m);

        pthread_mutex_lock(&fd->lines_m);
        if (fd->lines) {
            // reuse existing line buffer
            l = fd->lines;
            fd->lines = l->next;
        }
        pthread_mutex_unlock(&fd->lines_m);

        if (l == NULL) {
            // none to reuse, to create a new one
            l = calloc(1, sizeof(*l));
            if (!l)
                goto err;
            l->alloc = SAM_NBYTES;
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
            if (!l->data) {
                free(l);
                l = NULL;
                goto err;
            }
            l->fd = fd;
        }
        l->next = NULL;

        // Make sure the buffer can hold the carried-over fragment plus at
        // least half a block of new data.
        if (l->alloc < line_frag+SAM_NBYTES/2) {
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
            if (!rp)
                goto err;
            l->alloc = line_frag+SAM_NBYTES/2;
            l->data = rp;
        }
        memcpy(l->data, line.s, line_frag);

        l->data_size = line_frag;
        ssize_t nbytes;
    longer_line:
        if (fp->is_bgzf)
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
        else
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
        if (nbytes < 0) {
            sam_state_err(fd, errno ? errno : EIO);
            goto err;
        } else if (nbytes == 0)
            // NOTE(review): any bytes already in l (a carried-over fragment)
            // are not dispatched on this path - confirm that this can only
            // happen when nothing is pending.
            break; // EOF
        l->data_size += nbytes;

        // trim to last \n. Maybe \r\n, but that's still fine
        if (nbytes == l->alloc - line_frag) {
            // The read filled the buffer, so the last line may be cut short
            char *cp_end = l->data + l->data_size;
            char *cp = cp_end-1;

            while (cp > (char *)l->data && *cp != '\n')
                cp--;

            // entire buffer is part of a single line
            if (cp == l->data) {
                // Double the buffer and keep reading after the existing data
                line_frag = l->data_size;
                char *rp = realloc(l->data, l->alloc * 2 + 8);
                if (!rp)
                    goto err;
                l->alloc *= 2;
                l->data = rp;
                assert(l->alloc >= l->data_size);
                assert(l->alloc >= line_frag);
                assert(l->alloc >= l->alloc - line_frag);
                goto longer_line;
            }
            cp++;

            // line holds the remainder of our line.
            if (ks_resize(&line, cp_end - cp) < 0)
                goto err;
            memcpy(line.s, cp, cp_end - cp);
            line_frag = cp_end - cp;
            l->data_size = l->alloc - line_frag;
        } else {
            // out of buffer
            line_frag = 0;
        }

        l->serial = fd->serial++;
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
            goto err;
        pthread_mutex_lock(&fd->command_m);
        if (fd->command == SAM_CLOSE) {
            pthread_mutex_unlock(&fd->command_m);
            l = NULL;
            goto tidyup;
        }
        l = NULL; // Now "owned" by sam_parse_worker()
        pthread_mutex_unlock(&fd->command_m);
    }

    // Queue the end-of-input marker job
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
        goto err;

    // At EOF, wait for close request.
    // (In future if we add support for seek, this is where we need to catch it.)
    for (;;) {
        pthread_mutex_lock(&fd->command_m);
        if (fd->command == SAM_NONE)
            pthread_cond_wait(&fd->command_c, &fd->command_m);
        switch (fd->command) {
        case SAM_CLOSE:
            pthread_cond_signal(&fd->command_c);
            pthread_mutex_unlock(&fd->command_m);
            hts_tpool_process_shutdown(fd->q);
            goto tidyup;

        default:
            pthread_mutex_unlock(&fd->command_m);
            break;
        }
    }

 tidyup:
    // Tell sam_state_destroy() that this thread is finishing
    pthread_mutex_lock(&fd->command_m);
    fd->command = SAM_CLOSE_DONE;
    pthread_cond_signal(&fd->command_c);
    pthread_mutex_unlock(&fd->command_m);

    // A block that was never dispatched goes back on the spare list, where
    // sam_state_destroy() frees it.
    if (l) {
        pthread_mutex_lock(&fd->lines_m);
        l->next = fd->lines;
        fd->lines = l;
        pthread_mutex_unlock(&fd->lines_m);
    }
    free(line.s);

    return NULL;

 err:
    // sam_state_err() keeps the first error, so a code recorded earlier
    // (e.g. for a failed read) is not overwritten here.
    sam_state_err(fd, errno ? errno : ENOMEM);
    hts_tpool_process_shutdown(fd->q);
    goto tidyup;
}
3456 | | |
// Runs in its own thread.
// Takes encoded blocks of SAM off the thread results queue and writes them
// to our output stream.
//
// vp is the htsFile being written; its state pointer is used as a SAM_state.
//
// Each result holds an sp_lines block of newline-terminated SAM text.  When
// an index is being built (fp->idx) the text is written one record at a time
// so that an index entry can be pushed after each record, using the matching
// bam1_t from the sp_bams array attached to the block.  Otherwise the whole
// block is written, packed into BGZF blocks on line boundaries if the output
// is BGZF.
//
// Returns NULL once the result queue has been shut down and every block was
// written, or (void *)-1 on failure, having recorded the error through
// sam_state_err().
//
// NOTE(review): on the error paths the current result (r) and its sp_lines
// are neither deleted nor put back on the free-list here; confirm that the
// tear-down code reclaims them.
static void *sam_dispatcher_write(void *vp) {
    htsFile *fp = vp;
    SAM_state *fd = fp->state;
    hts_tpool_result *r;

    // Iterates until result queue is shutdown, where it returns NULL.
    while ((r = hts_tpool_next_result_wait(fd->q))) {
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
        if (!gl) {
            sam_state_err(fd, ENOMEM);
            goto err;
        }

        if (fp->idx) {
            // Indexing: write record by record.  The n-th line of text is
            // paired with gb->bams[n] (count advances once per line).
            sp_bams *gb = gl->bams;
            int i = 0, count = 0;
            while (i < gl->data_size) {
                // [j, i) becomes one line, including its newline if present.
                int j = i;
                while (i < gl->data_size && gl->data[i] != '\n')
                    i++;
                if (i < gl->data_size)
                    i++;

                if (fp->is_bgzf) {
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
                        goto err;
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
                        goto err;
                } else {
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
                        goto err;
                }

                bam1_t *b = &gb->bams[count++];
                if (fp->format.compression == bgzf) {
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
                                      b->core.tid, b->core.pos, bam_endpos(b),
                                      bgzf_tell(fp->fp.bgzf),
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                } else {
                    // NOTE(review): this branch is taken when compression is
                    // not bgzf yet still calls bgzf_tell() on fp->fp.bgzf;
                    // confirm this is the intended offset source.
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
                        sam_state_err(fd, errno ? errno : ENOMEM);
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
                        goto err;
                    }
                }
            }

            // Every line written must have consumed exactly one bam record.
            assert(count == gb->nbams);

            // Add bam array to free-list
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gl->bams;
            gl->bams = NULL;
            pthread_mutex_unlock(&fd->lines_m);
        } else {
            if (fp->is_bgzf) {
                // We keep track of how much in the current block we have
                // remaining => R.  We look for the last newline in input
                // [i] to [i+R], backwards => position N.
                //
                // If we find a newline, we write out bytes i to N.
                // We know we cannot fit the next record in this bgzf block,
                // so we flush what we have and copy input N to i+R into
                // the start of a new block, and recompute a new R for that.
                //
                // If we don't find a newline (i==N) then we cannot extend
                // the current block at all, so flush whatever is in it now
                // if it ends on a newline.
                // We still copy i(==N) to i+R to the next block and
                // continue as before with a new R.
                //
                // The only exception on the flush is when we run out of
                // data in the input.  In that case we skip it as we don't
                // yet know if the next record will fit.
                //
                // Both conditions share the same code here:
                // - Look for newline (pos N)
                // - Write i to N (which maybe 0)
                // - Flush if block ends on newline and not end of input
                // - write N to i+R

                int i = 0;
                BGZF *fb = fp->fp.bgzf;
                while (i < gl->data_size) {
                    // remaining space in block
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
                    int eod = 0;
                    if (R > gl->data_size-i)
                        R = gl->data_size-i, eod = 1;

                    // Find last newline in input data
                    // (scans backwards over (i, i+R); data[i] itself is
                    // not examined)
                    int N = i + R;
                    while (--N > i) {
                        if (gl->data[N] == '\n')
                            break;
                    }

                    if (N != i) {
                        // Found a newline
                        N++;
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
                            goto err;
                    }

                    // Flush bgzf block
                    int b_off = fb->block_offset;
                    if (!eod && b_off &&
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
                            goto err;

                    // Copy from N onwards into next block
                    if (i+R > N)
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
                            != i+R - N)
                            goto err;

                    i = i+R;
                }
            } else {
                // Uncompressed, unindexed: the block goes out in one write.
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
                    goto err;
            }
        }

        hts_tpool_delete_result(r, 0);

        // Return the text block to the free-list for reuse.
        // Also updated by main thread
        pthread_mutex_lock(&fd->lines_m);
        gl->next = fd->lines;
        fd->lines = gl;
        pthread_mutex_unlock(&fd->lines_m);
    }

    sam_state_err(fd, 0); // success
    hts_tpool_process_shutdown(fd->q);
    return NULL;

 err:
    sam_state_err(fd, errno ? errno : EIO);
    return (void *)-1;
}
3611 | | |
3612 | | // Run from one of the worker threads. |
3613 | | // Convert a passed in array of BAMs (sp_bams) and converts to a block |
3614 | | // of text SAM records (sp_lines). |
3615 | 0 | static void *sam_format_worker(void *arg) { |
3616 | 0 | sp_bams *gb = (sp_bams *)arg; |
3617 | 0 | sp_lines *gl = NULL; |
3618 | 0 | int i; |
3619 | 0 | SAM_state *fd = gb->fd; |
3620 | 0 | htsFile *fp = fd->fp; |
3621 | | |
3622 | | // Use a block of SAM strings we had earlier if available. |
3623 | 0 | pthread_mutex_lock(&fd->lines_m); |
3624 | 0 | if (fd->lines) { |
3625 | 0 | gl = fd->lines; |
3626 | 0 | fd->lines = gl->next; |
3627 | 0 | } |
3628 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3629 | |
|
3630 | 0 | if (gl == NULL) { |
3631 | 0 | gl = calloc(1, sizeof(*gl)); |
3632 | 0 | if (!gl) { |
3633 | 0 | sam_state_err(fd, ENOMEM); |
3634 | 0 | return NULL; |
3635 | 0 | } |
3636 | 0 | gl->alloc = gl->data_size = 0; |
3637 | 0 | gl->data = NULL; |
3638 | 0 | } |
3639 | 0 | gl->serial = gb->serial; |
3640 | 0 | gl->next = NULL; |
3641 | |
|
3642 | 0 | kstring_t ks = {0, gl->alloc, gl->data}; |
3643 | |
|
3644 | 0 | for (i = 0; i < gb->nbams; i++) { |
3645 | 0 | if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) { |
3646 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3647 | 0 | goto err; |
3648 | 0 | } |
3649 | 0 | kputc('\n', &ks); |
3650 | 0 | } |
3651 | | |
3652 | 0 | pthread_mutex_lock(&fd->lines_m); |
3653 | 0 | gl->data_size = ks.l; |
3654 | 0 | gl->alloc = ks.m; |
3655 | 0 | gl->data = ks.s; |
3656 | |
|
3657 | 0 | if (fp->idx) { |
3658 | | // Keep hold of the bam array a little longer as |
3659 | | // sam_dispatcher_write needs to use them for building the index. |
3660 | 0 | gl->bams = gb; |
3661 | 0 | } else { |
3662 | | // Add bam array to free-list |
3663 | 0 | gb->next = fd->bams; |
3664 | 0 | fd->bams = gb; |
3665 | 0 | } |
3666 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3667 | |
|
3668 | 0 | return gl; |
3669 | | |
3670 | 0 | err: |
3671 | | // Possible race between this and fd->curr_bam. |
3672 | | // Easier to not free and leave it on the input list so it |
3673 | | // gets freed there instead? |
3674 | | // sam_free_sp_bams(gb); |
3675 | 0 | if (gl) { |
3676 | 0 | free(gl->data); |
3677 | 0 | free(gl); |
3678 | 0 | } |
3679 | 0 | return NULL; |
3680 | 0 | } |
3681 | | |
3682 | 0 | int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { |
3683 | 0 | if (fp->state) |
3684 | 0 | return 0; |
3685 | | |
3686 | 0 | if (!(fp->state = sam_state_create(fp))) |
3687 | 0 | return -1; |
3688 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3689 | |
|
3690 | 0 | pthread_mutex_init(&fd->lines_m, NULL); |
3691 | 0 | pthread_mutex_init(&fd->command_m, NULL); |
3692 | 0 | pthread_cond_init(&fd->command_c, NULL); |
3693 | 0 | fd->p = p->pool; |
3694 | 0 | int qsize = p->qsize; |
3695 | 0 | if (!qsize) |
3696 | 0 | qsize = 2*hts_tpool_size(fd->p); |
3697 | 0 | fd->q = hts_tpool_process_init(fd->p, qsize, 0); |
3698 | 0 | if (!fd->q) { |
3699 | 0 | sam_state_destroy(fp); |
3700 | 0 | return -1; |
3701 | 0 | } |
3702 | | |
3703 | 0 | if (fp->format.compression == bgzf) |
3704 | 0 | return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); |
3705 | | |
3706 | 0 | return 0; |
3707 | 0 | } |
3708 | | |
3709 | 0 | int sam_set_threads(htsFile *fp, int nthreads) { |
3710 | 0 | if (nthreads <= 0) |
3711 | 0 | return 0; |
3712 | | |
3713 | 0 | htsThreadPool p; |
3714 | 0 | p.pool = hts_tpool_init(nthreads); |
3715 | 0 | p.qsize = nthreads*2; |
3716 | |
|
3717 | 0 | int ret = sam_set_thread_pool(fp, &p); |
3718 | 0 | if (ret < 0) |
3719 | 0 | return ret; |
3720 | | |
3721 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3722 | 0 | fd->own_pool = 1; |
3723 | |
|
3724 | 0 | return 0; |
3725 | 0 | } |
3726 | | |
// Per-file state used when reading or writing FASTA / FASTQ.
typedef struct {
    kstring_t name;      // name line of the current record, including the leading prefix character
    kstring_t comment;   // NB: pointer into name, do not free
    kstring_t seq;       // sequence of the current record, joined across lines
    kstring_t qual;      // quality of the current record; the parser stores it with '!' subtracted
    int casava;          // non-zero: parse / emit Illumina CASAVA comment fields (FASTQ_OPT_CASAVA)
    int aux;             // non-zero: parse / emit SAM style aux tags in the comment (FASTQ_OPT_AUX)
    int rnum;            // non-zero: add a /1 or /2 suffix to names on output (FASTQ_OPT_RNUM)
    char BC[3];          // aux tag ID for barcode
    khash_t(tag) *tags;  // which aux tags to use (if empty, use all).
    char nprefix;        // first character of a name line: '@' (fastq) or '>' (fasta)
    int sra_names;       // non-zero: undo SRA style "run_name.number name" ordering (FASTQ_OPT_NAME2)
} fastq_state;
3740 | | |
3741 | | // Initialise fastq state. |
3742 | | // Name char of '@' or '>' distinguishes fastq vs fasta variant |
3743 | 121 | static fastq_state *fastq_state_init(int name_char) { |
3744 | 121 | fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); |
3745 | 121 | if (!x) |
3746 | 0 | return NULL; |
3747 | 121 | strcpy(x->BC, "BC"); |
3748 | 121 | x->nprefix = name_char; |
3749 | | |
3750 | 121 | return x; |
3751 | 121 | } |
3752 | | |
3753 | 121 | void fastq_state_destroy(htsFile *fp) { |
3754 | 121 | if (fp->state) { |
3755 | 121 | fastq_state *x = (fastq_state *)fp->state; |
3756 | 121 | if (x->tags) |
3757 | 121 | kh_destroy(tag, x->tags); |
3758 | 121 | ks_free(&x->name); |
3759 | 121 | ks_free(&x->seq); |
3760 | 121 | ks_free(&x->qual); |
3761 | 121 | free(fp->state); |
3762 | 121 | } |
3763 | 121 | } |
3764 | | |
3765 | 0 | int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { |
3766 | 0 | va_list args; |
3767 | |
|
3768 | 0 | if (!fp) |
3769 | 0 | return -1; |
3770 | 0 | if (!fp->state) |
3771 | 0 | if (!(fp->state = fastq_state_init(fp->format.format == fastq_format |
3772 | 0 | ? '@' : '>'))) |
3773 | 0 | return -1; |
3774 | | |
3775 | 0 | fastq_state *x = (fastq_state *)fp->state; |
3776 | |
|
3777 | 0 | switch (opt) { |
3778 | 0 | case FASTQ_OPT_CASAVA: |
3779 | 0 | x->casava = 1; |
3780 | 0 | break; |
3781 | | |
3782 | 0 | case FASTQ_OPT_NAME2: |
3783 | 0 | x->sra_names = 1; |
3784 | 0 | break; |
3785 | | |
3786 | 0 | case FASTQ_OPT_AUX: { |
3787 | 0 | va_start(args, opt); |
3788 | 0 | x->aux = 1; |
3789 | 0 | char *tag = va_arg(args, char *); |
3790 | 0 | va_end(args); |
3791 | 0 | if (tag && strcmp(tag, "1") != 0) { |
3792 | 0 | if (!x->tags) |
3793 | 0 | if (!(x->tags = kh_init(tag))) |
3794 | 0 | return -1; |
3795 | | |
3796 | 0 | size_t i, tlen = strlen(tag); |
3797 | 0 | for (i = 0; i+3 <= tlen+1; i += 3) { |
3798 | 0 | if (tag[i+0] == ',' || tag[i+1] == ',' || |
3799 | 0 | !(tag[i+2] == ',' || tag[i+2] == '\0')) { |
3800 | 0 | hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); |
3801 | 0 | break; |
3802 | 0 | } |
3803 | 0 | int ret, tcode = tag[i+0]*256 + tag[i+1]; |
3804 | 0 | kh_put(tag, x->tags, tcode, &ret); |
3805 | 0 | if (ret < 0) |
3806 | 0 | return -1; |
3807 | 0 | } |
3808 | 0 | } |
3809 | 0 | break; |
3810 | 0 | } |
3811 | | |
3812 | 0 | case FASTQ_OPT_BARCODE: { |
3813 | 0 | va_start(args, opt); |
3814 | 0 | char *bc = va_arg(args, char *); |
3815 | 0 | va_end(args); |
3816 | 0 | strncpy(x->BC, bc, 2); |
3817 | 0 | x->BC[2] = 0; |
3818 | 0 | break; |
3819 | 0 | } |
3820 | | |
3821 | 0 | case FASTQ_OPT_RNUM: |
3822 | 0 | x->rnum = 1; |
3823 | 0 | break; |
3824 | | |
3825 | 0 | default: |
3826 | 0 | break; |
3827 | 0 | } |
3828 | 0 | return 0; |
3829 | 0 | } |
3830 | | |
3831 | 11.9k | static int fastq_parse1(htsFile *fp, bam1_t *b) { |
3832 | 11.9k | fastq_state *x = (fastq_state *)fp->state; |
3833 | 11.9k | size_t i, l; |
3834 | 11.9k | int ret = 0; |
3835 | | |
3836 | 11.9k | if (fp->format.format == fasta_format && fp->line.s) { |
3837 | | // For FASTA we've already read the >name line; steal it |
3838 | | // Not the most efficient, but we don't optimise for fasta reading. |
3839 | 11.5k | if (fp->line.l == 0) |
3840 | 37 | return -1; // EOF |
3841 | | |
3842 | 11.5k | free(x->name.s); |
3843 | 11.5k | x->name = fp->line; |
3844 | 11.5k | fp->line.l = fp->line.m = 0; |
3845 | 11.5k | fp->line.s = NULL; |
3846 | 11.5k | } else { |
3847 | | // Read a FASTQ format entry. |
3848 | 320 | ret = hts_getline(fp, KS_SEP_LINE, &x->name); |
3849 | 320 | if (ret == -1) |
3850 | 0 | return -1; // EOF |
3851 | 320 | else if (ret < -1) |
3852 | 18 | return ret; // ERR |
3853 | 320 | } |
3854 | | |
3855 | | // Name |
3856 | 11.8k | if (*x->name.s != x->nprefix) |
3857 | 2 | return -2; |
3858 | | |
3859 | | // Reverse the SRA strangeness of putting the run_name.number before |
3860 | | // the read name. |
3861 | 11.8k | i = 0; |
3862 | 11.8k | char *name = x->name.s+1; |
3863 | 11.8k | if (x->sra_names) { |
3864 | 0 | char *cp = strpbrk(x->name.s, " \t"); |
3865 | 0 | if (cp) { |
3866 | 0 | while (*cp == ' ' || *cp == '\t') |
3867 | 0 | cp++; |
3868 | 0 | *--cp = '@'; |
3869 | 0 | i = cp - x->name.s; |
3870 | 0 | name = cp+1; |
3871 | 0 | } |
3872 | 0 | } |
3873 | | |
3874 | 11.8k | l = x->name.l; |
3875 | 11.8k | char *s = x->name.s; |
3876 | 16.6M | while (i < l && !isspace_c(s[i])) |
3877 | 16.6M | i++; |
3878 | 11.8k | if (i < l) { |
3879 | 372 | s[i] = 0; |
3880 | 372 | x->name.l = i++; |
3881 | 372 | } |
3882 | | |
3883 | | // Comment; a kstring struct, but pointer into name line. (Do not free) |
3884 | 21.3k | while (i < l && isspace_c(s[i])) |
3885 | 9.48k | i++; |
3886 | 11.8k | x->comment.s = s+i; |
3887 | 11.8k | x->comment.l = l - i; |
3888 | | |
3889 | | // Seq |
3890 | 11.8k | x->seq.l = 0; |
3891 | 1.27M | for (;;) { |
3892 | 1.27M | if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) |
3893 | 81 | if (fp->format.format == fastq_format || ret < -1) |
3894 | 38 | return -2; |
3895 | 1.27M | if (ret == -1 || |
3896 | 1.27M | *fp->line.s == (fp->format.format == fastq_format ? '+' : '>')) |
3897 | 11.8k | break; |
3898 | 1.26M | if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) |
3899 | 0 | return -2; |
3900 | 1.26M | } |
3901 | | |
3902 | | // Qual |
3903 | 11.8k | if (fp->format.format == fastq_format) { |
3904 | 217 | size_t remainder = x->seq.l; |
3905 | 217 | x->qual.l = 0; |
3906 | 3.37k | do { |
3907 | 3.37k | if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) |
3908 | 6 | return -2; |
3909 | 3.36k | if (fp->line.l > remainder) |
3910 | 12 | return -2; |
3911 | 3.35k | if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0) |
3912 | 0 | return -2; |
3913 | 3.35k | remainder -= fp->line.l; |
3914 | 3.35k | } while (remainder > 0); |
3915 | | |
3916 | | // Decr qual |
3917 | 392 | for (i = 0; i < x->qual.l; i++) |
3918 | 193 | x->qual.s[i] -= '!'; |
3919 | 199 | } |
3920 | | |
3921 | 11.7k | int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; |
3922 | 11.7k | if (x->name.l > 2 && |
3923 | 11.7k | x->name.s[x->name.l-2] == '/' && |
3924 | 11.7k | isdigit_c(x->name.s[x->name.l-1])) { |
3925 | 129 | switch(x->name.s[x->name.l-1]) { |
3926 | 0 | case '1': flag |= BAM_FREAD1 | pflag; break; |
3927 | 0 | case '2': flag |= BAM_FREAD2 | pflag; break; |
3928 | 129 | default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; |
3929 | 129 | } |
3930 | 129 | x->name.s[x->name.l-=2] = 0; |
3931 | 129 | } |
3932 | | |
3933 | | // Convert to BAM |
3934 | 11.7k | ret = bam_set1(b, |
3935 | 11.7k | x->name.s + x->name.l - name, name, |
3936 | 11.7k | flag, |
3937 | 11.7k | -1, -1, 0, // ref '*', pos, mapq, |
3938 | 11.7k | 0, NULL, // no cigar, |
3939 | 11.7k | -1, -1, 0, // mate |
3940 | 11.7k | x->seq.l, x->seq.s, x->qual.s, |
3941 | 11.7k | 0); |
3942 | | |
3943 | | // Identify Illumina CASAVA strings. |
3944 | | // <read>:<is_filtered>:<control_bits>:<barcode_sequence> |
3945 | 11.7k | char *barcode = NULL; |
3946 | 11.7k | int barcode_len = 0; |
3947 | 11.7k | kstring_t *kc = &x->comment; |
3948 | 11.7k | char *endptr; |
3949 | 11.7k | if (x->casava && |
3950 | | // \d:[YN]:\d+:[ACGTN]+ |
3951 | 11.7k | kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) && |
3952 | 11.7k | strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4 |
3953 | 11.7k | && *endptr == ':') { |
3954 | | |
3955 | | // read num |
3956 | 0 | switch(kc->s[0]) { |
3957 | 0 | case '1': b->core.flag |= BAM_FREAD1 | pflag; break; |
3958 | 0 | case '2': b->core.flag |= BAM_FREAD2 | pflag; break; |
3959 | 0 | default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; |
3960 | 0 | } |
3961 | | |
3962 | 0 | if (kc->s[2] == 'Y') |
3963 | 0 | b->core.flag |= BAM_FQCFAIL; |
3964 | | |
3965 | | // Barcode, maybe numeric in which case we skip it |
3966 | 0 | if (!isdigit_c(endptr[1])) { |
3967 | 0 | barcode = endptr+1; |
3968 | 0 | for (i = barcode - kc->s; i < kc->l; i++) |
3969 | 0 | if (isspace_c(kc->s[i])) |
3970 | 0 | break; |
3971 | |
|
3972 | 0 | kc->s[i] = 0; |
3973 | 0 | barcode_len = i+1-(barcode - kc->s); |
3974 | 0 | } |
3975 | 0 | } |
3976 | | |
3977 | 11.7k | if (ret >= 0 && barcode_len) |
3978 | 0 | if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0) |
3979 | 0 | ret = -2; |
3980 | | |
3981 | 11.7k | if (!x->aux) |
3982 | 11.7k | return ret; |
3983 | | |
3984 | | // Identify any SAM style aux tags in comments too. |
3985 | 0 | if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0) |
3986 | 0 | ret = -2; |
3987 | |
|
3988 | 0 | return ret; |
3989 | 11.7k | } |
3990 | | |
3991 | | // Internal component of sam_read1 below |
3992 | 357 | static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
3993 | 357 | int ret = bam_read1(fp->fp.bgzf, b); |
3994 | 357 | if (h && ret >= 0) { |
3995 | 343 | if (b->core.tid >= h->n_targets || b->core.tid < -1 || |
3996 | 343 | b->core.mtid >= h->n_targets || b->core.mtid < -1) { |
3997 | 3 | errno = ERANGE; |
3998 | 3 | return -3; |
3999 | 3 | } |
4000 | 343 | } |
4001 | 354 | return ret; |
4002 | 357 | } |
4003 | | |
4004 | | // Internal component of sam_read1 below |
4005 | 245 | static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { |
4006 | 245 | int ret = cram_get_bam_seq(fp->fp.cram, b); |
4007 | 245 | if (ret < 0) |
4008 | 245 | return cram_eof(fp->fp.cram) ? -1 : -2; |
4009 | | |
4010 | 0 | if (bam_tag2cigar(*b, 1, 1) < 0) |
4011 | 0 | return -2; |
4012 | | |
4013 | 0 | return ret; |
4014 | 0 | } |
4015 | | |
// Internal component of sam_read1 below
// Reads one SAM record into b.
//
// Without threaded state (fp->state is NULL) this reads the next text line
// and parses it with sam_parse1().  With threaded state the record is
// copied out of the current block of already parsed records, fetching the
// next block from the result queue when the current one is used up.  The
// reading dispatcher thread is started on the first call, once a header is
// available.
//
// Returns the sam_parse1() / hts_getline() result on the unthreaded path.
// The threaded path returns 0 on success, -1 when no further block is
// available and no error was recorded, and -2 on error.
// NOTE(review): -1 is also returned for a failed bgzf_seek() and for a
// changed header, where callers will read it as EOF; confirm that is
// intended.
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
    int ret;

    // Consume 1st line after header parsing as it wasn't using peek
    if (fp->line.l != 0) {
        ret = sam_parse1(&fp->line, h, b);
        fp->line.l = 0;
        return ret;
    }

    if (fp->state) {
        SAM_state *fd = (SAM_state *)fp->state;

        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
            // We don't support multi-threaded SAM parsing with seeks yet.
            // Tear the threaded state down, reposition, and carry on with
            // the unthreaded reader below.
            int ret;
            if ((ret = sam_state_destroy(fp)) < 0) {
                errno = -ret;
                return -2;
            }
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
                return -1;
            fp->fp.bgzf->seeked = 0;
            goto err_recover;
        }

        if (!fd->h) {
            // First threaded read: remember the header, taking a reference.
            fd->h = h;
            fd->h->ref_count++;
            // Ensure hrecs is initialised now as we don't want multiple
            // threads trying to do this simultaneously.
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
                return -2;

            // We can only do this once we've got a header
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
                               fp) != 0)
                return -2;
            fd->dispatcher_set = 1;
        }

        if (fd->h != h) {
            hts_log_error("SAM multi-threaded decoding does not support changing header");
            return -1;
        }

        sp_bams *gb = fd->curr_bam;
        if (!gb) {
            if (fd->errcode) {
                // In case reader failed
                errno = fd->errcode;
                return -2;
            }
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
            if (!r)
                return -2;
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
            hts_tpool_delete_result(r, 0);
        }
        // A result that carries no data marks the end of input, unless an
        // error code has been recorded.
        if (!gb)
            return fd->errcode ? -2 : -1;
        bam1_t *b_array = (bam1_t *)gb->bams;
        if (fd->curr_idx < gb->nbams)
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
                return -2;
        if (fd->curr_idx == gb->nbams) {
            // Block used up: return it to the free-list.
            pthread_mutex_lock(&fd->lines_m);
            gb->next = fd->bams;
            fd->bams = gb;
            pthread_mutex_unlock(&fd->lines_m);

            fd->curr_bam = NULL;
            fd->curr_idx = 0;
        }

        ret = 0;

    } else  {
    err_recover:
        // Also the entry point after a seek, and the retry point when
        // parse errors are being ignored.
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
        if (ret < 0) return ret;

        ret = sam_parse1(&fp->line, h, b);
        fp->line.l = 0;
        if (ret < 0) {
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
            if (h && h->ignore_sam_err) goto err_recover;
        }
    }

    return ret;
}
4109 | | |
4110 | | // Returns 0 on success, |
4111 | | // -1 on EOF, |
4112 | | // <-1 on error |
4113 | | int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) |
4114 | 18.6k | { |
4115 | 18.6k | int ret, pass_filter; |
4116 | | |
4117 | 18.6k | do { |
4118 | 18.6k | switch (fp->format.format) { |
4119 | 357 | case bam: |
4120 | 357 | ret = sam_read1_bam(fp, h, b); |
4121 | 357 | break; |
4122 | | |
4123 | 245 | case cram: |
4124 | 245 | ret = sam_read1_cram(fp, h, &b); |
4125 | 245 | break; |
4126 | | |
4127 | 6.12k | case sam: |
4128 | 6.12k | ret = sam_read1_sam(fp, h, b); |
4129 | 6.12k | break; |
4130 | | |
4131 | 11.6k | case fasta_format: |
4132 | 11.9k | case fastq_format: { |
4133 | 11.9k | fastq_state *x = (fastq_state *)fp->state; |
4134 | 11.9k | if (!x) { |
4135 | 121 | if (!(fp->state = fastq_state_init(fp->format.format |
4136 | 121 | == fastq_format ? '@' : '>'))) |
4137 | 0 | return -2; |
4138 | 121 | } |
4139 | | |
4140 | 11.9k | return fastq_parse1(fp, b); |
4141 | 11.9k | } |
4142 | | |
4143 | 0 | case empty_format: |
4144 | 0 | errno = EPIPE; |
4145 | 0 | return -3; |
4146 | | |
4147 | 0 | default: |
4148 | 0 | errno = EFTYPE; |
4149 | 0 | return -3; |
4150 | 18.6k | } |
4151 | | |
4152 | 6.72k | pass_filter = (ret >= 0 && fp->filter) |
4153 | 6.72k | ? sam_passes_filter(h, b, fp->filter) |
4154 | 6.72k | : 1; |
4155 | 6.72k | } while (pass_filter == 0); |
4156 | | |
4157 | 6.72k | return pass_filter < 0 ? -2 : ret; |
4158 | 18.6k | } |
4159 | | |
4160 | | |
// Append the SAM text for record b to str, without a trailing newline.
// Reference names are looked up in h->target_name.
//
// The results of the individual kput* calls are OR-ed into r and only
// checked once at the end, so a failed append is detected there.
//
// Returns the new length of str on success.
// Returns -1 on failure: with errno set to EINVAL for corrupted aux data or
// ENOMEM when out of memory, and without touching errno when the record has
// a zero length query name.
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    int i, r = 0;
    uint8_t *s, *end;
    const bam1_core_t *c = &b->core;

    if (c->l_qname == 0)
        return -1;
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
    r |= kputc_('\t', str); // query name
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
    if (c->tid >= 0) { // chr
        r |= kputs(h->target_name[c->tid] , str);
        r |= kputc_('\t', str);
    } else r |= kputsn_("*\t", 2, str);
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
    if (c->n_cigar) { // cigar
        uint32_t *cigar = bam_get_cigar(b);
        for (i = 0; i < c->n_cigar; ++i) {
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
        }
    } else r |= kputc_('*', str);
    r |= kputc_('\t', str);
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
    else {
        r |= kputs(h->target_name[c->mtid], str);
        r |= kputc_('\t', str);
    }
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
    if (c->l_qseq) { // seq and qual
        uint8_t *s = bam_get_seq(b);
        // Reserve room for sequence, tab, quality and NUL in one go; the
        // text is then written directly through cp.
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
        char *cp = str->s + str->l;

        // Sequence, 2 bases at a time
        nibble2base(s, cp, c->l_qseq);
        cp[c->l_qseq] = '\t';
        cp += c->l_qseq+1;

        // Quality
        s = bam_get_qual(b);
        i = 0;
        if (s[0] == 0xff) {
            // A first quality byte of 0xff is written as '*'.
            cp[i++] = '*';
        } else {
            // local copy of c->l_qseq to aid unrolling
            uint32_t lqseq = c->l_qseq;
            for (i = 0; i < lqseq; ++i)
                cp[i]=s[i]+33;
        }
        cp[i] = 0;
        cp += i;
        str->l = cp - str->s;
    } else r |= kputsn_("*\t*", 3, str);

    s = bam_get_aux(b); // aux
    end = b->data + b->l_data;

    // Each iteration formats one aux field and advances s past it.
    while (end - s >= 4) {
        r |= kputc_('\t', str);
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
            goto bad_aux;
    }
    r |= kputsn("", 0, str); // nul terminate
    if (r < 0) goto mem_err;

    return str->l;

 bad_aux:
    hts_log_error("Corrupted aux data for read %.*s",
                  b->core.l_qname, bam_get_qname(b));
    errno = EINVAL;
    return -1;

 mem_err:
    hts_log_error("Out of memory");
    errno = ENOMEM;
    return -1;
}
4244 | | |
4245 | | int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4246 | 17.7k | { |
4247 | 17.7k | str->l = 0; |
4248 | 17.7k | return sam_format1_append(h, b, str); |
4249 | 17.7k | } |
4250 | | |
4251 | | static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); |
4252 | | int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) |
4253 | 0 | { |
4254 | 0 | unsigned flag = b->core.flag; |
4255 | 0 | int i, e = 0, len = b->core.l_qseq; |
4256 | 0 | uint8_t *seq, *qual; |
4257 | |
|
4258 | 0 | str->l = 0; |
4259 | |
|
4260 | 0 | if (len == 0) return 0; |
4261 | | |
4262 | | // Name |
4263 | 0 | if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) |
4264 | 0 | return -1; |
4265 | | |
4266 | | // /1 or /2 suffix |
4267 | 0 | if (x && x->rnum && (flag & BAM_FPAIRED)) { |
4268 | 0 | int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); |
4269 | 0 | if (r12 == BAM_FREAD1) { |
4270 | 0 | if (kputs("/1", str) == EOF) |
4271 | 0 | return -1; |
4272 | 0 | } else if (r12 == BAM_FREAD2) { |
4273 | 0 | if (kputs("/2", str) == EOF) |
4274 | 0 | return -1; |
4275 | 0 | } |
4276 | 0 | } |
4277 | | |
4278 | | // Illumina CASAVA tag. |
4279 | | // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero> |
4280 | 0 | if (x && x->casava) { |
4281 | 0 | int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; |
4282 | 0 | char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; |
4283 | 0 | uint8_t *bc = bam_aux_get(b, x->BC); |
4284 | 0 | if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, |
4285 | 0 | bc ? (char *)bc+1 : "0") < 0) |
4286 | 0 | return -1; |
4287 | | |
4288 | 0 | if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { |
4289 | 0 | hts_log_warning("BC tag starts with non-sequence base; using '0'"); |
4290 | 0 | str->l -= strlen((char *)bc)-2; // limit to 1 char |
4291 | 0 | str->s[str->l-1] = '0'; |
4292 | 0 | str->s[str->l] = 0; |
4293 | 0 | bc = NULL; |
4294 | 0 | } |
4295 | | |
4296 | | // Replace any non-alpha with '+'. Ie seq-seq to seq+seq |
4297 | 0 | if (bc) { |
4298 | 0 | int l = strlen((char *)bc+1); |
4299 | 0 | char *c = (char *)str->s + str->l - l; |
4300 | 0 | for (i = 0; i < l; i++) { |
4301 | 0 | if (!isalpha_c(c[i])) |
4302 | 0 | c[i] = '+'; |
4303 | 0 | else if (islower_c(c[i])) |
4304 | 0 | c[i] = toupper_c(c[i]); |
4305 | 0 | } |
4306 | 0 | } |
4307 | 0 | } |
4308 | | |
4309 | | // Aux tags |
4310 | 0 | if (x && x->aux) { |
4311 | 0 | uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; |
4312 | 0 | while (s && end - s >= 4) { |
4313 | 0 | int tt = s[0]*256 + s[1]; |
4314 | 0 | if (x->tags == NULL || |
4315 | 0 | kh_get(tag, x->tags, tt) != kh_end(x->tags)) { |
4316 | 0 | e |= kputc_('\t', str) < 0; |
4317 | 0 | if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) |
4318 | 0 | return -1; |
4319 | 0 | } else { |
4320 | 0 | s = skip_aux(s+2, end); |
4321 | 0 | } |
4322 | 0 | } |
4323 | 0 | e |= kputsn("", 0, str) < 0; // nul terminate |
4324 | 0 | } |
4325 | | |
4326 | 0 | if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; |
4327 | 0 | e |= kputc_('\n', str) < 0; |
4328 | | |
4329 | | // Seq line |
4330 | 0 | seq = bam_get_seq(b); |
4331 | 0 | if (flag & BAM_FREVERSE) |
4332 | 0 | for (i = len-1; i >= 0; i--) |
4333 | 0 | e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; |
4334 | 0 | else |
4335 | 0 | for (i = 0; i < len; i++) |
4336 | 0 | e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; |
4337 | | |
4338 | | |
4339 | | // Qual line |
4340 | 0 | if (x->nprefix == '@') { |
4341 | 0 | kputsn("\n+\n", 3, str); |
4342 | 0 | qual = bam_get_qual(b); |
4343 | 0 | if (qual[0] == 0xff) |
4344 | 0 | for (i = 0; i < len; i++) |
4345 | 0 | e |= kputc_('B', str) < 0; |
4346 | 0 | else if (flag & BAM_FREVERSE) |
4347 | 0 | for (i = len-1; i >= 0; i--) |
4348 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4349 | 0 | else |
4350 | 0 | for (i = 0; i < len; i++) |
4351 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4352 | |
|
4353 | 0 | } |
4354 | 0 | e |= kputc('\n', str) < 0; |
4355 | |
|
4356 | 0 | return e ? -1 : str->l; |
4357 | 0 | } |
4358 | | |
4359 | | // Sadly we need to be able to modify the bam_hdr here so we can |
4360 | | // reference count the structure. |
4361 | | int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) |
4362 | 17.7k | { |
4363 | 17.7k | switch (fp->format.format) { |
4364 | 0 | case binary_format: |
4365 | 0 | fp->format.category = sequence_data; |
4366 | 0 | fp->format.format = bam; |
4367 | | /* fall-through */ |
4368 | 0 | case bam: |
4369 | 0 | return bam_write_idx1(fp, h, b); |
4370 | | |
4371 | 0 | case cram: |
4372 | 0 | return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); |
4373 | | |
4374 | 0 | case text_format: |
4375 | 0 | fp->format.category = sequence_data; |
4376 | 0 | fp->format.format = sam; |
4377 | | /* fall-through */ |
4378 | 17.7k | case sam: |
4379 | 17.7k | if (fp->state) { |
4380 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4381 | | |
4382 | | // Threaded output |
4383 | 0 | if (!fd->h) { |
4384 | | // NB: discard const. We don't actually modify sam_hdr_t here, |
4385 | | // just data pointed to by it (which is a bit weasely still), |
4386 | | // but out cached pointer must be non-const as we want to |
4387 | | // destroy it later on and sam_hdr_destroy takes non-const. |
4388 | | // |
4389 | | // We do this because some tools do sam_hdr_destroy; sam_close |
4390 | | // while others do sam_close; sam_hdr_destroy. The former is |
4391 | | // an issue as we need the header still when flushing. |
4392 | 0 | fd->h = (sam_hdr_t *)h; |
4393 | 0 | fd->h->ref_count++; |
4394 | |
|
4395 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, |
4396 | 0 | fp) != 0) |
4397 | 0 | return -2; |
4398 | 0 | fd->dispatcher_set = 1; |
4399 | 0 | } |
4400 | | |
4401 | 0 | if (fd->h != h) { |
4402 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4403 | 0 | return -2; |
4404 | 0 | } |
4405 | | |
4406 | | // Find a suitable BAM array to copy to |
4407 | 0 | sp_bams *gb = fd->curr_bam; |
4408 | 0 | if (!gb) { |
4409 | 0 | pthread_mutex_lock(&fd->lines_m); |
4410 | 0 | if (fd->bams) { |
4411 | 0 | fd->curr_bam = gb = fd->bams; |
4412 | 0 | fd->bams = gb->next; |
4413 | 0 | gb->next = NULL; |
4414 | 0 | gb->nbams = 0; |
4415 | 0 | gb->bam_mem = 0; |
4416 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4417 | 0 | } else { |
4418 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4419 | 0 | if (!(gb = calloc(1, sizeof(*gb)))) return -1; |
4420 | 0 | if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { |
4421 | 0 | free(gb); |
4422 | 0 | return -1; |
4423 | 0 | } |
4424 | 0 | gb->nbams = 0; |
4425 | 0 | gb->abams = SAM_NBAM; |
4426 | 0 | gb->bam_mem = 0; |
4427 | 0 | gb->fd = fd; |
4428 | 0 | fd->curr_idx = 0; |
4429 | 0 | fd->curr_bam = gb; |
4430 | 0 | } |
4431 | 0 | } |
4432 | | |
4433 | 0 | if (!bam_copy1(&gb->bams[gb->nbams++], b)) |
4434 | 0 | return -2; |
4435 | 0 | gb->bam_mem += b->l_data + sizeof(*b); |
4436 | | |
4437 | | // Dispatch if full |
4438 | 0 | if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { |
4439 | 0 | gb->serial = fd->serial++; |
4440 | 0 | pthread_mutex_lock(&fd->command_m); |
4441 | 0 | if (fd->errcode != 0) { |
4442 | 0 | pthread_mutex_unlock(&fd->command_m); |
4443 | 0 | return -fd->errcode; |
4444 | 0 | } |
4445 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb, |
4446 | 0 | cleanup_sp_bams, |
4447 | 0 | cleanup_sp_lines, 0) < 0) { |
4448 | 0 | pthread_mutex_unlock(&fd->command_m); |
4449 | 0 | return -1; |
4450 | 0 | } |
4451 | 0 | pthread_mutex_unlock(&fd->command_m); |
4452 | 0 | fd->curr_bam = NULL; |
4453 | 0 | } |
4454 | | |
4455 | | // Dummy value as we don't know how long it really is. |
4456 | | // We could track file sizes via a SAM_state field, but I don't think |
4457 | | // it is necessary. |
4458 | 0 | return 1; |
4459 | 17.7k | } else { |
4460 | 17.7k | if (sam_format1(h, b, &fp->line) < 0) return -1; |
4461 | 17.7k | kputc('\n', &fp->line); |
4462 | 17.7k | if (fp->is_bgzf) { |
4463 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4464 | 0 | return -1; |
4465 | 0 | if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4466 | 17.7k | } else { |
4467 | 17.7k | if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4468 | 17.7k | } |
4469 | | |
4470 | 17.7k | if (fp->idx) { |
4471 | 0 | if (fp->format.compression == bgzf) { |
4472 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4473 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4474 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4475 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4476 | 0 | return -1; |
4477 | 0 | } |
4478 | 0 | } else { |
4479 | 0 | if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4480 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4481 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4482 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4483 | 0 | return -1; |
4484 | 0 | } |
4485 | 0 | } |
4486 | 0 | } |
4487 | | |
4488 | 17.7k | return fp->line.l; |
4489 | 17.7k | } |
4490 | | |
4491 | | |
4492 | 0 | case fasta_format: |
4493 | 0 | case fastq_format: { |
4494 | 0 | fastq_state *x = (fastq_state *)fp->state; |
4495 | 0 | if (!x) { |
4496 | 0 | if (!(fp->state = fastq_state_init(fp->format.format |
4497 | 0 | == fastq_format ? '@' : '>'))) |
4498 | 0 | return -2; |
4499 | 0 | } |
4500 | | |
4501 | 0 | if (fastq_format1(fp->state, b, &fp->line) < 0) |
4502 | 0 | return -1; |
4503 | 0 | if (fp->is_bgzf) { |
4504 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4505 | 0 | return -1; |
4506 | 0 | if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) |
4507 | 0 | return -1; |
4508 | 0 | } else { |
4509 | 0 | if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) |
4510 | 0 | return -1; |
4511 | 0 | } |
4512 | 0 | return fp->line.l; |
4513 | 0 | } |
4514 | | |
4515 | 0 | default: |
4516 | 0 | errno = EBADF; |
4517 | 0 | return -1; |
4518 | 17.7k | } |
4519 | 17.7k | } |
4520 | | |
4521 | | /************************ |
4522 | | *** Auxiliary fields *** |
4523 | | ************************/ |
#ifndef HTS_LITTLE_ENDIAN
// Convert a run of host-order values of type_t read from `in` into
// little-endian values written to `out`.  Uses the enclosing function's
// `in`, `out` and `len`.
#define AUX_STORE_LE_RUN(type_t, store_le) do {                     \
        size_t off_;                                                \
        for (off_ = 0; off_ < len; off_ += sizeof(type_t)) {        \
            type_t host_;                                           \
            memcpy(&host_, in + off_, sizeof(type_t));              \
            store_le(host_, out + off_);                            \
        }                                                           \
    } while (0)

// Copy `len` bytes of aux tag payload of the given type from `in` to `out`,
// converting multi-byte values from host order to little-endian.
// Returns 0 on success, -1 if the type is unknown or `len` is not a
// multiple of the element size.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    const int width = aux_type2size(type);

    // Fixed-width payloads must hold a whole number of elements
    if (width >= 2 && width <= 8 && (len & (width - 1)) != 0)
        return -1;

    if (width == 1 || width == 'H' || width == 'Z') {
        // Byte-oriented data needs no swapping
        memcpy(out, in, len);
        return 0;
    }

    if (width == 2) {
        AUX_STORE_LE_RUN(uint16_t, u16_to_le);
        return 0;
    }

    if (width == 4) {
        AUX_STORE_LE_RUN(uint32_t, u32_to_le);
        return 0;
    }

    if (width == 8) {
        AUX_STORE_LE_RUN(uint64_t, u64_to_le);
        return 0;
    }

    if (width == 'B') {
        // Array: 1 byte element type, 4 byte count, then the elements,
        // which are converted by recursing with the element type.
        uint32_t count;
        if (len < 5)
            return -1;
        memcpy(&count, in + 1, 4);
        out[0] = in[0];
        u32_to_le(count, out + 1);
        return aux_to_le(in[0], out + 5, in + 5, len - 5);
    }

    // Unknown type code
    return -1;
}

#undef AUX_STORE_LE_RUN
#endif
4568 | | |
4569 | | int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data) |
4570 | 0 | { |
4571 | 0 | uint32_t new_len; |
4572 | |
|
4573 | 0 | assert(b->l_data >= 0); |
4574 | 0 | new_len = b->l_data + 3 + len; |
4575 | 0 | if (new_len > INT32_MAX || new_len < b->l_data) goto nomem; |
4576 | | |
4577 | 0 | if (realloc_bam_data(b, new_len) < 0) return -1; |
4578 | | |
4579 | 0 | b->data[b->l_data] = tag[0]; |
4580 | 0 | b->data[b->l_data + 1] = tag[1]; |
4581 | 0 | b->data[b->l_data + 2] = type; |
4582 | |
|
4583 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4584 | 0 | memcpy(b->data + b->l_data + 3, data, len); |
4585 | | #else |
4586 | | if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) { |
4587 | | errno = EINVAL; |
4588 | | return -1; |
4589 | | } |
4590 | | #endif |
4591 | |
|
4592 | 0 | b->l_data = new_len; |
4593 | |
|
4594 | 0 | return 0; |
4595 | | |
4596 | 0 | nomem: |
4597 | 0 | errno = ENOMEM; |
4598 | 0 | return -1; |
4599 | 0 | } |
4600 | | |
// Step over one aux value.
//
//   s    points at the type byte of the value
//   end  one past the last byte of the record's data
//
// Returns a pointer to the byte following the value, `end` if s is already
// at or beyond end (or a 'Z'/'H' string runs to end without a NUL), or NULL
// if the type is not recognised or the value does not fit before end.
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    uint64_t bytes;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        while (s < end && *s) ++s;
        return s < end ? s + 1 : end;
    case 'B':
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        if (size <= 0) return NULL;
        // size * n must be evaluated in 64 bits: in 32 bits a corrupt count
        // can wrap to a small number and slip past the bounds check.
        bytes = (uint64_t) size * n;
        if ((uint64_t) (end - s) < bytes) return NULL;
        return s + bytes;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4626 | | |
4627 | | uint8_t *bam_aux_first(const bam1_t *b) |
4628 | 970 | { |
4629 | 970 | uint8_t *s = bam_get_aux(b); |
4630 | 970 | uint8_t *end = b->data + b->l_data; |
4631 | 970 | if (s >= end) { errno = ENOENT; return NULL; } |
4632 | 781 | return s+2; |
4633 | 970 | } |
4634 | | |
4635 | | uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) |
4636 | 2.08M | { |
4637 | 2.08M | uint8_t *end = b->data + b->l_data; |
4638 | 2.08M | uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; |
4639 | 2.08M | if (next == NULL) goto bad_aux; |
4640 | 2.08M | if (next >= end) { errno = ENOENT; return NULL; } |
4641 | 2.08M | return next+2; |
4642 | | |
4643 | 0 | bad_aux: |
4644 | 0 | hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); |
4645 | 0 | errno = EINVAL; |
4646 | 0 | return NULL; |
4647 | 2.08M | } |
4648 | | |
4649 | | uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) |
4650 | 970 | { |
4651 | 970 | uint8_t *s; |
4652 | 2.08M | for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) |
4653 | 2.08M | if (s[-2] == tag[0] && s[-1] == tag[1]) { |
4654 | | // Check the tag value is valid and complete |
4655 | 618 | uint8_t *e = skip_aux(s, b->data + b->l_data); |
4656 | 618 | if (e == NULL) goto bad_aux; |
4657 | 618 | if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; |
4658 | | |
4659 | 618 | return s; |
4660 | 618 | } |
4661 | | |
4662 | | // errno now as set by bam_aux_first()/bam_aux_next() |
4663 | 352 | return NULL; |
4664 | | |
4665 | 0 | bad_aux: |
4666 | 0 | hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); |
4667 | 0 | errno = EINVAL; |
4668 | 0 | return NULL; |
4669 | 970 | } |
4670 | | |
4671 | | int bam_aux_del(bam1_t *b, uint8_t *s) |
4672 | 0 | { |
4673 | 0 | s = bam_aux_remove(b, s); |
4674 | 0 | return (s || errno == ENOENT)? 0 : -1; |
4675 | 0 | } |
4676 | | |
4677 | | uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) |
4678 | 0 | { |
4679 | 0 | uint8_t *end = b->data + b->l_data; |
4680 | 0 | uint8_t *next = skip_aux(s, end); |
4681 | 0 | if (next == NULL) goto bad_aux; |
4682 | | |
4683 | 0 | b->l_data -= next - (s-2); |
4684 | 0 | if (next >= end) { errno = ENOENT; return NULL; } |
4685 | | |
4686 | 0 | memmove(s-2, next, end - next); |
4687 | 0 | return s; |
4688 | | |
4689 | 0 | bad_aux: |
4690 | 0 | hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); |
4691 | 0 | errno = EINVAL; |
4692 | 0 | return NULL; |
4693 | 0 | } |
4694 | | |
4695 | | int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) |
4696 | 0 | { |
4697 | | // FIXME: This is not at all efficient! |
4698 | 0 | size_t ln = len >= 0 ? len : strlen(data) + 1; |
4699 | 0 | size_t old_ln = 0; |
4700 | 0 | int need_nul = ln == 0 || data[ln - 1] != '\0'; |
4701 | 0 | int save_errno = errno; |
4702 | 0 | int new_tag = 0; |
4703 | 0 | uint8_t *s = bam_aux_get(b,tag), *e; |
4704 | |
|
4705 | 0 | if (s) { // Replacing existing tag |
4706 | 0 | char type = *s; |
4707 | 0 | if (type != 'Z') { |
4708 | 0 | hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type); |
4709 | 0 | errno = EINVAL; |
4710 | 0 | return -1; |
4711 | 0 | } |
4712 | 0 | s++; |
4713 | 0 | e = memchr(s, '\0', b->data + b->l_data - s); |
4714 | 0 | old_ln = (e ? e - s : b->data + b->l_data - s) + 1; |
4715 | 0 | s -= 3; |
4716 | 0 | } else { |
4717 | 0 | if (errno != ENOENT) { // Invalid aux data, give up |
4718 | 0 | return -1; |
4719 | 0 | } else { // Tag doesn't exist - put it on the end |
4720 | 0 | errno = save_errno; |
4721 | 0 | s = b->data + b->l_data; |
4722 | 0 | new_tag = 3; |
4723 | 0 | } |
4724 | 0 | } |
4725 | | |
4726 | 0 | if (old_ln < ln + need_nul + new_tag) { |
4727 | 0 | ptrdiff_t s_offset = s - b->data; |
4728 | 0 | if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0) |
4729 | 0 | return -1; |
4730 | 0 | s = b->data + s_offset; |
4731 | 0 | } |
4732 | 0 | if (!new_tag) { |
4733 | 0 | memmove(s + 3 + ln + need_nul, |
4734 | 0 | s + 3 + old_ln, |
4735 | 0 | b->l_data - (s + 3 - b->data) - old_ln); |
4736 | 0 | } |
4737 | 0 | b->l_data += new_tag + ln + need_nul - old_ln; |
4738 | |
|
4739 | 0 | s[0] = tag[0]; |
4740 | 0 | s[1] = tag[1]; |
4741 | 0 | s[2] = 'Z'; |
4742 | 0 | memmove(s+3,data,ln); |
4743 | 0 | if (need_nul) s[3 + ln] = '\0'; |
4744 | 0 | return 0; |
4745 | 0 | } |
4746 | | |
4747 | | int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val) |
4748 | 0 | { |
4749 | 0 | uint32_t sz, old_sz = 0, new = 0; |
4750 | 0 | uint8_t *s, type; |
4751 | |
|
4752 | 0 | if (val < INT32_MIN || val > UINT32_MAX) { |
4753 | 0 | errno = EOVERFLOW; |
4754 | 0 | return -1; |
4755 | 0 | } |
4756 | 0 | if (val < INT16_MIN) { type = 'i'; sz = 4; } |
4757 | 0 | else if (val < INT8_MIN) { type = 's'; sz = 2; } |
4758 | 0 | else if (val < 0) { type = 'c'; sz = 1; } |
4759 | 0 | else if (val < UINT8_MAX) { type = 'C'; sz = 1; } |
4760 | 0 | else if (val < UINT16_MAX) { type = 'S'; sz = 2; } |
4761 | 0 | else { type = 'I'; sz = 4; } |
4762 | |
|
4763 | 0 | s = bam_aux_get(b, tag); |
4764 | 0 | if (s) { // Tag present - how big was the old one? |
4765 | 0 | switch (*s) { |
4766 | 0 | case 'c': case 'C': old_sz = 1; break; |
4767 | 0 | case 's': case 'S': old_sz = 2; break; |
4768 | 0 | case 'i': case 'I': old_sz = 4; break; |
4769 | 0 | default: errno = EINVAL; return -1; // Not an integer |
4770 | 0 | } |
4771 | 0 | } else { |
4772 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
4773 | 0 | s = b->data + b->l_data; |
4774 | 0 | new = 1; |
4775 | 0 | } else { // Invalid aux data, give up. |
4776 | 0 | return -1; |
4777 | 0 | } |
4778 | 0 | } |
4779 | | |
4780 | 0 | if (new || old_sz < sz) { |
4781 | | // Make room for new tag |
4782 | 0 | ptrdiff_t s_offset = s - b->data; |
4783 | 0 | if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0) |
4784 | 0 | return -1; |
4785 | 0 | s = b->data + s_offset; |
4786 | 0 | if (new) { // Add tag id |
4787 | 0 | *s++ = tag[0]; |
4788 | 0 | *s++ = tag[1]; |
4789 | 0 | } else { // Shift following data so we have space |
4790 | 0 | memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz); |
4791 | 0 | } |
4792 | 0 | } else { |
4793 | | // Reuse old space. Data value may be bigger than necessary but |
4794 | | // we avoid having to move everything else |
4795 | 0 | sz = old_sz; |
4796 | 0 | type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz]; |
4797 | 0 | assert(type > 0); |
4798 | 0 | } |
4799 | 0 | *s++ = type; |
4800 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4801 | 0 | memcpy(s, &val, sz); |
4802 | | #else |
4803 | | switch (sz) { |
4804 | | case 4: u32_to_le(val, s); break; |
4805 | | case 2: u16_to_le(val, s); break; |
4806 | | default: *s = val; break; |
4807 | | } |
4808 | | #endif |
4809 | 0 | b->l_data += (new ? 3 : 0) + sz - old_sz; |
4810 | 0 | return 0; |
4811 | 0 | } |
4812 | | |
4813 | | int bam_aux_update_float(bam1_t *b, const char tag[2], float val) |
4814 | 0 | { |
4815 | 0 | uint8_t *s = bam_aux_get(b, tag); |
4816 | 0 | int shrink = 0, new = 0; |
4817 | |
|
4818 | 0 | if (s) { // Tag present - what was it? |
4819 | 0 | switch (*s) { |
4820 | 0 | case 'f': break; |
4821 | 0 | case 'd': shrink = 1; break; |
4822 | 0 | default: errno = EINVAL; return -1; // Not a float |
4823 | 0 | } |
4824 | 0 | } else { |
4825 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
4826 | 0 | new = 1; |
4827 | 0 | } else { // Invalid aux data, give up. |
4828 | 0 | return -1; |
4829 | 0 | } |
4830 | 0 | } |
4831 | | |
4832 | 0 | if (new) { // Ensure there's room |
4833 | 0 | if (possibly_expand_bam_data(b, 3 + 4) < 0) |
4834 | 0 | return -1; |
4835 | 0 | s = b->data + b->l_data; |
4836 | 0 | *s++ = tag[0]; |
4837 | 0 | *s++ = tag[1]; |
4838 | 0 | } else if (shrink) { // Convert non-standard double tag to float |
4839 | 0 | memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data)); |
4840 | 0 | b->l_data -= 4; |
4841 | 0 | } |
4842 | 0 | *s++ = 'f'; |
4843 | 0 | float_to_le(val, s); |
4844 | 0 | if (new) b->l_data += 7; |
4845 | |
|
4846 | 0 | return 0; |
4847 | 0 | } |
4848 | | |
4849 | | int bam_aux_update_array(bam1_t *b, const char tag[2], |
4850 | | uint8_t type, uint32_t items, void *data) |
4851 | 0 | { |
4852 | 0 | uint8_t *s = bam_aux_get(b, tag); |
4853 | 0 | size_t old_sz = 0, new_sz; |
4854 | 0 | int new = 0; |
4855 | |
|
4856 | 0 | if (s) { // Tag present |
4857 | 0 | if (*s != 'B') { errno = EINVAL; return -1; } |
4858 | 0 | old_sz = aux_type2size(s[1]); |
4859 | 0 | if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; } |
4860 | 0 | old_sz *= le_to_u32(s + 2); |
4861 | 0 | } else { |
4862 | 0 | if (errno == ENOENT) { // Tag doesn't exist - add a new one |
4863 | 0 | s = b->data + b->l_data; |
4864 | 0 | new = 1; |
4865 | 0 | } else { // Invalid aux data, give up. |
4866 | 0 | return -1; |
4867 | 0 | } |
4868 | 0 | } |
4869 | | |
4870 | 0 | new_sz = aux_type2size(type); |
4871 | 0 | if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; } |
4872 | 0 | if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; } |
4873 | 0 | new_sz *= items; |
4874 | |
|
4875 | 0 | if (new || old_sz < new_sz) { |
4876 | | // Make room for new tag |
4877 | 0 | ptrdiff_t s_offset = s - b->data; |
4878 | 0 | if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0) |
4879 | 0 | return -1; |
4880 | 0 | s = b->data + s_offset; |
4881 | 0 | } |
4882 | 0 | if (new) { // Add tag id and type |
4883 | 0 | *s++ = tag[0]; |
4884 | 0 | *s++ = tag[1]; |
4885 | 0 | *s = 'B'; |
4886 | 0 | b->l_data += 8 + new_sz; |
4887 | 0 | } else if (old_sz != new_sz) { // shift following data if necessary |
4888 | 0 | memmove(s + 6 + new_sz, s + 6 + old_sz, |
4889 | 0 | b->l_data - ((s + 6 + old_sz) - b->data)); |
4890 | 0 | b->l_data -= old_sz; |
4891 | 0 | b->l_data += new_sz; |
4892 | 0 | } |
4893 | |
|
4894 | 0 | s[1] = type; |
4895 | 0 | u32_to_le(items, s + 2); |
4896 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4897 | 0 | memcpy(s + 6, data, new_sz); |
4898 | 0 | return 0; |
4899 | | #else |
4900 | | return aux_to_le(type, s + 6, data, new_sz); |
4901 | | #endif |
4902 | 0 | } |
4903 | | |
// Read element `idx` of a little-endian integer payload at `s` whose
// elements have the given aux type code.
// Returns the value, or 0 with errno = EINVAL for a non-integer type.
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    errno = EINVAL;
    return 0;
}
4919 | | |
// Return the integer value of the aux field at `s` (pointing at its type
// byte).  Non-integer types yield 0 with errno = EINVAL.
int64_t bam_aux2i(const uint8_t *s)
{
    const uint8_t *payload = s + 1;
    return get_int_aux_val(s[0], payload, 0);
}
4926 | | |
// Return the value of the aux field at `s` (pointing at its type byte) as a
// double.  'd' and 'f' fields are read directly; anything else is handed to
// the integer reader, which yields 0 with errno = EINVAL for non-integers.
double bam_aux2f(const uint8_t *s)
{
    const uint8_t *payload = s + 1;

    switch (s[0]) {
    case 'd':
        return le_to_double(payload);
    case 'f':
        return le_to_float(payload);
    default:
        return get_int_aux_val(s[0], payload, 0);
    }
}
4935 | | |
// Return the character held by an 'A' aux field at `s` (pointing at its
// type byte), or 0 with errno = EINVAL if the field is of another type.
char bam_aux2A(const uint8_t *s)
{
    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }
    return *(const char *)(s + 1);
}
4944 | | |
// Return the string held by a 'Z' or 'H' aux field at `s` (pointing at its
// type byte), or NULL with errno = EINVAL if the field is of another type.
// The returned pointer refers to the field's own storage.
char *bam_aux2Z(const uint8_t *s)
{
    const int type = s[0];

    if (type != 'Z' && type != 'H') {
        errno = EINVAL;
        return 0;
    }
    return (char *)(s + 1);
}
4953 | | |
// Return the element count stored in a 'B' (array) aux field at `s`
// (pointing at its type byte), or 0 with errno = EINVAL if the field is
// not an array.
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);

    errno = EINVAL;
    return 0;
}
4962 | | |
// Return element `idx` of an integer 'B' array aux field at `s`.
// Returns 0 with errno = ERANGE if idx is not below the array length
// (which is 0 for a non-array field), or 0 with errno = EINVAL if the
// element type is not an integer type.
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    const uint32_t count = bam_auxB_len(s);

    if (idx < count)
        return get_int_aux_val(s[1], s + 6, idx);

    errno = ERANGE;
    return 0;
}
4972 | | |
// Return element `idx` of a 'B' array aux field at `s` as a double.
// 'f' elements are read directly; other element types go through the
// integer reader.  Returns 0.0 with errno = ERANGE if idx is not below the
// array length (which is 0 for a non-array field).
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    const uint32_t count = bam_auxB_len(s);

    if (idx >= count) {
        errno = ERANGE;
        return 0.0;
    }

    if (s[1] != 'f')
        return get_int_aux_val(s[1], s + 6, idx);

    return le_to_float(s + 6 + 4 * idx);
}
4983 | | |
4984 | | int sam_open_mode(char *mode, const char *fn, const char *format) |
4985 | 0 | { |
4986 | | // TODO Parse "bam5" etc for compression level |
4987 | 0 | if (format == NULL) { |
4988 | | // Try to pick a format based on the filename extension |
4989 | 0 | char extension[HTS_MAX_EXT_LEN]; |
4990 | 0 | if (find_file_extension(fn, extension) < 0) return -1; |
4991 | 0 | return sam_open_mode(mode, fn, extension); |
4992 | 0 | } |
4993 | 0 | else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b"); |
4994 | 0 | else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c"); |
4995 | 0 | else if (strcasecmp(format, "sam") == 0) strcpy(mode, ""); |
4996 | 0 | else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); |
4997 | 0 | else if (strcasecmp(format, "fastq") == 0 || |
4998 | 0 | strcasecmp(format, "fq") == 0) strcpy(mode, "f"); |
4999 | 0 | else if (strcasecmp(format, "fastq.gz") == 0 || |
5000 | 0 | strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz"); |
5001 | 0 | else if (strcasecmp(format, "fasta") == 0 || |
5002 | 0 | strcasecmp(format, "fa") == 0) strcpy(mode, "F"); |
5003 | 0 | else if (strcasecmp(format, "fasta.gz") == 0 || |
5004 | 0 | strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz"); |
5005 | 0 | else return -1; |
5006 | | |
5007 | 0 | return 0; |
5008 | 0 | } |
5009 | | |
5010 | | // A version of sam_open_mode that can handle ,key=value options. |
5011 | | // The format string is allocated and returned, to be freed by the caller. |
5012 | | // Prefix should be "r" or "w", |
5013 | | char *sam_open_mode_opts(const char *fn, |
5014 | | const char *mode, |
5015 | | const char *format) |
5016 | 0 | { |
5017 | 0 | char *mode_opts = malloc((format ? strlen(format) : 1) + |
5018 | 0 | (mode ? strlen(mode) : 1) + 12); |
5019 | 0 | char *opts, *cp; |
5020 | 0 | int format_len; |
5021 | |
|
5022 | 0 | if (!mode_opts) |
5023 | 0 | return NULL; |
5024 | | |
5025 | 0 | strcpy(mode_opts, mode ? mode : "r"); |
5026 | 0 | cp = mode_opts + strlen(mode_opts); |
5027 | |
|
5028 | 0 | if (format == NULL) { |
5029 | | // Try to pick a format based on the filename extension |
5030 | 0 | char extension[HTS_MAX_EXT_LEN]; |
5031 | 0 | if (find_file_extension(fn, extension) < 0) { |
5032 | 0 | free(mode_opts); |
5033 | 0 | return NULL; |
5034 | 0 | } |
5035 | 0 | if (sam_open_mode(cp, fn, extension) == 0) { |
5036 | 0 | return mode_opts; |
5037 | 0 | } else { |
5038 | 0 | free(mode_opts); |
5039 | 0 | return NULL; |
5040 | 0 | } |
5041 | 0 | } |
5042 | | |
5043 | 0 | if ((opts = strchr(format, ','))) { |
5044 | 0 | format_len = opts-format; |
5045 | 0 | } else { |
5046 | 0 | opts=""; |
5047 | 0 | format_len = strlen(format); |
5048 | 0 | } |
5049 | |
|
5050 | 0 | if (strncmp(format, "bam", format_len) == 0) { |
5051 | 0 | *cp++ = 'b'; |
5052 | 0 | } else if (strncmp(format, "cram", format_len) == 0) { |
5053 | 0 | *cp++ = 'c'; |
5054 | 0 | } else if (strncmp(format, "cram2", format_len) == 0) { |
5055 | 0 | *cp++ = 'c'; |
5056 | 0 | strcpy(cp, ",VERSION=2.1"); |
5057 | 0 | cp += 12; |
5058 | 0 | } else if (strncmp(format, "cram3", format_len) == 0) { |
5059 | 0 | *cp++ = 'c'; |
5060 | 0 | strcpy(cp, ",VERSION=3.0"); |
5061 | 0 | cp += 12; |
5062 | 0 | } else if (strncmp(format, "sam", format_len) == 0) { |
5063 | 0 | ; // format mode="" |
5064 | 0 | } else if (strncmp(format, "sam.gz", format_len) == 0) { |
5065 | 0 | *cp++ = 'z'; |
5066 | 0 | } else if (strncmp(format, "fastq", format_len) == 0 || |
5067 | 0 | strncmp(format, "fq", format_len) == 0) { |
5068 | 0 | *cp++ = 'f'; |
5069 | 0 | } else if (strncmp(format, "fastq.gz", format_len) == 0 || |
5070 | 0 | strncmp(format, "fq.gz", format_len) == 0) { |
5071 | 0 | *cp++ = 'f'; |
5072 | 0 | *cp++ = 'z'; |
5073 | 0 | } else if (strncmp(format, "fasta", format_len) == 0 || |
5074 | 0 | strncmp(format, "fa", format_len) == 0) { |
5075 | 0 | *cp++ = 'F'; |
5076 | 0 | } else if (strncmp(format, "fasta.gz", format_len) == 0 || |
5077 | 0 | strncmp(format, "fa", format_len) == 0) { |
5078 | 0 | *cp++ = 'F'; |
5079 | 0 | *cp++ = 'z'; |
5080 | 0 | } else { |
5081 | 0 | free(mode_opts); |
5082 | 0 | return NULL; |
5083 | 0 | } |
5084 | | |
5085 | 0 | strcpy(cp, opts); |
5086 | |
|
5087 | 0 | return mode_opts; |
5088 | 0 | } |
5089 | | |
5090 | 0 | #define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n)) |
5091 | | int bam_str2flag(const char *str) |
5092 | 0 | { |
5093 | 0 | char *end, *beg = (char*) str; |
5094 | 0 | long int flag = strtol(str, &end, 0); |
5095 | 0 | if ( end!=str ) return flag; // the conversion was successful |
5096 | 0 | flag = 0; |
5097 | 0 | while ( *str ) |
5098 | 0 | { |
5099 | 0 | end = beg; |
5100 | 0 | while ( *end && *end!=',' ) end++; |
5101 | 0 | if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED; |
5102 | 0 | else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR; |
5103 | 0 | else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP; |
5104 | 0 | else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP; |
5105 | 0 | else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE; |
5106 | 0 | else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE; |
5107 | 0 | else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1; |
5108 | 0 | else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2; |
5109 | 0 | else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY; |
5110 | 0 | else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL; |
5111 | 0 | else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP; |
5112 | 0 | else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY; |
5113 | 0 | else return -1; |
5114 | 0 | if ( !*end ) break; |
5115 | 0 | beg = end + 1; |
5116 | 0 | } |
5117 | 0 | return flag; |
5118 | 0 | } |
5119 | | |
5120 | | char *bam_flag2str(int flag) |
5121 | 0 | { |
5122 | 0 | kstring_t str = {0,0,0}; |
5123 | 0 | if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED"); |
5124 | 0 | if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR"); |
5125 | 0 | if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP"); |
5126 | 0 | if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP"); |
5127 | 0 | if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE"); |
5128 | 0 | if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE"); |
5129 | 0 | if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1"); |
5130 | 0 | if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2"); |
5131 | 0 | if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY"); |
5132 | 0 | if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL"); |
5133 | 0 | if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP"); |
5134 | 0 | if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY"); |
5135 | 0 | if ( str.l == 0 ) kputsn("", 0, &str); |
5136 | 0 | return str.s; |
5137 | 0 | } |
5138 | | |
5139 | | |
5140 | | /************************** |
5141 | | *** Pileup and Mpileup *** |
5142 | | **************************/ |
5143 | | |
5144 | | #if !defined(BAM_NO_PILEUP) |
5145 | | |
5146 | | #include <assert.h> |
5147 | | |
5148 | | /******************* |
5149 | | *** Memory pool *** |
5150 | | *******************/ |
5151 | | |
// Per-read CIGAR traversal state kept between pileup positions.
typedef struct {
    // k: index of the CIGAR operator that has just been processed
    //    (-1 means the read has never been processed);
    // y: query coordinate of the start of operator k.
    int k, y;
    // x: reference coordinate of the start of operator k;
    // end: NOTE(review): presumably the read's last reference position -
    //      confirm against the code that sets it.
    hts_pos_t x, end;
} cstate_t;

// Initial state: k == -1 marks a read that has not been processed yet.
static cstate_t g_cstate_null = { -1, 0, 0, 0 };

// One buffered read, held in a singly linked list.
typedef struct __linkbuf_t {
    bam1_t b;                   // The alignment record (its data is owned by the node)
    hts_pos_t beg, end;         // NOTE(review): presumably the reference span of the read - confirm
    cstate_t s;                 // CIGAR traversal state for this read
    struct __linkbuf_t *next;   // Next node; cleared when the node is returned to the pool
    bam_pileup_cd cd;           // NOTE(review): presumably per-read client data - confirm
} lbnode_t;

// Pool of recycled lbnode_t nodes.
typedef struct {
    // cnt: nodes handed out by mp_alloc() and not yet given back via mp_free();
    // n:   number of free nodes currently stored in buf;
    // max: number of slots allocated in buf.
    int cnt, n, max;
    lbnode_t **buf;             // Stack of free nodes
} mempool_t;
5171 | | |
5172 | | static mempool_t *mp_init(void) |
5173 | 0 | { |
5174 | 0 | mempool_t *mp; |
5175 | 0 | mp = (mempool_t*)calloc(1, sizeof(mempool_t)); |
5176 | 0 | return mp; |
5177 | 0 | } |
5178 | | static void mp_destroy(mempool_t *mp) |
5179 | 0 | { |
5180 | 0 | int k; |
5181 | 0 | for (k = 0; k < mp->n; ++k) { |
5182 | 0 | free(mp->buf[k]->b.data); |
5183 | 0 | free(mp->buf[k]); |
5184 | 0 | } |
5185 | 0 | free(mp->buf); |
5186 | 0 | free(mp); |
5187 | 0 | } |
5188 | | static inline lbnode_t *mp_alloc(mempool_t *mp) |
5189 | 0 | { |
5190 | 0 | ++mp->cnt; |
5191 | 0 | if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t)); |
5192 | 0 | else return mp->buf[--mp->n]; |
5193 | 0 | } |
5194 | | static inline void mp_free(mempool_t *mp, lbnode_t *p) |
5195 | 0 | { |
5196 | 0 | --mp->cnt; p->next = 0; // clear lbnode_t::next here |
5197 | 0 | if (mp->n == mp->max) { |
5198 | 0 | mp->max = mp->max? mp->max<<1 : 256; |
5199 | 0 | mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max); |
5200 | 0 | } |
5201 | 0 | mp->buf[mp->n++] = p; |
5202 | 0 | } |
5203 | | |
5204 | | /********************** |
5205 | | *** CIGAR resolver *** |
5206 | | **********************/ |
5207 | | |
5208 | | /* s->k: the index of the CIGAR operator that has just been processed. |
5209 | | s->x: the reference coordinate of the start of s->k |
5210 | | s->y: the query coordinate of the start of s->k |
5211 | | */ |
5212 | | static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) |
5213 | 0 | { |
5214 | 0 | #define _cop(c) ((c)&BAM_CIGAR_MASK) |
5215 | 0 | #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) |
5216 | |
|
5217 | 0 | bam1_t *b = p->b; |
5218 | 0 | bam1_core_t *c = &b->core; |
5219 | 0 | uint32_t *cigar = bam_get_cigar(b); |
5220 | 0 | int k; |
5221 | | // determine the current CIGAR operation |
5222 | | //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); |
5223 | 0 | if (s->k == -1) { // never processed |
5224 | 0 | p->qpos = 0; |
5225 | 0 | if (c->n_cigar == 1) { // just one operation, save a loop |
5226 | 0 | if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0; |
5227 | 0 | } else { // find the first match or deletion |
5228 | 0 | for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) { |
5229 | 0 | int op = _cop(cigar[k]); |
5230 | 0 | int l = _cln(cigar[k]); |
5231 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || |
5232 | 0 | op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5233 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5234 | 0 | } |
5235 | 0 | assert(k < c->n_cigar); |
5236 | 0 | s->k = k; |
5237 | 0 | } |
5238 | 0 | } else { // the read has been processed before |
5239 | 0 | int op, l = _cln(cigar[s->k]); |
5240 | 0 | if (pos - s->x >= l) { // jump to the next operation |
5241 | 0 | assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case |
5242 | 0 | op = _cop(cigar[s->k+1]); |
5243 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop |
5244 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5245 | 0 | s->x += l; |
5246 | 0 | ++s->k; |
5247 | 0 | } else { // find the next M/D/N/=/X |
5248 | 0 | if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l; |
5249 | 0 | s->x += l; |
5250 | 0 | for (k = s->k + 1; k < c->n_cigar; ++k) { |
5251 | 0 | op = _cop(cigar[k]), l = _cln(cigar[k]); |
5252 | 0 | if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break; |
5253 | 0 | else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l; |
5254 | 0 | } |
5255 | 0 | s->k = k; |
5256 | 0 | } |
5257 | 0 | assert(s->k < c->n_cigar); // otherwise a bug |
5258 | 0 | } // else, do nothing |
5259 | 0 | } |
5260 | 0 | { // collect pileup information |
5261 | 0 | int op, l; |
5262 | 0 | op = _cop(cigar[s->k]); l = _cln(cigar[s->k]); |
5263 | 0 | p->is_del = p->indel = p->is_refskip = 0; |
5264 | 0 | if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation |
5265 | 0 | int op2 = _cop(cigar[s->k+1]); |
5266 | 0 | int l2 = _cln(cigar[s->k+1]); |
5267 | 0 | if (op2 == BAM_CDEL) p->indel = -(int)l2; |
5268 | 0 | else if (op2 == BAM_CINS) p->indel = l2; |
5269 | 0 | else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding |
5270 | 0 | int l3 = 0; |
5271 | 0 | for (k = s->k + 2; k < c->n_cigar; ++k) { |
5272 | 0 | op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); |
5273 | 0 | if (op2 == BAM_CINS) l3 += l2; |
5274 | 0 | else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break; |
5275 | 0 | } |
5276 | 0 | if (l3 > 0) p->indel = l3; |
5277 | 0 | } |
5278 | 0 | } |
5279 | 0 | if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { |
5280 | 0 | p->qpos = s->y + (pos - s->x); |
5281 | 0 | } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { |
5282 | 0 | p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!! |
5283 | 0 | p->is_refskip = (op == BAM_CREF_SKIP); |
5284 | 0 | } // cannot be other operations; otherwise a bug |
5285 | 0 | p->is_head = (pos == c->pos); p->is_tail = (pos == s->end); |
5286 | 0 | } |
5287 | 0 | p->cigar_ind = s->k; |
5288 | 0 | return 1; |
5289 | 0 | } |
5290 | | |
5291 | | /******************************* |
5292 | | *** Expansion of insertions *** |
5293 | | *******************************/ |
5294 | | |
5295 | | /* |
5296 | | * Fills out the kstring with the padded insertion sequence for the current |
5297 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5298 | | * |
5299 | | * This variant handles base modifications, but only when "m" is non-NULL. |
5300 | | * |
5301 | | * Returns the number of inserted base on success, with string length being |
5302 | | * accessable via ins->l; |
5303 | | * -1 on failure. |
5304 | | */ |
5305 | | int bam_plp_insertion_mod(const bam_pileup1_t *p, |
5306 | | hts_base_mod_state *m, |
5307 | 0 | kstring_t *ins, int *del_len) { |
5308 | 0 | int j, k, indel, nb = 0; |
5309 | 0 | uint32_t *cigar; |
5310 | |
|
5311 | 0 | if (p->indel <= 0) { |
5312 | 0 | if (ks_resize(ins, 1) < 0) |
5313 | 0 | return -1; |
5314 | 0 | ins->l = 0; |
5315 | 0 | ins->s[0] = '\0'; |
5316 | 0 | return 0; |
5317 | 0 | } |
5318 | | |
5319 | 0 | if (del_len) |
5320 | 0 | *del_len = 0; |
5321 | | |
5322 | | // Measure indel length including pads |
5323 | 0 | indel = 0; |
5324 | 0 | k = p->cigar_ind+1; |
5325 | 0 | cigar = bam_get_cigar(p->b); |
5326 | 0 | while (k < p->b->core.n_cigar) { |
5327 | 0 | switch (cigar[k] & BAM_CIGAR_MASK) { |
5328 | 0 | case BAM_CPAD: |
5329 | 0 | case BAM_CINS: |
5330 | 0 | indel += (cigar[k] >> BAM_CIGAR_SHIFT); |
5331 | 0 | break; |
5332 | 0 | default: |
5333 | 0 | k = p->b->core.n_cigar; |
5334 | 0 | break; |
5335 | 0 | } |
5336 | 0 | k++; |
5337 | 0 | } |
5338 | 0 | nb = ins->l = indel; |
5339 | | |
5340 | | // Produce sequence |
5341 | 0 | if (ks_resize(ins, indel+1) < 0) |
5342 | 0 | return -1; |
5343 | 0 | indel = 0; |
5344 | 0 | k = p->cigar_ind+1; |
5345 | 0 | j = 1; |
5346 | 0 | while (k < p->b->core.n_cigar) { |
5347 | 0 | int l, c; |
5348 | 0 | switch (cigar[k] & BAM_CIGAR_MASK) { |
5349 | 0 | case BAM_CPAD: |
5350 | 0 | for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++) |
5351 | 0 | ins->s[indel++] = '*'; |
5352 | 0 | break; |
5353 | 0 | case BAM_CINS: |
5354 | 0 | for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { |
5355 | 0 | c = p->qpos + j - p->is_del < p->b->core.l_qseq |
5356 | 0 | ? seq_nt16_str[bam_seqi(bam_get_seq(p->b), |
5357 | 0 | p->qpos + j - p->is_del)] |
5358 | 0 | : 'N'; |
5359 | 0 | ins->s[indel++] = c; |
5360 | 0 | int nm; |
5361 | 0 | hts_base_mod mod[256]; |
5362 | 0 | if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, |
5363 | 0 | m, mod, 256)) > 0) { |
5364 | 0 | int o_indel = indel; |
5365 | 0 | if (ks_resize(ins, ins->l + nm*16+3) < 0) |
5366 | 0 | return -1; |
5367 | 0 | ins->s[indel++] = '['; |
5368 | 0 | int j; |
5369 | 0 | for (j = 0; j < nm; j++) { |
5370 | 0 | char qual[20]; |
5371 | 0 | if (mod[j].qual >= 0) |
5372 | 0 | sprintf(qual, "%d", mod[j].qual); |
5373 | 0 | else |
5374 | 0 | *qual=0; |
5375 | 0 | if (mod[j].modified_base < 0) |
5376 | | // ChEBI |
5377 | 0 | indel += sprintf(&ins->s[indel], "%c(%d)%s", |
5378 | 0 | "+-"[mod[j].strand], |
5379 | 0 | -mod[j].modified_base, |
5380 | 0 | qual); |
5381 | 0 | else |
5382 | 0 | indel += sprintf(&ins->s[indel], "%c%c%s", |
5383 | 0 | "+-"[mod[j].strand], |
5384 | 0 | mod[j].modified_base, |
5385 | 0 | qual); |
5386 | 0 | } |
5387 | 0 | ins->s[indel++] = ']'; |
5388 | 0 | ins->l += indel - o_indel; // grow by amount we used |
5389 | 0 | } |
5390 | 0 | } |
5391 | 0 | break; |
5392 | 0 | case BAM_CDEL: |
5393 | | // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style |
5394 | 0 | if (del_len) |
5395 | 0 | *del_len = cigar[k]>>BAM_CIGAR_SHIFT; |
5396 | | // fall through |
5397 | 0 | default: |
5398 | 0 | k = p->b->core.n_cigar; |
5399 | 0 | break; |
5400 | 0 | } |
5401 | 0 | k++; |
5402 | 0 | } |
5403 | 0 | ins->s[indel] = '\0'; |
5404 | 0 | ins->l = indel; // string length |
5405 | |
|
5406 | 0 | return nb; // base length |
5407 | 0 | } |
5408 | | |
5409 | | /* |
5410 | | * Fills out the kstring with the padded insertion sequence for the current |
5411 | | * location in 'p'. If this is not an insertion site, the string is blank. |
5412 | | * |
5413 | | * This is the original interface with no capability for reporting base |
5414 | | * modifications. |
5415 | | * |
5416 | | * Returns the length of insertion string on success; |
5417 | | * -1 on failure. |
5418 | | */ |
5419 | 0 | int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { |
5420 | 0 | return bam_plp_insertion_mod(p, NULL, ins, del_len); |
5421 | 0 | } |
5422 | | |
5423 | | /*********************** |
5424 | | *** Pileup iterator *** |
5425 | | ***********************/ |
5426 | | |
// Dictionary of overlapping reads.
// Keyed by read name (a C string pointing into the queued record's own data);
// the value is the queued node of the mate that was seen first.
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
typedef khash_t(olap_hash) olap_hash_t;
5430 | | |
5431 | | struct bam_plp_s { |
5432 | | mempool_t *mp; |
5433 | | lbnode_t *head, *tail; |
5434 | | int32_t tid, max_tid; |
5435 | | hts_pos_t pos, max_pos; |
5436 | | int is_eof, max_plp, error, maxcnt; |
5437 | | uint64_t id; |
5438 | | bam_pileup1_t *plp; |
5439 | | // for the "auto" interface only |
5440 | | bam1_t *b; |
5441 | | bam_plp_auto_f func; |
5442 | | void *data; |
5443 | | olap_hash_t *overlaps; |
5444 | | |
5445 | | // For notification of creation and destruction events |
5446 | | // and associated client-owned pointer. |
5447 | | int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd); |
5448 | | int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd); |
5449 | | }; |
5450 | | |
5451 | | bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data) |
5452 | 0 | { |
5453 | 0 | bam_plp_t iter; |
5454 | 0 | iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s)); |
5455 | 0 | iter->mp = mp_init(); |
5456 | 0 | iter->head = iter->tail = mp_alloc(iter->mp); |
5457 | 0 | iter->max_tid = iter->max_pos = -1; |
5458 | 0 | iter->maxcnt = 8000; |
5459 | 0 | if (func) { |
5460 | 0 | iter->func = func; |
5461 | 0 | iter->data = data; |
5462 | 0 | iter->b = bam_init1(); |
5463 | 0 | } |
5464 | 0 | return iter; |
5465 | 0 | } |
5466 | | |
5467 | | int bam_plp_init_overlaps(bam_plp_t iter) |
5468 | 0 | { |
5469 | 0 | iter->overlaps = kh_init(olap_hash); // hash for tweaking quality of bases in overlapping reads |
5470 | 0 | return iter->overlaps ? 0 : -1; |
5471 | 0 | } |
5472 | | |
5473 | | void bam_plp_destroy(bam_plp_t iter) |
5474 | 0 | { |
5475 | 0 | lbnode_t *p, *pnext; |
5476 | 0 | if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps); |
5477 | 0 | for (p = iter->head; p != NULL; p = pnext) { |
5478 | 0 | pnext = p->next; |
5479 | 0 | mp_free(iter->mp, p); |
5480 | 0 | } |
5481 | 0 | mp_destroy(iter->mp); |
5482 | 0 | if (iter->b) bam_destroy1(iter->b); |
5483 | 0 | free(iter->plp); |
5484 | 0 | free(iter); |
5485 | 0 | } |
5486 | | |
5487 | | void bam_plp_constructor(bam_plp_t plp, |
5488 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5489 | 0 | plp->plp_construct = func; |
5490 | 0 | } |
5491 | | |
5492 | | void bam_plp_destructor(bam_plp_t plp, |
5493 | 0 | int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) { |
5494 | 0 | plp->plp_destruct = func; |
5495 | 0 | } |
5496 | | |
5497 | | //--------------------------------- |
5498 | | //--- Tweak overlapping reads |
5499 | | //--------------------------------- |
5500 | | |
5501 | | /** |
5502 | | * cigar_iref2iseq_set() - find the first CMATCH setting the ref and the read index |
5503 | | * cigar_iref2iseq_next() - get the next CMATCH base |
5504 | | * @cigar: pointer to current cigar block (rw) |
5505 | | * @cigar_max: pointer just beyond the last cigar block |
5506 | | * @icig: position within the current cigar block (rw) |
5507 | | * @iseq: position in the sequence (rw) |
5508 | | * @iref: position with respect to the beginning of the read (iref_pos - b->core.pos) (rw) |
5509 | | * |
5510 | | * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, |
5511 | | * or -2 on error. |
5512 | | */ |
5513 | | static inline int cigar_iref2iseq_set(const uint32_t **cigar, |
5514 | | const uint32_t *cigar_max, |
5515 | | hts_pos_t *icig, |
5516 | | hts_pos_t *iseq, |
5517 | | hts_pos_t *iref) |
5518 | 0 | { |
5519 | 0 | hts_pos_t pos = *iref; |
5520 | 0 | if ( pos < 0 ) return -1; |
5521 | 0 | *icig = 0; |
5522 | 0 | *iseq = 0; |
5523 | 0 | *iref = 0; |
5524 | 0 | while ( *cigar<cigar_max ) |
5525 | 0 | { |
5526 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5527 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5528 | |
|
5529 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5530 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } |
5531 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5532 | 0 | { |
5533 | 0 | pos -= ncig; |
5534 | 0 | if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; } |
5535 | 0 | (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig; |
5536 | 0 | continue; |
5537 | 0 | } |
5538 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } |
5539 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) |
5540 | 0 | { |
5541 | 0 | pos -= ncig; |
5542 | 0 | if ( pos<0 ) pos = 0; |
5543 | 0 | (*cigar)++; *icig = 0; *iref += ncig; |
5544 | 0 | continue; |
5545 | 0 | } |
5546 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5547 | 0 | return -2; |
5548 | 0 | } |
5549 | 0 | *iseq = -1; |
5550 | 0 | return -1; |
5551 | 0 | } |
5552 | | static inline int cigar_iref2iseq_next(const uint32_t **cigar, |
5553 | | const uint32_t *cigar_max, |
5554 | | hts_pos_t *icig, |
5555 | | hts_pos_t *iseq, |
5556 | | hts_pos_t *iref) |
5557 | 0 | { |
5558 | 0 | while ( *cigar < cigar_max ) |
5559 | 0 | { |
5560 | 0 | int cig = (**cigar) & BAM_CIGAR_MASK; |
5561 | 0 | int ncig = (**cigar) >> BAM_CIGAR_SHIFT; |
5562 | |
|
5563 | 0 | if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) |
5564 | 0 | { |
5565 | 0 | if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } |
5566 | 0 | (*iseq)++; (*icig)++; (*iref)++; |
5567 | 0 | return BAM_CMATCH; |
5568 | 0 | } |
5569 | 0 | if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } |
5570 | 0 | if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5571 | 0 | if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } |
5572 | 0 | if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } |
5573 | 0 | hts_log_error("Unexpected cigar %d", cig); |
5574 | 0 | return -2; |
5575 | 0 | } |
5576 | 0 | *iseq = -1; |
5577 | 0 | *iref = -1; |
5578 | 0 | return -1; |
5579 | 0 | } |
5580 | | |
5581 | | // Given overlapping read 'a' (left) and 'b' (right) on the same |
5582 | | // template, adjust quality values to zero for either a or b. |
5583 | | // Note versions 1.12 and earlier always removed quality from 'b' for |
5584 | | // matching bases. Now we select a or b semi-randomly based on name hash. |
5585 | | // Returns 0 on success, |
5586 | | // -1 on failure |
5587 | | static int tweak_overlap_quality(bam1_t *a, bam1_t *b) |
5588 | 0 | { |
5589 | 0 | const uint32_t *a_cigar = bam_get_cigar(a), |
5590 | 0 | *a_cigar_max = a_cigar + a->core.n_cigar; |
5591 | 0 | const uint32_t *b_cigar = bam_get_cigar(b), |
5592 | 0 | *b_cigar_max = b_cigar + b->core.n_cigar; |
5593 | 0 | hts_pos_t a_icig = 0, a_iseq = 0; |
5594 | 0 | hts_pos_t b_icig = 0, b_iseq = 0; |
5595 | 0 | uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); |
5596 | 0 | uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); |
5597 | |
|
5598 | 0 | hts_pos_t iref = b->core.pos; |
5599 | 0 | hts_pos_t a_iref = iref - a->core.pos; |
5600 | 0 | hts_pos_t b_iref = iref - b->core.pos; |
5601 | |
|
5602 | 0 | int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, |
5603 | 0 | &a_icig, &a_iseq, &a_iref); |
5604 | 0 | if ( a_ret<0 ) |
5605 | | // no overlap or error |
5606 | 0 | return a_ret<-1 ? -1:0; |
5607 | | |
5608 | 0 | int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, |
5609 | 0 | &b_icig, &b_iseq, &b_iref); |
5610 | 0 | if ( b_ret<0 ) |
5611 | | // no overlap or error |
5612 | 0 | return b_ret<-1 ? -1:0; |
5613 | | |
5614 | | // Determine which seq is the one getting modified qualities. |
5615 | 0 | uint8_t amul, bmul; |
5616 | 0 | if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { |
5617 | 0 | amul = 1; |
5618 | 0 | bmul = 0; |
5619 | 0 | } else { |
5620 | 0 | amul = 0; |
5621 | 0 | bmul = 1; |
5622 | 0 | } |
5623 | | |
5624 | | // Loop over the overlapping region nulling qualities in either |
5625 | | // seq a or b. |
5626 | 0 | int err = 0; |
5627 | 0 | while ( 1 ) |
5628 | 0 | { |
5629 | | // Step to next matching reference position in a and b |
5630 | 0 | while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) |
5631 | 0 | a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, |
5632 | 0 | &a_icig, &a_iseq, &a_iref); |
5633 | 0 | if ( a_ret<0 ) { // done |
5634 | 0 | err = a_ret<-1?-1:0; |
5635 | 0 | break; |
5636 | 0 | } |
5637 | 0 | if ( iref < a_iref + a->core.pos ) |
5638 | 0 | iref = a_iref + a->core.pos; |
5639 | |
|
5640 | 0 | while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) |
5641 | 0 | b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, |
5642 | 0 | &b_iseq, &b_iref); |
5643 | 0 | if ( b_ret<0 ) { // done |
5644 | 0 | err = b_ret<-1?-1:0; |
5645 | 0 | break; |
5646 | 0 | } |
5647 | 0 | if ( iref < b_iref + b->core.pos ) |
5648 | 0 | iref = b_iref + b->core.pos; |
5649 | |
|
5650 | 0 | iref++; |
5651 | |
|
5652 | 0 | if ( a_iref+a->core.pos != b_iref+b->core.pos ) |
5653 | | // only CMATCH positions, don't know what to do with indels |
5654 | 0 | continue; |
5655 | | |
5656 | 0 | if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) |
5657 | | // Fell off end of sequence, bad CIGAR? |
5658 | 0 | return -1; |
5659 | | |
5660 | | // We're finally at the same ref base in both a and b. |
5661 | | // Check if the bases match (confident) or mismatch |
5662 | | // (not so confident). |
5663 | 0 | if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { |
5664 | | // We are very confident about this base. Use sum of quals |
5665 | 0 | int qual = a_qual[a_iseq] + b_qual[b_iseq]; |
5666 | 0 | a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); |
5667 | 0 | b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; |
5668 | 0 | } else { |
5669 | | // Not so confident about anymore given the mismatch. |
5670 | | // Reduce qual for lowest quality base. |
5671 | 0 | if ( a_qual[a_iseq] > b_qual[b_iseq] ) { |
5672 | | // A highest qual base; keep |
5673 | 0 | a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; |
5674 | 0 | b_qual[b_iseq] = 0; |
5675 | 0 | } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { |
5676 | | // B highest qual base; keep |
5677 | 0 | b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; |
5678 | 0 | a_qual[a_iseq] = 0; |
5679 | 0 | } else { |
5680 | | // Both equal, so pick randomly |
5681 | 0 | a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; |
5682 | 0 | b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; |
5683 | 0 | } |
5684 | 0 | } |
5685 | 0 | } |
5686 | | |
5687 | 0 | return err; |
5688 | 0 | } |
5689 | | |
5690 | | // Fix overlapping reads. Simple soft-clipping did not give good results. |
5691 | | // Lowering qualities of unwanted bases is more selective and works better. |
5692 | | // |
5693 | | // Returns 0 on success, -1 on failure |
5694 | | static int overlap_push(bam_plp_t iter, lbnode_t *node) |
5695 | 0 | { |
5696 | 0 | if ( !iter->overlaps ) return 0; |
5697 | | |
5698 | | // mapped mates and paired reads only |
5699 | 0 | if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0; |
5700 | | |
5701 | | // no overlap possible, unless some wild cigar |
5702 | 0 | if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid) |
5703 | 0 | || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq |
5704 | 0 | && node->b.core.mpos >= node->end) // for those wild cigars |
5705 | 0 | ) return 0; |
5706 | | |
5707 | 0 | khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b)); |
5708 | 0 | if ( kitr==kh_end(iter->overlaps) ) |
5709 | 0 | { |
5710 | | // Only add reads where the mate is still to arrive |
5711 | 0 | if (node->b.core.mpos >= node->b.core.pos || |
5712 | 0 | ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) { |
5713 | 0 | int ret; |
5714 | 0 | kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret); |
5715 | 0 | if (ret < 0) return -1; |
5716 | 0 | kh_value(iter->overlaps, kitr) = node; |
5717 | 0 | } |
5718 | 0 | } |
5719 | 0 | else |
5720 | 0 | { |
5721 | 0 | lbnode_t *a = kh_value(iter->overlaps, kitr); |
5722 | 0 | int err = tweak_overlap_quality(&a->b, &node->b); |
5723 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
5724 | 0 | assert(a->end-1 == a->s.end); |
5725 | 0 | return err; |
5726 | 0 | } |
5727 | 0 | return 0; |
5728 | 0 | } |
5729 | | |
5730 | | static void overlap_remove(bam_plp_t iter, const bam1_t *b) |
5731 | 0 | { |
5732 | 0 | if ( !iter->overlaps ) return; |
5733 | | |
5734 | 0 | khiter_t kitr; |
5735 | 0 | if ( b ) |
5736 | 0 | { |
5737 | 0 | kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b)); |
5738 | 0 | if ( kitr!=kh_end(iter->overlaps) ) |
5739 | 0 | kh_del(olap_hash, iter->overlaps, kitr); |
5740 | 0 | } |
5741 | 0 | else |
5742 | 0 | { |
5743 | | // remove all |
5744 | 0 | for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++) |
5745 | 0 | if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr); |
5746 | 0 | } |
5747 | 0 | } |
5748 | | |
5749 | | |
5750 | | |
5751 | | // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns |
5752 | | // pointer to the piled records if next position is ready or NULL if there is not enough records in the |
5753 | | // buffer yet (the current position is still the maximum position across all buffered reads). |
5754 | | const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
5755 | 0 | { |
5756 | 0 | if (iter->error) { *_n_plp = -1; return NULL; } |
5757 | 0 | *_n_plp = 0; |
5758 | 0 | if (iter->is_eof && iter->head == iter->tail) return NULL; |
5759 | 0 | while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) { |
5760 | 0 | int n_plp = 0; |
5761 | | // write iter->plp at iter->pos |
5762 | 0 | lbnode_t **pptr = &iter->head; |
5763 | 0 | while (*pptr != iter->tail) { |
5764 | 0 | lbnode_t *p = *pptr; |
5765 | 0 | if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove |
5766 | 0 | overlap_remove(iter, &p->b); |
5767 | 0 | if (iter->plp_destruct) |
5768 | 0 | iter->plp_destruct(iter->data, &p->b, &p->cd); |
5769 | 0 | *pptr = p->next; mp_free(iter->mp, p); |
5770 | 0 | } |
5771 | 0 | else { |
5772 | 0 | if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup |
5773 | 0 | if (n_plp == iter->max_plp) { // then double the capacity |
5774 | 0 | iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256; |
5775 | 0 | iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp); |
5776 | 0 | } |
5777 | 0 | iter->plp[n_plp].b = &p->b; |
5778 | 0 | iter->plp[n_plp].cd = p->cd; |
5779 | 0 | if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true... |
5780 | 0 | } |
5781 | 0 | pptr = &(*pptr)->next; |
5782 | 0 | } |
5783 | 0 | } |
5784 | 0 | *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos; |
5785 | | // update iter->tid and iter->pos |
5786 | 0 | if (iter->head != iter->tail) { |
5787 | 0 | if (iter->tid > iter->head->b.core.tid) { |
5788 | 0 | hts_log_error("Unsorted input. Pileup aborts"); |
5789 | 0 | iter->error = 1; |
5790 | 0 | *_n_plp = -1; |
5791 | 0 | return NULL; |
5792 | 0 | } |
5793 | 0 | } |
5794 | 0 | if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence |
5795 | 0 | iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference |
5796 | 0 | } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid |
5797 | 0 | iter->pos = iter->head->beg; // jump to the next position |
5798 | 0 | } else ++iter->pos; // scan contiguously |
5799 | | // return |
5800 | 0 | if (n_plp) return iter->plp; |
5801 | 0 | if (iter->is_eof && iter->head == iter->tail) break; |
5802 | 0 | } |
5803 | 0 | return NULL; |
5804 | 0 | } |
5805 | | |
5806 | | const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
5807 | 0 | { |
5808 | 0 | hts_pos_t pos64 = 0; |
5809 | 0 | const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); |
5810 | 0 | if (pos64 < INT_MAX) { |
5811 | 0 | *_pos = pos64; |
5812 | 0 | } else { |
5813 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
5814 | 0 | *_pos = INT_MAX; |
5815 | 0 | iter->error = 1; |
5816 | 0 | *_n_plp = -1; |
5817 | 0 | return NULL; |
5818 | 0 | } |
5819 | 0 | return p; |
5820 | 0 | } |
5821 | | |
5822 | | int bam_plp_push(bam_plp_t iter, const bam1_t *b) |
5823 | 0 | { |
5824 | 0 | if (iter->error) return -1; |
5825 | 0 | if (b) { |
5826 | 0 | if (b->core.tid < 0) { overlap_remove(iter, b); return 0; } |
5827 | | // Skip only unmapped reads here, any additional filtering must be done in iter->func |
5828 | 0 | if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; } |
5829 | 0 | if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt) |
5830 | 0 | { |
5831 | 0 | overlap_remove(iter, b); |
5832 | 0 | return 0; |
5833 | 0 | } |
5834 | 0 | if (bam_copy1(&iter->tail->b, b) == NULL) |
5835 | 0 | return -1; |
5836 | 0 | iter->tail->b.id = iter->id++; |
5837 | 0 | iter->tail->beg = b->core.pos; |
5838 | | // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1 |
5839 | 0 | iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
5840 | 0 | iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t |
5841 | 0 | if (b->core.tid < iter->max_tid) { |
5842 | 0 | hts_log_error("The input is not sorted (chromosomes out of order)"); |
5843 | 0 | iter->error = 1; |
5844 | 0 | return -1; |
5845 | 0 | } |
5846 | 0 | if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) { |
5847 | 0 | hts_log_error("The input is not sorted (reads out of order)"); |
5848 | 0 | iter->error = 1; |
5849 | 0 | return -1; |
5850 | 0 | } |
5851 | 0 | iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg; |
5852 | 0 | if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) { |
5853 | 0 | lbnode_t *next = mp_alloc(iter->mp); |
5854 | 0 | if (!next) { |
5855 | 0 | iter->error = 1; |
5856 | 0 | return -1; |
5857 | 0 | } |
5858 | 0 | if (iter->plp_construct) { |
5859 | 0 | if (iter->plp_construct(iter->data, &iter->tail->b, |
5860 | 0 | &iter->tail->cd) < 0) { |
5861 | 0 | mp_free(iter->mp, next); |
5862 | 0 | iter->error = 1; |
5863 | 0 | return -1; |
5864 | 0 | } |
5865 | 0 | } |
5866 | 0 | if (overlap_push(iter, iter->tail) < 0) { |
5867 | 0 | mp_free(iter->mp, next); |
5868 | 0 | iter->error = 1; |
5869 | 0 | return -1; |
5870 | 0 | } |
5871 | 0 | iter->tail->next = next; |
5872 | 0 | iter->tail = iter->tail->next; |
5873 | 0 | } |
5874 | 0 | } else iter->is_eof = 1; |
5875 | 0 | return 0; |
5876 | 0 | } |
5877 | | |
5878 | | const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) |
5879 | 0 | { |
5880 | 0 | const bam_pileup1_t *plp; |
5881 | 0 | if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } |
5882 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
5883 | 0 | else { // no pileup line can be obtained; read alignments |
5884 | 0 | *_n_plp = 0; |
5885 | 0 | if (iter->is_eof) return 0; |
5886 | 0 | int ret; |
5887 | 0 | while ( (ret=iter->func(iter->data, iter->b)) >= 0) { |
5888 | 0 | if (bam_plp_push(iter, iter->b) < 0) { |
5889 | 0 | *_n_plp = -1; |
5890 | 0 | return 0; |
5891 | 0 | } |
5892 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
5893 | | // otherwise no pileup line can be returned; read the next alignment. |
5894 | 0 | } |
5895 | 0 | if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } |
5896 | 0 | if (bam_plp_push(iter, 0) < 0) { |
5897 | 0 | *_n_plp = -1; |
5898 | 0 | return 0; |
5899 | 0 | } |
5900 | 0 | if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; |
5901 | 0 | return 0; |
5902 | 0 | } |
5903 | 0 | } |
5904 | | |
5905 | | const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) |
5906 | 0 | { |
5907 | 0 | hts_pos_t pos64 = 0; |
5908 | 0 | const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); |
5909 | 0 | if (pos64 < INT_MAX) { |
5910 | 0 | *_pos = pos64; |
5911 | 0 | } else { |
5912 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
5913 | 0 | *_pos = INT_MAX; |
5914 | 0 | iter->error = 1; |
5915 | 0 | *_n_plp = -1; |
5916 | 0 | return NULL; |
5917 | 0 | } |
5918 | 0 | return p; |
5919 | 0 | } |
5920 | | |
5921 | | void bam_plp_reset(bam_plp_t iter) |
5922 | 0 | { |
5923 | 0 | overlap_remove(iter, NULL); |
5924 | 0 | iter->max_tid = iter->max_pos = -1; |
5925 | 0 | iter->tid = iter->pos = 0; |
5926 | 0 | iter->is_eof = 0; |
5927 | 0 | while (iter->head != iter->tail) { |
5928 | 0 | lbnode_t *p = iter->head; |
5929 | 0 | iter->head = p->next; |
5930 | 0 | mp_free(iter->mp, p); |
5931 | 0 | } |
5932 | 0 | } |
5933 | | |
5934 | | void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) |
5935 | 0 | { |
5936 | 0 | iter->maxcnt = maxcnt; |
5937 | 0 | } |
5938 | | |
5939 | | /************************ |
5940 | | *** Mpileup iterator *** |
5941 | | ************************/ |
5942 | | |
5943 | | struct bam_mplp_s { |
5944 | | int n; |
5945 | | int32_t min_tid, *tid; |
5946 | | hts_pos_t min_pos, *pos; |
5947 | | bam_plp_t *iter; |
5948 | | int *n_plp; |
5949 | | const bam_pileup1_t **plp; |
5950 | | }; |
5951 | | |
5952 | | bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) |
5953 | 0 | { |
5954 | 0 | int i; |
5955 | 0 | bam_mplp_t iter; |
5956 | 0 | iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s)); |
5957 | 0 | iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); |
5958 | 0 | iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); |
5959 | 0 | iter->n_plp = (int*)calloc(n, sizeof(int)); |
5960 | 0 | iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); |
5961 | 0 | iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); |
5962 | 0 | iter->n = n; |
5963 | 0 | iter->min_pos = HTS_POS_MAX; |
5964 | 0 | iter->min_tid = (uint32_t)-1; |
5965 | 0 | for (i = 0; i < n; ++i) { |
5966 | 0 | iter->iter[i] = bam_plp_init(func, data[i]); |
5967 | 0 | iter->pos[i] = iter->min_pos; |
5968 | 0 | iter->tid[i] = iter->min_tid; |
5969 | 0 | } |
5970 | 0 | return iter; |
5971 | 0 | } |
5972 | | |
5973 | | int bam_mplp_init_overlaps(bam_mplp_t iter) |
5974 | 0 | { |
5975 | 0 | int i, r = 0; |
5976 | 0 | for (i = 0; i < iter->n; ++i) |
5977 | 0 | r |= bam_plp_init_overlaps(iter->iter[i]); |
5978 | 0 | return r == 0 ? 0 : -1; |
5979 | 0 | } |
5980 | | |
5981 | | void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt) |
5982 | 0 | { |
5983 | 0 | int i; |
5984 | 0 | for (i = 0; i < iter->n; ++i) |
5985 | 0 | iter->iter[i]->maxcnt = maxcnt; |
5986 | 0 | } |
5987 | | |
5988 | | void bam_mplp_destroy(bam_mplp_t iter) |
5989 | 0 | { |
5990 | 0 | int i; |
5991 | 0 | for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); |
5992 | 0 | free(iter->iter); free(iter->pos); free(iter->tid); |
5993 | 0 | free(iter->n_plp); free(iter->plp); |
5994 | 0 | free(iter); |
5995 | 0 | } |
5996 | | |
5997 | | int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) |
5998 | 0 | { |
5999 | 0 | int i, ret = 0; |
6000 | 0 | hts_pos_t new_min_pos = HTS_POS_MAX; |
6001 | 0 | uint32_t new_min_tid = (uint32_t)-1; |
6002 | 0 | for (i = 0; i < iter->n; ++i) { |
6003 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6004 | 0 | int tid; |
6005 | 0 | hts_pos_t pos; |
6006 | 0 | iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); |
6007 | 0 | if ( iter->iter[i]->error ) return -1; |
6008 | 0 | if (iter->plp[i]) { |
6009 | 0 | iter->tid[i] = tid; |
6010 | 0 | iter->pos[i] = pos; |
6011 | 0 | } else { |
6012 | 0 | iter->tid[i] = 0; |
6013 | 0 | iter->pos[i] = 0; |
6014 | 0 | } |
6015 | 0 | } |
6016 | 0 | if (iter->plp[i]) { |
6017 | 0 | if (iter->tid[i] < new_min_tid) { |
6018 | 0 | new_min_tid = iter->tid[i]; |
6019 | 0 | new_min_pos = iter->pos[i]; |
6020 | 0 | } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) { |
6021 | 0 | new_min_pos = iter->pos[i]; |
6022 | 0 | } |
6023 | 0 | } |
6024 | 0 | } |
6025 | 0 | iter->min_pos = new_min_pos; |
6026 | 0 | iter->min_tid = new_min_tid; |
6027 | 0 | if (new_min_pos == HTS_POS_MAX) return 0; |
6028 | 0 | *_tid = new_min_tid; *_pos = new_min_pos; |
6029 | 0 | for (i = 0; i < iter->n; ++i) { |
6030 | 0 | if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { |
6031 | 0 | n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; |
6032 | 0 | ++ret; |
6033 | 0 | } else n_plp[i] = 0, plp[i] = 0; |
6034 | 0 | } |
6035 | 0 | return ret; |
6036 | 0 | } |
6037 | | |
6038 | | int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) |
6039 | 0 | { |
6040 | 0 | hts_pos_t pos64 = 0; |
6041 | 0 | int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); |
6042 | 0 | if (ret >= 0) { |
6043 | 0 | if (pos64 < INT_MAX) { |
6044 | 0 | *_pos = pos64; |
6045 | 0 | } else { |
6046 | 0 | hts_log_error("Position %"PRId64" too large", pos64); |
6047 | 0 | *_pos = INT_MAX; |
6048 | 0 | return -1; |
6049 | 0 | } |
6050 | 0 | } |
6051 | 0 | return ret; |
6052 | 0 | } |
6053 | | |
6054 | | void bam_mplp_reset(bam_mplp_t iter) |
6055 | 0 | { |
6056 | 0 | int i; |
6057 | 0 | iter->min_pos = HTS_POS_MAX; |
6058 | 0 | iter->min_tid = (uint32_t)-1; |
6059 | 0 | for (i = 0; i < iter->n; ++i) { |
6060 | 0 | bam_plp_reset(iter->iter[i]); |
6061 | 0 | iter->pos[i] = HTS_POS_MAX; |
6062 | 0 | iter->tid[i] = (uint32_t)-1; |
6063 | 0 | iter->n_plp[i] = 0; |
6064 | 0 | iter->plp[i] = NULL; |
6065 | 0 | } |
6066 | 0 | } |
6067 | | |
6068 | | void bam_mplp_constructor(bam_mplp_t iter, |
6069 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6070 | 0 | int i; |
6071 | 0 | for (i = 0; i < iter->n; ++i) |
6072 | 0 | bam_plp_constructor(iter->iter[i], func); |
6073 | 0 | } |
6074 | | |
6075 | | void bam_mplp_destructor(bam_mplp_t iter, |
6076 | 0 | int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) { |
6077 | 0 | int i; |
6078 | 0 | for (i = 0; i < iter->n; ++i) |
6079 | 0 | bam_plp_destructor(iter->iter[i], func); |
6080 | 0 | } |
6081 | | |
6082 | | #endif // ~!defined(BAM_NO_PILEUP) |
6083 | | |
6084 | | // --------------------------- |
6085 | | // Base Modification retrieval |
6086 | | // |
6087 | | // These operate by recording state in an opaque type, allocated and freed |
6088 | | // via the functions below. |
6089 | | // |
6090 | | // Initially we call bam_parse_basemod to process the tags and record the |
6091 | | // modifications in the state structure, and then functions such as |
6092 | | // bam_next_basemod can iterate over this cached state. |
6093 | | |
6094 | | /* |
6095 | | * Base modifications are stored in MM/Mm tags as <mod_list> defined as |
6096 | | * |
6097 | | * <mod_list> ::= <mod_chain><mod_list> | "" |
6098 | | * <mod_chain> ::= <canonical_base><strand><mod-list><delta-list> |
6099 | | * |
6100 | | * <canonical_base> ::= "A" | "C" | "G" | "T" | "N". |
6101 | | * |
6102 | | * <strand> ::= "+" | "-". |
6103 | | * |
6104 | | * <mod-list> ::= <simple-mod-list> | <ChEBI-code> |
6105 | | * <simple-mod-list> ::= <simple-mod><simple-mod-list> | <simple-mod> |
6106 | | * <ChEBI-code> ::= <integer> |
6107 | | * <simple-mod> ::= <letter> |
6108 | | * |
6109 | | * <delta-list> ::= "," <integer> <delta-list> | ";" |
6110 | | * |
6111 | | * We do not allocate additional memory other than the fixed size |
6112 | | * state, thus we track up to 256 pointers to different locations |
6113 | | * within the MM and ML tags. Each pointer is for a distinct |
6114 | | * modification code (simple or ChEBI), meaning some may point to the |
6115 | | * same delta-list when multiple codes are combined together |
6116 | | * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. |
6117 | | * |
6118 | | * Each numeric in the delta-list is tracked in MMcount[], counted |
6119 | | * down until it hits zero in which case the next delta is fetched. |
6120 | | * |
6121 | | * ML array similarly holds the locations in the quality (ML) tag per |
6122 | | * type, but these are interleaved so C+mhfc,10,15 will have 4 types |
6123 | | * all pointing to the same delta position, but in ML we store |
6124 | | * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML |
6125 | | * also has MLstride indicating how many positions along ML to jump |
6126 | | * each time we consume a base. (4 in our above example, but usually 1 |
6127 | | * for the simple case). |
6128 | | * |
6129 | | * One complexity of the base modification system is that mods are |
6130 | | * always stored in the original DNA orientation. This is so that |
6131 | | * tools that may reverse-complement a sequence (eg "samtools fastq -T |
6132 | | * MM,ML") can pass through these modification tags irrespective of |
6133 | | * whether they have any knowledge of their internal workings. |
6134 | | * |
6135 | | * Because we don't wish to allocate extra memory, we cannot simply |
6136 | | * reverse the MM and ML tags. Sadly this means we have to manage the |
6137 | | * reverse complementing ourselves on-the-fly. |
6138 | | * For reversed reads we start at the right end of MM and no longer |
6139 | | * stop at the semicolon. Instead we use MMend[] array to mark the |
6140 | | * termination point. |
6141 | | */ |
#define MAX_BASE_MOD 256

/*
 * Parsed view of one record's MM/ML tags.  Entry i of every array
 * describes one modification code; nmods entries are in use.
 */
struct hts_base_mod_state {
    // Modification code: the character itself, or minus the ChEBI number.
    int type[MAX_BASE_MOD];
    // Canonical base as a 4-bit sequence code (1,2,4,8,15).
    int canonical[MAX_BASE_MOD];
    // Strand of the modification: 0 for '+', 1 for '-'.
    char strand[MAX_BASE_MOD];
    // Canonical bases still to pass before the next modification.
    int MMcount[MAX_BASE_MOD];
    // Position of the next delta within the MM string.
    char *MM[MAX_BASE_MOD];
    // End of the delta string (set for reverse-strand records only).
    char *MMend[MAX_BASE_MOD];
    // Next quality value within the ML array, or NULL if there is no ML.
    uint8_t *ML[MAX_BASE_MOD];
    // Distance in bytes between successive qualities of this type.
    int MLstride[MAX_BASE_MOD];
    // Non-zero if unlisted positions count as unmodified.
    int implicit[MAX_BASE_MOD];
    // Current position along the sequence.
    int seq_pos;
    // Number of array entries in use (0 to MAX_BASE_MOD-1).
    int nmods;
};
6156 | | |
6157 | 0 | hts_base_mod_state *hts_base_mod_state_alloc(void) { |
6158 | 0 | return calloc(1, sizeof(hts_base_mod_state)); |
6159 | 0 | } |
6160 | | |
6161 | 0 | void hts_base_mod_state_free(hts_base_mod_state *state) { |
6162 | 0 | free(state); |
6163 | 0 | } |
6164 | | |
6165 | | /* |
6166 | | * Count frequency of A, C, G, T and N canonical bases in the sequence |
6167 | | */ |
6168 | 0 | static void seq_freq(const bam1_t *b, int freq[16]) { |
6169 | 0 | int i; |
6170 | |
|
6171 | 0 | memset(freq, 0, 16*sizeof(*freq)); |
6172 | 0 | uint8_t *seq = bam_get_seq(b); |
6173 | 0 | for (i = 0; i < b->core.l_qseq; i++) |
6174 | 0 | freq[bam_seqi(seq, i)]++; |
6175 | 0 | freq[15] = b->core.l_qseq; // all bases count as N for base mods |
6176 | 0 | } |
6177 | | |
// Complement lookup for 4-bit base codes.
//
// code        0123456789ABCDEF
// base        =ACMGRSVTWYHKDBN   aka seq_nt16_str[]
// complement  =TGKCYSBAWRDMHVN   complement of seq_nt16_str
// result      084C2A6E195D3B7F   (hex; the values below in decimal)
static int seqi_rc[] = {
     0,  8,  4, 12,
     2, 10,  6, 14,
     1,  9,  5, 13,
     3, 11,  7, 15
};
6183 | | |
6184 | | /* |
6185 | | * Parse the MM and ML tags to populate the base mod state. |
6186 | | * This structure will have been previously allocated via |
6187 | | * hts_base_mod_state_alloc, but it does not need to be repeatedly |
6188 | | * freed and allocated for each new bam record. (Although obviously |
6189 | | * it requires a new call to this function.) |
6190 | | * |
6191 | | */ |
6192 | 0 | int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { |
6193 | | // Read MM and ML tags |
6194 | 0 | uint8_t *mm = bam_aux_get(b, "MM"); |
6195 | 0 | if (!mm) mm = bam_aux_get(b, "Mm"); |
6196 | 0 | if (!mm) |
6197 | 0 | return 0; |
6198 | 0 | if (mm[0] != 'Z') { |
6199 | 0 | hts_log_error("MM tag is not of type Z"); |
6200 | 0 | return -1; |
6201 | 0 | } |
6202 | | |
6203 | 0 | uint8_t *ml = bam_aux_get(b, "ML"); |
6204 | 0 | if (!ml) ml = bam_aux_get(b, "Ml"); |
6205 | 0 | if (ml && (ml[0] != 'B' || ml[1] != 'C')) { |
6206 | 0 | hts_log_error("ML tag is not of type B,C"); |
6207 | 0 | return -1; |
6208 | 0 | } |
6209 | 0 | uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; |
6210 | 0 | if (ml) ml += 6; |
6211 | |
|
6212 | 0 | state->seq_pos = 0; |
6213 | | |
6214 | | // Aggregate freqs of ACGTN if reversed, to get final-delta (later) |
6215 | 0 | int freq[16]; |
6216 | 0 | if (b->core.flag & BAM_FREVERSE) |
6217 | 0 | seq_freq(b, freq); |
6218 | |
|
6219 | 0 | char *cp = (char *)mm+1; |
6220 | 0 | int mod_num = 0; |
6221 | 0 | int implicit = 1; |
6222 | 0 | while (*cp) { |
6223 | 0 | for (; *cp; cp++) { |
6224 | | // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; |
6225 | 0 | unsigned char btype = *cp++; |
6226 | |
|
6227 | 0 | if (btype != 'A' && btype != 'C' && |
6228 | 0 | btype != 'G' && btype != 'T' && |
6229 | 0 | btype != 'U' && btype != 'N') |
6230 | 0 | return -1; |
6231 | 0 | if (btype == 'U') btype = 'T'; |
6232 | |
|
6233 | 0 | btype = seq_nt16_table[btype]; |
6234 | | |
6235 | | // Strand |
6236 | 0 | if (*cp != '+' && *cp != '-') |
6237 | 0 | return -1; // malformed |
6238 | 0 | char strand = *cp++; |
6239 | | |
6240 | | // List of modification types |
6241 | 0 | char *ms = cp, *me; // mod code start and end |
6242 | 0 | char *cp_end = NULL; |
6243 | 0 | int chebi = 0; |
6244 | 0 | if (isdigit_c(*cp)) { |
6245 | 0 | chebi = strtol(cp, &cp_end, 10); |
6246 | 0 | cp = cp_end; |
6247 | 0 | ms = cp-1; |
6248 | 0 | } else { |
6249 | 0 | while (*cp && isalpha_c(*cp)) |
6250 | 0 | cp++; |
6251 | 0 | if (*cp == '\0') |
6252 | 0 | return -1; |
6253 | 0 | } |
6254 | | |
6255 | 0 | me = cp; |
6256 | | |
6257 | | // Optional explicit vs implicit marker |
6258 | 0 | if (*cp == '.') { |
6259 | | // default is implicit = 1; |
6260 | 0 | cp++; |
6261 | 0 | } else if (*cp == '?') { |
6262 | 0 | implicit = 0; |
6263 | 0 | cp++; |
6264 | 0 | } else if (*cp != ',' && *cp != ';') { |
6265 | | // parse error |
6266 | 0 | return -1; |
6267 | 0 | } |
6268 | | |
6269 | 0 | long delta; |
6270 | 0 | int n = 0; // nth symbol in a multi-mod string |
6271 | 0 | int stride = me-ms; |
6272 | 0 | int ndelta = 0; |
6273 | |
|
6274 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6275 | | // We process the sequence in left to right order, |
6276 | | // but delta is successive count of bases to skip |
6277 | | // counting right to left. This also means the number |
6278 | | // of bases to skip at left edge is unrecorded (as it's |
6279 | | // the remainder). |
6280 | | // |
6281 | | // To output mods in left to right, we step through the |
6282 | | // MM list in reverse and need to identify the left-end |
6283 | | // "remainder" delta. |
6284 | 0 | int total_seq = 0; |
6285 | 0 | for (;;) { |
6286 | 0 | cp += (*cp == ','); |
6287 | 0 | if (*cp == 0 || *cp == ';') |
6288 | 0 | break; |
6289 | | |
6290 | 0 | delta = strtol(cp, &cp_end, 10); |
6291 | 0 | if (cp_end == cp) { |
6292 | 0 | hts_log_error("Hit end of MM tag. Missing semicolon?"); |
6293 | 0 | return -1; |
6294 | 0 | } |
6295 | | |
6296 | 0 | cp = cp_end; |
6297 | 0 | total_seq += delta+1; |
6298 | 0 | ndelta++; |
6299 | 0 | } |
6300 | 0 | delta = freq[seqi_rc[btype]] - total_seq; // remainder |
6301 | 0 | } else { |
6302 | 0 | delta = *cp == ',' |
6303 | 0 | ? strtol(cp+1, &cp_end, 10) |
6304 | 0 | : 0; |
6305 | 0 | if (!cp_end) { |
6306 | | // empty list |
6307 | 0 | delta = INT_MAX; |
6308 | 0 | cp_end = cp+1; |
6309 | 0 | } |
6310 | 0 | } |
6311 | | // Now delta is first in list or computed remainder, |
6312 | | // and cp_end is either start or end of the MM list. |
6313 | 0 | while (ms < me) { |
6314 | 0 | state->type [mod_num] = chebi ? -chebi : *ms; |
6315 | 0 | state->strand [mod_num] = (strand == '-'); |
6316 | 0 | state->canonical[mod_num] = btype; |
6317 | 0 | state->MLstride [mod_num] = stride; |
6318 | 0 | state->implicit [mod_num] = implicit; |
6319 | |
|
6320 | 0 | if (delta < 0) { |
6321 | 0 | hts_log_error("MM tag refers to bases beyond sequence " |
6322 | 0 | "length"); |
6323 | 0 | return -1; |
6324 | 0 | } |
6325 | 0 | state->MMcount [mod_num] = delta; |
6326 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6327 | 0 | state->MM [mod_num] = cp+1; |
6328 | 0 | state->MMend[mod_num] = cp_end; |
6329 | 0 | state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; |
6330 | 0 | } else { |
6331 | 0 | state->MM [mod_num] = cp_end; |
6332 | 0 | state->MMend[mod_num] = NULL; |
6333 | 0 | state->ML [mod_num] = ml ? ml+n : NULL; |
6334 | 0 | } |
6335 | |
|
6336 | 0 | if (++mod_num >= MAX_BASE_MOD) { |
6337 | 0 | hts_log_error("Too many base modification types"); |
6338 | 0 | return -1; |
6339 | 0 | } |
6340 | 0 | ms++; n++; |
6341 | 0 | } |
6342 | | |
6343 | | // Skip modification deltas |
6344 | 0 | if (ml) { |
6345 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6346 | 0 | ml += ndelta*stride; |
6347 | 0 | } else { |
6348 | 0 | while (*cp && *cp != ';') { |
6349 | 0 | if (*cp == ',') |
6350 | 0 | ml+=stride; |
6351 | 0 | cp++; |
6352 | 0 | } |
6353 | 0 | } |
6354 | 0 | if (ml > ml_end) { |
6355 | 0 | hts_log_error("Insufficient number of entries in ML tag"); |
6356 | 0 | return -1; |
6357 | 0 | } |
6358 | 0 | } else { |
6359 | | // cp_end already known if FREVERSE |
6360 | 0 | if (cp_end && (b->core.flag & BAM_FREVERSE)) |
6361 | 0 | cp = cp_end; |
6362 | 0 | else |
6363 | 0 | while (*cp && *cp != ';') |
6364 | 0 | cp++; |
6365 | 0 | } |
6366 | 0 | if (!*cp) { |
6367 | 0 | hts_log_error("Hit end of MM tag. Missing semicolon?"); |
6368 | 0 | return -1; |
6369 | 0 | } |
6370 | 0 | } |
6371 | 0 | } |
6372 | | |
6373 | 0 | state->nmods = mod_num; |
6374 | |
|
6375 | 0 | return 0; |
6376 | 0 | } |
6377 | | |
6378 | | /* |
6379 | | * Fills out mods[] with the base modifications found. |
6380 | | * Returns the number found (0 if none), which may be more than |
6381 | | * the size of n_mods if more were found than reported. |
6382 | | * Returns <= -1 on error. |
6383 | | * |
6384 | | * This always marches left to right along sequence, irrespective of |
6385 | | * reverse flag or modification strand. |
6386 | | */ |
6387 | | int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, |
6388 | 0 | hts_base_mod *mods, int n_mods) { |
6389 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6390 | 0 | if (state->seq_pos < 0) |
6391 | 0 | return -1; |
6392 | 0 | } else { |
6393 | 0 | if (state->seq_pos >= b->core.l_qseq) |
6394 | 0 | return -1; |
6395 | 0 | } |
6396 | | |
6397 | 0 | int i, j, n = 0; |
6398 | 0 | unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); |
6399 | 0 | state->seq_pos++; |
6400 | 0 | if (b->core.flag & BAM_FREVERSE) |
6401 | 0 | base = seqi_rc[base]; |
6402 | |
|
6403 | 0 | for (i = 0; i < state->nmods; i++) { |
6404 | 0 | if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) |
6405 | 0 | continue; |
6406 | | |
6407 | 0 | if (state->MMcount[i]-- > 0) |
6408 | 0 | continue; |
6409 | | |
6410 | 0 | char *MMptr = state->MM[i]; |
6411 | 0 | if (n < n_mods) { |
6412 | 0 | mods[n].modified_base = state->type[i]; |
6413 | 0 | mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; |
6414 | 0 | mods[n].strand = state->strand[i]; |
6415 | 0 | mods[n].qual = state->ML[i] ? *state->ML[i] : -1; |
6416 | 0 | } |
6417 | 0 | n++; |
6418 | 0 | if (state->ML[i]) |
6419 | 0 | state->ML[i] += (b->core.flag & BAM_FREVERSE) |
6420 | 0 | ? -state->MLstride[i] |
6421 | 0 | : +state->MLstride[i]; |
6422 | |
|
6423 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6424 | | // process MM list backwards |
6425 | 0 | char *cp; |
6426 | 0 | for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) |
6427 | 0 | if (*cp == ',') |
6428 | 0 | break; |
6429 | 0 | state->MMend[i] = cp; |
6430 | 0 | if (cp != state->MM[i]) |
6431 | 0 | state->MMcount[i] = strtol(cp+1, NULL, 10); |
6432 | 0 | else |
6433 | 0 | state->MMcount[i] = INT_MAX; |
6434 | 0 | } else { |
6435 | 0 | if (*state->MM[i] == ',') |
6436 | 0 | state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); |
6437 | 0 | else |
6438 | 0 | state->MMcount[i] = INT_MAX; |
6439 | 0 | } |
6440 | | |
6441 | | // Multiple mods at the same coords. |
6442 | 0 | for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { |
6443 | 0 | if (n < n_mods) { |
6444 | 0 | mods[n].modified_base = state->type[j]; |
6445 | 0 | mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; |
6446 | 0 | mods[n].strand = state->strand[j]; |
6447 | 0 | mods[n].qual = state->ML[j] ? *state->ML[j] : -1; |
6448 | 0 | } |
6449 | 0 | n++; |
6450 | 0 | state->MMcount[j] = state->MMcount[i]; |
6451 | 0 | state->MM[j] = state->MM[i]; |
6452 | 0 | if (state->ML[j]) |
6453 | 0 | state->ML[j] += (b->core.flag & BAM_FREVERSE) |
6454 | 0 | ? -state->MLstride[j] |
6455 | 0 | : +state->MLstride[j]; |
6456 | 0 | } |
6457 | 0 | i = j-1; |
6458 | 0 | } |
6459 | |
|
6460 | 0 | return n; |
6461 | 0 | } |
6462 | | |
6463 | | /* |
6464 | | * Looks for the next location with a base modification. |
6465 | | */ |
6466 | | int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, |
6467 | 0 | hts_base_mod *mods, int n_mods, int *pos) { |
6468 | 0 | if (state->seq_pos >= b->core.l_qseq) |
6469 | 0 | return 0; |
6470 | | |
6471 | | // Look through state->MMcount arrays to see when the next lowest is |
6472 | | // per base type; |
6473 | 0 | int next[16], freq[16] = {0}, i; |
6474 | 0 | memset(next, 0x7f, 16*sizeof(*next)); |
6475 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6476 | 0 | for (i = 0; i < state->nmods; i++) { |
6477 | 0 | if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) |
6478 | 0 | next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; |
6479 | 0 | } |
6480 | 0 | } else { |
6481 | 0 | for (i = 0; i < state->nmods; i++) { |
6482 | 0 | if (next[state->canonical[i]] > state->MMcount[i]) |
6483 | 0 | next[state->canonical[i]] = state->MMcount[i]; |
6484 | 0 | } |
6485 | 0 | } |
6486 | | |
6487 | | // Now step through the sequence counting off base types. |
6488 | 0 | for (i = state->seq_pos; i < b->core.l_qseq; i++) { |
6489 | 0 | unsigned char bc = bam_seqi(bam_get_seq(b), i); |
6490 | 0 | if (next[bc] <= freq[bc] || next[15] <= freq[15]) |
6491 | 0 | break; |
6492 | 0 | freq[bc]++; |
6493 | 0 | if (bc != 15) // N |
6494 | 0 | freq[15]++; |
6495 | 0 | } |
6496 | 0 | *pos = state->seq_pos = i; |
6497 | |
|
6498 | 0 | if (i >= b->core.l_qseq) { |
6499 | | // Check for more MM elements than bases present. |
6500 | 0 | for (i = 0; i < state->nmods; i++) { |
6501 | 0 | if (!(b->core.flag & BAM_FREVERSE) && |
6502 | 0 | state->MMcount[i] < 0x7f000000) { |
6503 | 0 | hts_log_warning("MM tag refers to bases beyond sequence length"); |
6504 | 0 | return -1; |
6505 | 0 | } |
6506 | 0 | } |
6507 | 0 | return 0; |
6508 | 0 | } |
6509 | | |
6510 | 0 | if (b->core.flag & BAM_FREVERSE) { |
6511 | 0 | for (i = 0; i < state->nmods; i++) |
6512 | 0 | state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; |
6513 | 0 | } else { |
6514 | 0 | for (i = 0; i < state->nmods; i++) |
6515 | 0 | state->MMcount[i] -= freq[state->canonical[i]]; |
6516 | 0 | } |
6517 | |
|
6518 | 0 | int r = bam_mods_at_next_pos(b, state, mods, n_mods); |
6519 | 0 | return r > 0 ? r : 0; |
6520 | 0 | } |
6521 | | |
6522 | | /* |
6523 | | * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. |
6524 | | * This can only march forwards along the read, but can do so by more than |
6525 | | * one base-pair. |
6526 | | * |
6527 | | * This makes it useful for calling from pileup iterators where qpos may |
6528 | | * start part way through a read for the first occurrence of that record. |
6529 | | */ |
6530 | | int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, |
6531 | 0 | hts_base_mod *mods, int n_mods) { |
6532 | | // FIXME: for now this is inefficient in implementation. |
6533 | 0 | int r = 0; |
6534 | 0 | while (state->seq_pos <= qpos) |
6535 | 0 | if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) |
6536 | 0 | break; |
6537 | |
|
6538 | 0 | return r; |
6539 | 0 | } |
6540 | | |
6541 | | /* |
6542 | | * Returns the list of base modification codes provided for this |
6543 | | * alignment record as an array of character codes (+ve) or ChEBI numbers |
6544 | | * (negative). |
6545 | | * |
6546 | | * Returns the array, with *ntype filled out with the size. |
6547 | | * The array returned should not be freed. |
6548 | | * It is a valid pointer until the state is freed using |
6549 | | * hts_base_mod_free(). |
6550 | | */ |
6551 | 0 | int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { |
6552 | 0 | *ntype = state->nmods; |
6553 | 0 | return state->type; |
6554 | 0 | } |
6555 | | |
6556 | | /* |
6557 | | * Returns data about a specific modification type for the alignment record. |
6558 | | * Code is either positive (eg 'm') or negative for ChEBI numbers. |
6559 | | * |
6560 | | * Return 0 on success or -1 if not found. The strand, implicit and canonical |
6561 | | * fields are filled out if passed in as non-NULL pointers. |
6562 | | */ |
6563 | | int bam_mods_query_type(hts_base_mod_state *state, int code, |
6564 | 0 | int *strand, int *implicit, char *canonical) { |
6565 | | // Find code entry |
6566 | 0 | int i; |
6567 | 0 | for (i = 0; i < state->nmods; i++) { |
6568 | 0 | if (state->type[i] == code) |
6569 | 0 | break; |
6570 | 0 | } |
6571 | 0 | if (i == state->nmods) |
6572 | 0 | return -1; |
6573 | | |
6574 | | // Return data |
6575 | 0 | if (strand) *strand = state->strand[i]; |
6576 | 0 | if (implicit) *implicit = state->implicit[i]; |
6577 | 0 | if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; |
6578 | |
|
6579 | 0 | return 0; |
6580 | 0 | } |