Line | Count | Source (jump to first uncovered line) |
1 | | /* sam.c -- SAM and BAM file I/O and manipulation. |
2 | | |
3 | | Copyright (C) 2008-2010, 2012-2024 Genome Research Ltd. |
4 | | Copyright (C) 2010, 2012, 2013 Broad Institute. |
5 | | |
6 | | Author: Heng Li <lh3@sanger.ac.uk> |
7 | | |
8 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
9 | | of this software and associated documentation files (the "Software"), to deal |
10 | | in the Software without restriction, including without limitation the rights |
11 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
12 | | copies of the Software, and to permit persons to whom the Software is |
13 | | furnished to do so, subject to the following conditions: |
14 | | |
15 | | The above copyright notice and this permission notice shall be included in |
16 | | all copies or substantial portions of the Software. |
17 | | |
18 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
19 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
20 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
21 | | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
22 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
23 | | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
24 | | DEALINGS IN THE SOFTWARE. */ |
25 | | |
26 | | #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h |
27 | | #include <config.h> |
28 | | |
29 | | #include <strings.h> |
30 | | #include <stdio.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <errno.h> |
34 | | #include <zlib.h> |
35 | | #include <assert.h> |
36 | | #include <signal.h> |
37 | | #include <inttypes.h> |
38 | | #include <unistd.h> |
39 | | |
40 | | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
41 | | #include "fuzz_settings.h" |
42 | | #endif |
43 | | |
44 | | // Suppress deprecation message for cigar_tab, which we initialise |
45 | | #include "htslib/hts_defs.h" |
46 | | #undef HTS_DEPRECATED |
47 | | #define HTS_DEPRECATED(message) |
48 | | |
49 | | #include "htslib/sam.h" |
50 | | #include "htslib/bgzf.h" |
51 | | #include "cram/cram.h" |
52 | | #include "hts_internal.h" |
53 | | #include "sam_internal.h" |
54 | | #include "htslib/hfile.h" |
55 | | #include "htslib/hts_endian.h" |
56 | | #include "htslib/hts_expr.h" |
57 | | #include "header.h" |
58 | | |
59 | | #include "htslib/khash.h" |
60 | | KHASH_DECLARE(s2i, kh_cstr_t, int64_t) |
61 | | KHASH_SET_INIT_INT(tag) |
62 | | |
63 | | #ifndef EFTYPE |
64 | 0 | #define EFTYPE ENOEXEC |
65 | | #endif |
66 | | #ifndef EOVERFLOW |
67 | | #define EOVERFLOW ERANGE |
68 | | #endif |
69 | | |
70 | | /********************** |
71 | | *** BAM header I/O *** |
72 | | **********************/ |
73 | | |
74 | | HTSLIB_EXPORT |
75 | | const int8_t bam_cigar_table[256] = { |
76 | | // 0 .. 47 |
77 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
78 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
79 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
80 | | |
81 | | // 48 .. 63 (including =) |
82 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, BAM_CEQUAL, -1, -1, |
83 | | |
84 | | // 64 .. 79 (including MIDNHB) |
85 | | -1, -1, BAM_CBACK, -1, BAM_CDEL, -1, -1, -1, |
86 | | BAM_CHARD_CLIP, BAM_CINS, -1, -1, -1, BAM_CMATCH, BAM_CREF_SKIP, -1, |
87 | | |
88 | | // 80 .. 95 (including SPX) |
89 | | BAM_CPAD, -1, -1, BAM_CSOFT_CLIP, -1, -1, -1, -1, |
90 | | BAM_CDIFF, -1, -1, -1, -1, -1, -1, -1, |
91 | | |
92 | | // 96 .. 127 |
93 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
94 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
95 | | |
96 | | // 128 .. 255 |
97 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
98 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
99 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
100 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
101 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
102 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
103 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
104 | | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
105 | | }; |
106 | | |
107 | | sam_hdr_t *sam_hdr_init(void) |
108 | 59.1k | { |
109 | 59.1k | sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t)); |
110 | 59.1k | if (bh == NULL) return NULL; |
111 | | |
112 | 59.1k | bh->cigar_tab = bam_cigar_table; |
113 | 59.1k | return bh; |
114 | 59.1k | } |
115 | | |
116 | | void sam_hdr_destroy(sam_hdr_t *bh) |
117 | 167k | { |
118 | 167k | int32_t i; |
119 | | |
120 | 167k | if (bh == NULL) return; |
121 | | |
122 | 79.9k | if (bh->ref_count > 0) { |
123 | 20.8k | --bh->ref_count; |
124 | 20.8k | return; |
125 | 20.8k | } |
126 | | |
127 | 59.1k | if (bh->target_name) { |
128 | 82.7k | for (i = 0; i < bh->n_targets; ++i) |
129 | 63.5k | free(bh->target_name[i]); |
130 | 19.1k | free(bh->target_name); |
131 | 19.1k | free(bh->target_len); |
132 | 19.1k | } |
133 | 59.1k | free(bh->text); |
134 | 59.1k | if (bh->hrecs) |
135 | 53.7k | sam_hrecs_free(bh->hrecs); |
136 | 59.1k | if (bh->sdict) |
137 | 59.1k | kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); |
138 | 59.1k | free(bh); |
139 | 59.1k | } |
140 | | |
141 | | // Copy the sam_hdr_t::sdict hash, used to store the real lengths of long |
142 | | // references before sam_hdr_t::hrecs is populated |
143 | | int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) |
144 | 112 | { |
145 | 112 | const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; |
146 | 112 | khash_t(s2i) *dest_long_refs = kh_init(s2i); |
147 | 112 | int i; |
148 | 112 | if (!dest_long_refs) return -1; |
149 | | |
150 | 3.57k | for (i = 0; i < h->n_targets; i++) { |
151 | 3.45k | int ret; |
152 | 3.45k | khiter_t ksrc, kdest; |
153 | 3.45k | if (h->target_len[i] < UINT32_MAX) continue; |
154 | 1.94k | ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); |
155 | 1.94k | if (ksrc == kh_end(src_long_refs)) continue; |
156 | 1.94k | kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret); |
157 | 1.94k | if (ret < 0) { |
158 | 0 | kh_destroy(s2i, dest_long_refs); |
159 | 0 | return -1; |
160 | 0 | } |
161 | 1.94k | kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); |
162 | 1.94k | } |
163 | | |
164 | 112 | h->sdict = dest_long_refs; |
165 | 112 | return 0; |
166 | 112 | } |
167 | | |
168 | | sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) |
169 | 20.1k | { |
170 | 20.1k | if (h0 == NULL) return NULL; |
171 | 20.1k | sam_hdr_t *h; |
172 | 20.1k | if ((h = sam_hdr_init()) == NULL) return NULL; |
173 | | // copy the simple data |
174 | 20.1k | h->n_targets = 0; |
175 | 20.1k | h->ignore_sam_err = h0->ignore_sam_err; |
176 | 20.1k | h->l_text = 0; |
177 | | |
178 | | // Then the pointery stuff |
179 | | |
180 | 20.1k | if (!h0->hrecs) { |
181 | 1.10k | h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t)); |
182 | 1.10k | if (!h->target_len) goto fail; |
183 | 1.10k | h->target_name = (char**)calloc(h0->n_targets, sizeof(char*)); |
184 | 1.10k | if (!h->target_name) goto fail; |
185 | | |
186 | 1.10k | int i; |
187 | 5.26k | for (i = 0; i < h0->n_targets; ++i) { |
188 | 4.15k | h->target_len[i] = h0->target_len[i]; |
189 | 4.15k | h->target_name[i] = strdup(h0->target_name[i]); |
190 | 4.15k | if (!h->target_name[i]) break; |
191 | 4.15k | } |
192 | 1.10k | h->n_targets = i; |
193 | 1.10k | if (i < h0->n_targets) goto fail; |
194 | | |
195 | 1.10k | if (h0->sdict) { |
196 | 112 | if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; |
197 | 112 | } |
198 | 1.10k | } |
199 | | |
200 | 20.1k | if (h0->hrecs) { |
201 | 19.0k | kstring_t tmp = { 0, 0, NULL }; |
202 | 19.0k | if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) { |
203 | 0 | free(ks_release(&tmp)); |
204 | 0 | goto fail; |
205 | 0 | } |
206 | | |
207 | 19.0k | h->l_text = tmp.l; |
208 | 19.0k | h->text = ks_release(&tmp); |
209 | | |
210 | 19.0k | if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0) |
211 | 0 | goto fail; |
212 | 19.0k | } else { |
213 | 1.10k | h->l_text = h0->l_text; |
214 | 1.10k | h->text = malloc(h->l_text + 1); |
215 | 1.10k | if (!h->text) goto fail; |
216 | 1.10k | memcpy(h->text, h0->text, h->l_text); |
217 | 1.10k | h->text[h->l_text] = '\0'; |
218 | 1.10k | } |
219 | | |
220 | 20.1k | return h; |
221 | | |
222 | 0 | fail: |
223 | 0 | sam_hdr_destroy(h); |
224 | 0 | return NULL; |
225 | 20.1k | } |
226 | | |
227 | | sam_hdr_t *bam_hdr_read(BGZF *fp) |
228 | 2.91k | { |
229 | 2.91k | sam_hdr_t *h; |
230 | 2.91k | uint8_t buf[4]; |
231 | 2.91k | int magic_len, has_EOF; |
232 | 2.91k | int32_t i, name_len, num_names = 0; |
233 | 2.91k | size_t bufsize; |
234 | 2.91k | ssize_t bytes; |
235 | | // check EOF |
236 | 2.91k | has_EOF = bgzf_check_EOF(fp); |
237 | 2.91k | if (has_EOF < 0) { |
238 | 0 | perror("[W::bam_hdr_read] bgzf_check_EOF"); |
239 | 2.91k | } else if (has_EOF == 0) { |
240 | 2.91k | hts_log_warning("EOF marker is absent. The input is probably truncated"); |
241 | 2.91k | } |
242 | | // read "BAM1" |
243 | 2.91k | magic_len = bgzf_read(fp, buf, 4); |
244 | 2.91k | if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) { |
245 | 3 | hts_log_error("Invalid BAM binary header"); |
246 | 3 | return 0; |
247 | 3 | } |
248 | 2.91k | h = sam_hdr_init(); |
249 | 2.91k | if (!h) goto nomem; |
250 | | |
251 | | // read plain text and the number of reference sequences |
252 | 2.91k | bytes = bgzf_read(fp, buf, 4); |
253 | 2.91k | if (bytes != 4) goto read_err; |
254 | 2.90k | h->l_text = le_to_u32(buf); |
255 | | |
256 | 2.90k | bufsize = h->l_text + 1; |
257 | 2.90k | if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed |
258 | 2.90k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
259 | 2.90k | if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem; |
260 | 2.89k | #endif |
261 | 2.89k | h->text = (char*)malloc(bufsize); |
262 | 2.89k | if (!h->text) goto nomem; |
263 | 2.89k | h->text[h->l_text] = 0; // make sure it is NULL terminated |
264 | 2.89k | bytes = bgzf_read(fp, h->text, h->l_text); |
265 | 2.89k | if (bytes != h->l_text) goto read_err; |
266 | | |
267 | 2.67k | bytes = bgzf_read(fp, &h->n_targets, 4); |
268 | 2.67k | if (bytes != 4) goto read_err; |
269 | 2.66k | if (fp->is_be) ed_swap_4p(&h->n_targets); |
270 | | |
271 | 2.66k | if (h->n_targets < 0) goto invalid; |
272 | | |
273 | | // read reference sequence names and lengths |
274 | 2.58k | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
275 | 2.58k | if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t))) |
276 | 21 | goto nomem; |
277 | 2.56k | #endif |
278 | 2.56k | if (h->n_targets > 0) { |
279 | 1.12k | h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); |
280 | 1.12k | if (!h->target_name) goto nomem; |
281 | 1.12k | h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t)); |
282 | 1.12k | if (!h->target_len) goto nomem; |
283 | 1.12k | } |
284 | 1.44k | else { |
285 | 1.44k | h->target_name = NULL; |
286 | 1.44k | h->target_len = NULL; |
287 | 1.44k | } |
288 | | |
289 | 5.32k | for (i = 0; i != h->n_targets; ++i) { |
290 | 3.12k | bytes = bgzf_read(fp, &name_len, 4); |
291 | 3.12k | if (bytes != 4) goto read_err; |
292 | 3.07k | if (fp->is_be) ed_swap_4p(&name_len); |
293 | 3.07k | if (name_len <= 0) goto invalid; |
294 | | |
295 | 2.97k | h->target_name[i] = (char*)malloc(name_len); |
296 | 2.97k | if (!h->target_name[i]) goto nomem; |
297 | 2.97k | num_names++; |
298 | | |
299 | 2.97k | bytes = bgzf_read(fp, h->target_name[i], name_len); |
300 | 2.97k | if (bytes != name_len) goto read_err; |
301 | | |
302 | 2.79k | if (h->target_name[i][name_len - 1] != '\0') { |
303 | | /* Fix missing NUL-termination. Is this being too nice? |
304 | | We could alternatively bail out with an error. */ |
305 | 1.63k | char *new_name; |
306 | 1.63k | if (name_len == INT32_MAX) goto invalid; |
307 | 1.63k | new_name = realloc(h->target_name[i], name_len + 1); |
308 | 1.63k | if (new_name == NULL) goto nomem; |
309 | 1.63k | h->target_name[i] = new_name; |
310 | 1.63k | h->target_name[i][name_len] = '\0'; |
311 | 1.63k | } |
312 | | |
313 | 2.79k | bytes = bgzf_read(fp, &h->target_len[i], 4); |
314 | 2.79k | if (bytes != 4) goto read_err; |
315 | 2.76k | if (fp->is_be) ed_swap_4p(&h->target_len[i]); |
316 | 2.76k | } |
317 | 2.19k | return h; |
318 | | |
319 | 30 | nomem: |
320 | 30 | hts_log_error("Out of memory"); |
321 | 30 | goto clean; |
322 | | |
323 | 501 | read_err: |
324 | 501 | if (bytes < 0) { |
325 | 48 | hts_log_error("Error reading BGZF stream"); |
326 | 453 | } else { |
327 | 453 | hts_log_error("Truncated BAM header"); |
328 | 453 | } |
329 | 501 | goto clean; |
330 | | |
331 | 186 | invalid: |
332 | 186 | hts_log_error("Invalid BAM binary header"); |
333 | | |
334 | 717 | clean: |
335 | 717 | if (h != NULL) { |
336 | 717 | h->n_targets = num_names; // ensure we free only allocated target_names |
337 | 717 | sam_hdr_destroy(h); |
338 | 717 | } |
339 | 717 | return NULL; |
340 | 186 | } |
341 | | |
342 | | int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) |
343 | 11.6k | { |
344 | 11.6k | int32_t i, name_len, x; |
345 | 11.6k | kstring_t hdr_ks = { 0, 0, NULL }; |
346 | 11.6k | char *text; |
347 | 11.6k | uint32_t l_text; |
348 | | |
349 | 11.6k | if (!h) return -1; |
350 | | |
351 | 11.6k | if (h->hrecs) { |
352 | 10.5k | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; |
353 | 10.5k | if (hdr_ks.l > UINT32_MAX) { |
354 | 0 | hts_log_error("Header too long for BAM format"); |
355 | 0 | free(hdr_ks.s); |
356 | 0 | return -1; |
357 | 10.5k | } else if (hdr_ks.l > INT32_MAX) { |
358 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
359 | 0 | hts_log_warning("Output file may not be portable"); |
360 | 0 | } |
361 | 10.5k | text = hdr_ks.s; |
362 | 10.5k | l_text = hdr_ks.l; |
363 | 10.5k | } else { |
364 | 1.10k | if (h->l_text > UINT32_MAX) { |
365 | 0 | hts_log_error("Header too long for BAM format"); |
366 | 0 | return -1; |
367 | 1.10k | } else if (h->l_text > INT32_MAX) { |
368 | 0 | hts_log_warning("Header too long for BAM specification (>2GB)"); |
369 | 0 | hts_log_warning("Output file may not be portable"); |
370 | 0 | } |
371 | 1.10k | text = h->text; |
372 | 1.10k | l_text = h->l_text; |
373 | 1.10k | } |
374 | | // write "BAM1" |
375 | 11.6k | if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; } |
376 | | // write plain text and the number of reference sequences |
377 | 11.6k | if (fp->is_be) { |
378 | 0 | x = ed_swap_4(l_text); |
379 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
380 | 0 | if (l_text) { |
381 | 0 | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
382 | 0 | } |
383 | 0 | x = ed_swap_4(h->n_targets); |
384 | 0 | if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; } |
385 | 11.6k | } else { |
386 | 11.6k | if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; } |
387 | 11.6k | if (l_text) { |
388 | 7.25k | if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; } |
389 | 7.25k | } |
390 | 11.6k | if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; } |
391 | 11.6k | } |
392 | 11.6k | free(hdr_ks.s); |
393 | | // write sequence names and lengths |
394 | 26.8k | for (i = 0; i != h->n_targets; ++i) { |
395 | 15.1k | char *p = h->target_name[i]; |
396 | 15.1k | name_len = strlen(p) + 1; |
397 | 15.1k | if (fp->is_be) { |
398 | 0 | x = ed_swap_4(name_len); |
399 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
400 | 15.1k | } else { |
401 | 15.1k | if (bgzf_write(fp, &name_len, 4) < 0) return -1; |
402 | 15.1k | } |
403 | 15.1k | if (bgzf_write(fp, p, name_len) < 0) return -1; |
404 | 15.1k | if (fp->is_be) { |
405 | 0 | x = ed_swap_4(h->target_len[i]); |
406 | 0 | if (bgzf_write(fp, &x, 4) < 0) return -1; |
407 | 15.1k | } else { |
408 | 15.1k | if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1; |
409 | 15.1k | } |
410 | 15.1k | } |
411 | 11.6k | if (bgzf_flush(fp) < 0) return -1; |
412 | 11.6k | return 0; |
413 | 11.6k | } |
414 | | |
415 | | const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, |
416 | 0 | hts_pos_t *beg, hts_pos_t *end, int flags) { |
417 | 0 | return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags); |
418 | 0 | } |
419 | | |
420 | | /************************* |
421 | | *** BAM alignment I/O *** |
422 | | *************************/ |
423 | | |
424 | | bam1_t *bam_init1(void) |
425 | 1.64M | { |
426 | 1.64M | return (bam1_t*)calloc(1, sizeof(bam1_t)); |
427 | 1.64M | } |
428 | | |
429 | | int sam_realloc_bam_data(bam1_t *b, size_t desired) |
430 | 2.14M | { |
431 | 2.14M | uint32_t new_m_data; |
432 | 2.14M | uint8_t *new_data; |
433 | 2.14M | new_m_data = desired; |
434 | 2.14M | kroundup32(new_m_data); |
435 | 2.14M | if (new_m_data < desired) { |
436 | 0 | errno = ENOMEM; // Not strictly true but we can't store the size |
437 | 0 | return -1; |
438 | 0 | } |
439 | 2.14M | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
440 | 2.14M | if (new_m_data > FUZZ_ALLOC_LIMIT) { |
441 | 45 | errno = ENOMEM; |
442 | 45 | return -1; |
443 | 45 | } |
444 | 2.14M | #endif |
445 | 2.14M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
446 | 2.14M | new_data = realloc(b->data, new_m_data); |
447 | 2.14M | } else { |
448 | 0 | if ((new_data = malloc(new_m_data)) != NULL) { |
449 | 0 | if (b->l_data > 0) |
450 | 0 | memcpy(new_data, b->data, |
451 | 0 | b->l_data < b->m_data ? b->l_data : b->m_data); |
452 | 0 | bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); |
453 | 0 | } |
454 | 0 | } |
455 | 2.14M | if (!new_data) return -1; |
456 | 2.14M | b->data = new_data; |
457 | 2.14M | b->m_data = new_m_data; |
458 | 2.14M | return 0; |
459 | 2.14M | } |
460 | | |
461 | | void bam_destroy1(bam1_t *b) |
462 | 47.6M | { |
463 | 47.6M | if (b == 0) return; |
464 | 1.64M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { |
465 | 1.64M | free(b->data); |
466 | 1.64M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) { |
467 | | // In case of reuse |
468 | 0 | b->data = NULL; |
469 | 0 | b->m_data = 0; |
470 | 0 | b->l_data = 0; |
471 | 0 | } |
472 | 1.64M | } |
473 | | |
474 | 1.64M | if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0) |
475 | 1.64M | free(b); |
476 | 1.64M | } |
477 | | |
478 | | bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) |
479 | 18.7M | { |
480 | 18.7M | if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL; |
481 | 18.7M | memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data |
482 | 18.7M | memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest |
483 | 18.7M | bdst->l_data = bsrc->l_data; |
484 | 18.7M | bdst->id = bsrc->id; |
485 | 18.7M | return bdst; |
486 | 18.7M | } |
487 | | |
488 | | bam1_t *bam_dup1(const bam1_t *bsrc) |
489 | 1.61M | { |
490 | 1.61M | if (bsrc == NULL) return NULL; |
491 | 1.61M | bam1_t *bdst = bam_init1(); |
492 | 1.61M | if (bdst == NULL) return NULL; |
493 | 1.61M | if (bam_copy1(bdst, bsrc) == NULL) { |
494 | 0 | bam_destroy1(bdst); |
495 | 0 | return NULL; |
496 | 0 | } |
497 | 1.61M | return bdst; |
498 | 1.61M | } |
499 | | |
500 | | static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, |
501 | | hts_pos_t *rlen, hts_pos_t *qlen) |
502 | 2.75k | { |
503 | 2.75k | int k; |
504 | 2.75k | *rlen = *qlen = 0; |
505 | 759k | for (k = 0; k < n_cigar; ++k) { |
506 | 756k | int type = bam_cigar_type(bam_cigar_op(cigar[k])); |
507 | 756k | int len = bam_cigar_oplen(cigar[k]); |
508 | 756k | if (type & 1) *qlen += len; |
509 | 756k | if (type & 2) *rlen += len; |
510 | 756k | } |
511 | 2.75k | } |
512 | | |
513 | | static int subtract_check_underflow(size_t length, size_t *limit) |
514 | 279M | { |
515 | 279M | if (length <= *limit) { |
516 | 279M | *limit -= length; |
517 | 279M | return 0; |
518 | 279M | } |
519 | | |
520 | 0 | return -1; |
521 | 279M | } |
522 | | |
523 | | int bam_set1(bam1_t *bam, |
524 | | size_t l_qname, const char *qname, |
525 | | uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, |
526 | | size_t n_cigar, const uint32_t *cigar, |
527 | | int32_t mtid, hts_pos_t mpos, hts_pos_t isize, |
528 | | size_t l_seq, const char *seq, const char *qual, |
529 | | size_t l_aux) |
530 | 55.8M | { |
531 | | // use a default qname "*" if none is provided |
532 | 55.8M | if (l_qname == 0) { |
533 | 51.9M | l_qname = 1; |
534 | 51.9M | qname = "*"; |
535 | 51.9M | } |
536 | | |
537 | | // note: the qname is stored nul terminated and padded as described in the |
538 | | // documentation for the bam1_t struct. |
539 | 55.8M | size_t qname_nuls = 4 - l_qname % 4; |
540 | | |
541 | | // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). |
542 | | // can't use bam_endpos() directly as some fields not yet set up. |
543 | 55.8M | hts_pos_t rlen = 0, qlen = 0; |
544 | 55.8M | if (!(flag & BAM_FUNMAP)) { |
545 | 0 | bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); |
546 | 0 | } |
547 | 55.8M | if (rlen == 0) { |
548 | 55.8M | rlen = 1; |
549 | 55.8M | } |
550 | | |
551 | | // validate parameters |
552 | 55.8M | if (l_qname > 254) { |
553 | 90 | hts_log_error("Query name too long"); |
554 | 90 | errno = EINVAL; |
555 | 90 | return -1; |
556 | 90 | } |
557 | 55.8M | if (HTS_POS_MAX - rlen <= pos) { |
558 | 0 | hts_log_error("Read ends beyond highest supported position"); |
559 | 0 | errno = EINVAL; |
560 | 0 | return -1; |
561 | 0 | } |
562 | 55.8M | if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { |
563 | 0 | hts_log_error("Mapped query must have a CIGAR"); |
564 | 0 | errno = EINVAL; |
565 | 0 | return -1; |
566 | 0 | } |
567 | 55.8M | if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { |
568 | 0 | hts_log_error("CIGAR and query sequence are of different length"); |
569 | 0 | errno = EINVAL; |
570 | 0 | return -1; |
571 | 0 | } |
572 | | |
573 | 55.8M | size_t limit = INT32_MAX; |
574 | 55.8M | int u = subtract_check_underflow(l_qname + qname_nuls, &limit); |
575 | 55.8M | u += subtract_check_underflow(n_cigar * 4, &limit); |
576 | 55.8M | u += subtract_check_underflow((l_seq + 1) / 2, &limit); |
577 | 55.8M | u += subtract_check_underflow(l_seq, &limit); |
578 | 55.8M | u += subtract_check_underflow(l_aux, &limit); |
579 | 55.8M | if (u != 0) { |
580 | 0 | hts_log_error("Size overflow"); |
581 | 0 | errno = EINVAL; |
582 | 0 | return -1; |
583 | 0 | } |
584 | | |
585 | | // re-allocate the data buffer as needed. |
586 | 55.8M | size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; |
587 | 55.8M | if (realloc_bam_data(bam, data_len + l_aux) < 0) { |
588 | 0 | return -1; |
589 | 0 | } |
590 | | |
591 | 55.8M | bam->l_data = (int)data_len; |
592 | 55.8M | bam->core.pos = pos; |
593 | 55.8M | bam->core.tid = tid; |
594 | 55.8M | bam->core.bin = bam_reg2bin(pos, pos + rlen); |
595 | 55.8M | bam->core.qual = mapq; |
596 | 55.8M | bam->core.l_extranul = (uint8_t)(qname_nuls - 1); |
597 | 55.8M | bam->core.flag = flag; |
598 | 55.8M | bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); |
599 | 55.8M | bam->core.n_cigar = (uint32_t)n_cigar; |
600 | 55.8M | bam->core.l_qseq = (int32_t)l_seq; |
601 | 55.8M | bam->core.mtid = mtid; |
602 | 55.8M | bam->core.mpos = mpos; |
603 | 55.8M | bam->core.isize = isize; |
604 | | |
605 | 55.8M | uint8_t *cp = bam->data; |
606 | 55.8M | strncpy((char *)cp, qname, l_qname); |
607 | 55.8M | int i; |
608 | 220M | for (i = 0; i < qname_nuls; i++) { |
609 | 164M | cp[l_qname + i] = '\0'; |
610 | 164M | } |
611 | 55.8M | cp += l_qname + qname_nuls; |
612 | | |
613 | 55.8M | if (n_cigar > 0) { |
614 | 0 | memcpy(cp, cigar, n_cigar * 4); |
615 | 0 | } |
616 | 55.8M | cp += n_cigar * 4; |
617 | | |
618 | 1.87G | #define NN 16 |
619 | 55.8M | const uint8_t *useq = (uint8_t *)seq; |
620 | 207M | for (i = 0; i + NN < l_seq; i += NN) { |
621 | 151M | int j; |
622 | 151M | const uint8_t *u2 = useq+i; |
623 | 1.36G | for (j = 0; j < NN/2; j++) |
624 | 1.21G | cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]]; |
625 | 151M | cp += NN/2; |
626 | 151M | } |
627 | 62.3M | for (; i + 1 < l_seq; i += 2) { |
628 | 6.42M | *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]]; |
629 | 6.42M | } |
630 | | |
631 | 57.0M | for (; i < l_seq; i++) { |
632 | 1.15M | *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; |
633 | 1.15M | } |
634 | | |
635 | 55.8M | if (qual) { |
636 | 2.35k | memcpy(cp, qual, l_seq); |
637 | 2.35k | } |
638 | 55.8M | else { |
639 | 55.8M | memset(cp, '\xff', l_seq); |
640 | 55.8M | } |
641 | | |
642 | 55.8M | return (int)data_len; |
643 | 55.8M | } |
644 | | |
645 | | hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) |
646 | 18.3M | { |
647 | 18.3M | int k; |
648 | 18.3M | hts_pos_t l; |
649 | 25.9M | for (k = l = 0; k < n_cigar; ++k) |
650 | 7.65M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&1) |
651 | 7.26M | l += bam_cigar_oplen(cigar[k]); |
652 | 18.3M | return l; |
653 | 18.3M | } |
654 | | |
655 | | hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) |
656 | 179k | { |
657 | 179k | int k; |
658 | 179k | hts_pos_t l; |
659 | 27.6M | for (k = l = 0; k < n_cigar; ++k) |
660 | 27.5M | if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) |
661 | 25.7M | l += bam_cigar_oplen(cigar[k]); |
662 | 179k | return l; |
663 | 179k | } |
664 | | |
665 | | hts_pos_t bam_endpos(const bam1_t *b) |
666 | 5.96k | { |
667 | 5.96k | hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
668 | 5.96k | if (rlen == 0) rlen = 1; |
669 | 5.96k | return b->core.pos + rlen; |
670 | 5.96k | } |
671 | | |
672 | | static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG |
673 | 338k | { |
674 | 338k | bam1_core_t *c = &b->core; |
675 | 338k | uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes; |
676 | 338k | uint8_t *CG; |
677 | | |
678 | | // test where there is a real CIGAR in the CG tag to move |
679 | 338k | if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0; |
680 | 173k | cigar0 = bam_get_cigar(b); |
681 | 173k | if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; |
682 | 158k | fake_bytes = c->n_cigar * 4; |
683 | 158k | int saved_errno = errno; |
684 | 158k | CG = bam_aux_get(b, "CG"); |
685 | 158k | if (!CG) { |
686 | 149k | if (errno != ENOENT) return -1; // Bad aux data |
687 | 149k | errno = saved_errno; // restore errno on expected no-CG-tag case |
688 | 149k | return 0; |
689 | 149k | } |
690 | 8.98k | if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) |
691 | 3.00k | return 0; // not of type B,I |
692 | 5.98k | CG_len = le_to_u32(CG + 2); |
693 | 5.98k | if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length |
694 | | |
695 | | // move from the CG tag to the right position |
696 | 5.96k | cigar_st = (uint8_t*)cigar0 - b->data; |
697 | 5.96k | c->n_cigar = CG_len; |
698 | 5.96k | n_cigar4 = c->n_cigar * 4; |
699 | 5.96k | CG_st = CG - b->data - 2; |
700 | 5.96k | CG_en = CG_st + 8 + n_cigar4; |
701 | 5.96k | if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1; |
702 | 5.96k | b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place |
703 | 5.96k | memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room |
704 | 5.96k | memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR |
705 | 5.96k | if (ori_len > CG_en) // move data after the CG tag |
706 | 1.38k | memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en); |
707 | 5.96k | b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) |
708 | 5.96k | if (recal_bin) |
709 | 5.96k | b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5); |
710 | 5.96k | if (give_warning) |
711 | 5.96k | hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar); |
712 | 5.96k | return 1; |
713 | 5.96k | } |
714 | | |
715 | | static inline int aux_type2size(uint8_t type) |
716 | 3.90M | { |
717 | 3.90M | switch (type) { |
718 | 2.59M | case 'A': case 'c': case 'C': |
719 | 2.59M | return 1; |
720 | 395k | case 's': case 'S': |
721 | 395k | return 2; |
722 | 636k | case 'i': case 'I': case 'f': |
723 | 636k | return 4; |
724 | 70.5k | case 'd': |
725 | 70.5k | return 8; |
726 | 209k | case 'Z': case 'H': case 'B': |
727 | 209k | return type; |
728 | 367 | default: |
729 | 367 | return 0; |
730 | 3.90M | } |
731 | 3.90M | } |
732 | | |
733 | | static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host) |
734 | 0 | { |
735 | 0 | uint32_t *cigar = (uint32_t*)(data + c->l_qname); |
736 | 0 | uint32_t i; |
737 | 0 | for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]); |
738 | 0 | } |
739 | | |
740 | | // Fix bad records where qname is not terminated correctly. |
741 | 2.04k | static int fixup_missing_qname_nul(bam1_t *b) { |
742 | 2.04k | bam1_core_t *c = &b->core; |
743 | | |
744 | | // Note this is called before c->l_extranul is added to c->l_qname |
745 | 2.04k | if (c->l_extranul > 0) { |
746 | 1.69k | b->data[c->l_qname++] = '\0'; |
747 | 1.69k | c->l_extranul--; |
748 | 1.69k | } else { |
749 | 349 | if (b->l_data > INT_MAX - 4) return -1; |
750 | 349 | if (realloc_bam_data(b, b->l_data + 4) < 0) return -1; |
751 | 349 | b->l_data += 4; |
752 | 349 | b->data[c->l_qname++] = '\0'; |
753 | 349 | c->l_extranul = 3; |
754 | 349 | } |
755 | 2.04k | return 0; |
756 | 2.04k | } |
757 | | |
758 | | /* |
759 | | * Note a second interface that returns a bam pointer instead would avoid bam_copy1 |
760 | | * in multi-threaded handling. This may be worth considering for htslib2. |
761 | | */ |
762 | | int bam_read1(BGZF *fp, bam1_t *b) |
763 | 4.56k | { |
764 | 4.56k | bam1_core_t *c = &b->core; |
765 | 4.56k | int32_t block_len, ret, i; |
766 | 4.56k | uint32_t x[8], new_l_data; |
767 | | |
768 | 4.56k | b->l_data = 0; |
769 | | |
770 | 4.56k | if ((ret = bgzf_read(fp, &block_len, 4)) != 4) { |
771 | 278 | if (ret == 0) return -1; // normal end-of-file |
772 | 166 | else return -2; // truncated |
773 | 278 | } |
774 | 4.29k | if (fp->is_be) |
775 | 0 | ed_swap_4p(&block_len); |
776 | 4.29k | if (block_len < 32) return -4; // block_len includes core data |
777 | 4.02k | if (bgzf_read(fp, x, 32) != 32) return -3; |
778 | 3.82k | if (fp->is_be) { |
779 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
780 | 0 | } |
781 | 3.82k | c->tid = x[0]; c->pos = (int32_t)x[1]; |
782 | 3.82k | c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; |
783 | 3.82k | c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0; |
784 | 3.82k | c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; |
785 | 3.82k | c->l_qseq = x[4]; |
786 | 3.82k | c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; |
787 | | |
788 | 3.82k | new_l_data = block_len - 32 + c->l_extranul; |
789 | 3.82k | if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; |
790 | 3.67k | if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul |
791 | 3.67k | + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data) |
792 | 139 | return -4; |
793 | 3.54k | if (realloc_bam_data(b, new_l_data) < 0) return -4; |
794 | 3.49k | b->l_data = new_l_data; |
795 | | |
796 | 3.49k | if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4; |
797 | 3.42k | if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination |
798 | 2.04k | if (fixup_missing_qname_nul(b) < 0) return -4; |
799 | 2.04k | } |
800 | 7.16k | for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0'; |
801 | 3.42k | c->l_qname += c->l_extranul; |
802 | 3.42k | if (b->l_data < c->l_qname || |
803 | 3.42k | bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) |
804 | 206 | return -4; |
805 | 3.22k | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
806 | 3.22k | if (bam_tag2cigar(b, 0, 0) < 0) |
807 | 8 | return -4; |
808 | | |
809 | 3.21k | if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency |
810 | 2.75k | hts_pos_t rlen, qlen; |
811 | 2.75k | bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen); |
812 | 2.75k | if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1; |
813 | 2.75k | b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5); |
814 | | // Sanity check for broken CIGAR alignments |
815 | 2.75k | if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) { |
816 | 93 | hts_log_error("CIGAR and query sequence lengths differ for %s", |
817 | 93 | bam_get_qname(b)); |
818 | 93 | return -4; |
819 | 93 | } |
820 | 2.75k | } |
821 | | |
822 | 3.12k | return 4 + block_len; |
823 | 3.21k | } |
824 | | |
825 | | int bam_write1(BGZF *fp, const bam1_t *b) |
826 | 18.7M | { |
827 | 18.7M | const bam1_core_t *c = &b->core; |
828 | 18.7M | uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; |
829 | 18.7M | int i, ok; |
830 | 18.7M | if (c->l_qname - c->l_extranul > 255) { |
831 | 8 | hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b)); |
832 | 8 | errno = EOVERFLOW; |
833 | 8 | return -1; |
834 | 8 | } |
835 | 18.7M | if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR |
836 | 18.7M | if (c->pos > INT_MAX || |
837 | 18.7M | c->mpos > INT_MAX || |
838 | 18.7M | c->isize < INT_MIN || c->isize > INT_MAX) { |
839 | 730 | hts_log_error("Positional data is too large for BAM format"); |
840 | 730 | return -1; |
841 | 730 | } |
842 | 18.7M | x[0] = c->tid; |
843 | 18.7M | x[1] = c->pos; |
844 | 18.7M | x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); |
845 | 18.7M | if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2; |
846 | 18.7M | else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); |
847 | 18.7M | x[4] = c->l_qseq; |
848 | 18.7M | x[5] = c->mtid; |
849 | 18.7M | x[6] = c->mpos; |
850 | 18.7M | x[7] = c->isize; |
851 | 18.7M | ok = (bgzf_flush_try(fp, 4 + block_len) >= 0); |
852 | 18.7M | if (fp->is_be) { |
853 | 0 | for (i = 0; i < 8; ++i) ed_swap_4p(x + i); |
854 | 0 | y = block_len; |
855 | 0 | if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); |
856 | 0 | swap_data(c, b->l_data, b->data, 1); |
857 | 18.7M | } else { |
858 | 18.7M | if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0); |
859 | 18.7M | } |
860 | 18.7M | if (ok) ok = (bgzf_write(fp, x, 32) >= 0); |
861 | 18.7M | if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); |
862 | 18.7M | if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally |
863 | 18.7M | if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); |
864 | 18.7M | } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag |
865 | 35 | uint8_t buf[8]; |
866 | 35 | uint32_t cigar_st, cigar_en, cigar[2]; |
867 | 35 | hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b)); |
868 | 35 | if (cigreflen >= (1<<28)) { |
869 | | // Length of reference covered is greater than the biggest |
870 | | // CIGAR operation currently allowed. |
871 | 3 | hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos |
872 | 3 | " cannot be written in BAM. Try writing SAM or CRAM instead.\n", |
873 | 3 | bam_get_qname(b), c->n_cigar, cigreflen); |
874 | 3 | return -1; |
875 | 3 | } |
876 | 32 | cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; |
877 | 32 | cigar_en = cigar_st + c->n_cigar * 4; |
878 | 32 | cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; |
879 | 32 | cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP; |
880 | 32 | u32_to_le(cigar[0], buf); |
881 | 32 | u32_to_le(cigar[1], buf + 4); |
882 | 32 | if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N |
883 | 32 | if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR |
884 | 32 | if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I |
885 | 32 | u32_to_le(c->n_cigar, buf); |
886 | 32 | if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length |
887 | 32 | if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR |
888 | 32 | } |
889 | 18.7M | if (fp->is_be) swap_data(c, b->l_data, b->data, 0); |
890 | 18.7M | return ok? 4 + block_len : -1; |
891 | 18.7M | } |
892 | | |
893 | | /* |
894 | | * Write a BAM file and append to the in-memory index simultaneously. |
895 | | */ |
896 | 18.7M | static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { |
897 | 18.7M | BGZF *bfp = fp->fp.bgzf; |
898 | | |
899 | 18.7M | if (!fp->idx) |
900 | 18.7M | return bam_write1(bfp, b); |
901 | | |
902 | 0 | uint32_t block_len = b->l_data - b->core.l_extranul + 32; |
903 | 0 | if (bgzf_flush_try(bfp, 4 + block_len) < 0) |
904 | 0 | return -1; |
905 | 0 | if (!bfp->mt) |
906 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); |
907 | |
|
908 | 0 | int ret = bam_write1(bfp, b); |
909 | 0 | if (ret < 0) |
910 | 0 | return -1; |
911 | | |
912 | 0 | if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) { |
913 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
914 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
915 | 0 | ret = -1; |
916 | 0 | } |
917 | |
|
918 | 0 | return ret; |
919 | 0 | } |
920 | | |
921 | | /* |
922 | | * Set the qname in a BAM record |
923 | | */ |
924 | | int bam_set_qname(bam1_t *rec, const char *qname) |
925 | 0 | { |
926 | 0 | if (!rec) return -1; |
927 | 0 | if (!qname || !*qname) return -1; |
928 | | |
929 | 0 | size_t old_len = rec->core.l_qname; |
930 | 0 | size_t new_len = strlen(qname) + 1; |
931 | 0 | if (new_len < 1 || new_len > 255) return -1; |
932 | | |
933 | 0 | int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0; |
934 | |
|
935 | 0 | size_t new_data_len = rec->l_data - old_len + new_len + extranul; |
936 | 0 | if (realloc_bam_data(rec, new_data_len) < 0) return -1; |
937 | | |
938 | | // Make room |
939 | 0 | if (new_len + extranul != rec->core.l_qname) |
940 | 0 | memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname); |
941 | | // Copy in new name and pad if needed |
942 | 0 | memcpy(rec->data, qname, new_len); |
943 | 0 | int n; |
944 | 0 | for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0'; |
945 | |
|
946 | 0 | rec->l_data = new_data_len; |
947 | 0 | rec->core.l_qname = new_len + extranul; |
948 | 0 | rec->core.l_extranul = extranul; |
949 | |
|
950 | 0 | return 0; |
951 | 0 | } |
952 | | |
953 | | /******************** |
954 | | *** BAM indexing *** |
955 | | ********************/ |
956 | | |
957 | | static hts_idx_t *sam_index(htsFile *fp, int min_shift) |
958 | 0 | { |
959 | 0 | int n_lvls, i, fmt, ret; |
960 | 0 | bam1_t *b; |
961 | 0 | hts_idx_t *idx; |
962 | 0 | sam_hdr_t *h; |
963 | 0 | h = sam_hdr_read(fp); |
964 | 0 | if (h == NULL) return NULL; |
965 | 0 | if (min_shift > 0) { |
966 | 0 | hts_pos_t max_len = 0, s; |
967 | 0 | for (i = 0; i < h->n_targets; ++i) { |
968 | 0 | hts_pos_t len = sam_hdr_tid2len(h, i); |
969 | 0 | if (max_len < len) max_len = len; |
970 | 0 | } |
971 | 0 | max_len += 256; |
972 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
973 | 0 | fmt = HTS_FMT_CSI; |
974 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
975 | 0 | idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
976 | 0 | b = bam_init1(); |
977 | 0 | while ((ret = sam_read1(fp, h, b)) >= 0) { |
978 | 0 | ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)); |
979 | 0 | if (ret < 0) { // unsorted or doesn't fit |
980 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
981 | 0 | goto err; |
982 | 0 | } |
983 | 0 | } |
984 | 0 | if (ret < -1) goto err; // corrupted BAM file |
985 | | |
986 | 0 | hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)); |
987 | 0 | sam_hdr_destroy(h); |
988 | 0 | bam_destroy1(b); |
989 | 0 | return idx; |
990 | | |
991 | 0 | err: |
992 | 0 | bam_destroy1(b); |
993 | 0 | hts_idx_destroy(idx); |
994 | 0 | return NULL; |
995 | 0 | } |
996 | | |
997 | | int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads) |
998 | 0 | { |
999 | 0 | hts_idx_t *idx; |
1000 | 0 | htsFile *fp; |
1001 | 0 | int ret = 0; |
1002 | |
|
1003 | 0 | if ((fp = hts_open(fn, "r")) == 0) return -2; |
1004 | 0 | if (nthreads) |
1005 | 0 | hts_set_threads(fp, nthreads); |
1006 | |
|
1007 | 0 | switch (fp->format.format) { |
1008 | 0 | case cram: |
1009 | |
|
1010 | 0 | ret = cram_index_build(fp->fp.cram, fn, fnidx); |
1011 | 0 | break; |
1012 | | |
1013 | 0 | case bam: |
1014 | 0 | case sam: |
1015 | 0 | if (fp->format.compression != bgzf) { |
1016 | 0 | hts_log_error("%s file \"%s\" not BGZF compressed", |
1017 | 0 | fp->format.format == bam ? "BAM" : "SAM", fn); |
1018 | 0 | ret = -1; |
1019 | 0 | break; |
1020 | 0 | } |
1021 | 0 | idx = sam_index(fp, min_shift); |
1022 | 0 | if (idx) { |
1023 | 0 | ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI); |
1024 | 0 | if (ret < 0) ret = -4; |
1025 | 0 | hts_idx_destroy(idx); |
1026 | 0 | } |
1027 | 0 | else ret = -1; |
1028 | 0 | break; |
1029 | | |
1030 | 0 | default: |
1031 | 0 | ret = -3; |
1032 | 0 | break; |
1033 | 0 | } |
1034 | 0 | hts_close(fp); |
1035 | |
|
1036 | 0 | return ret; |
1037 | 0 | } |
1038 | | |
1039 | | int sam_index_build2(const char *fn, const char *fnidx, int min_shift) |
1040 | 0 | { |
1041 | 0 | return sam_index_build3(fn, fnidx, min_shift, 0); |
1042 | 0 | } |
1043 | | |
1044 | | int sam_index_build(const char *fn, int min_shift) |
1045 | 0 | { |
1046 | 0 | return sam_index_build3(fn, NULL, min_shift, 0); |
1047 | 0 | } |
1048 | | |
1049 | | // Provide bam_index_build() symbol for binary compatibility with earlier HTSlib |
1050 | | #undef bam_index_build |
1051 | | int bam_index_build(const char *fn, int min_shift) |
1052 | 0 | { |
1053 | 0 | return sam_index_build2(fn, NULL, min_shift); |
1054 | 0 | } |
1055 | | |
1056 | | // Initialise fp->idx for the current format type. |
1057 | | // This must be called after the header has been written but no other data. |
1058 | 0 | int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) { |
1059 | 0 | fp->fnidx = fnidx; |
1060 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1061 | 0 | (fp->format.format == sam && fp->format.compression == bgzf)) { |
1062 | 0 | int n_lvls, fmt = HTS_FMT_CSI; |
1063 | 0 | if (min_shift > 0) { |
1064 | 0 | int64_t max_len = 0, s; |
1065 | 0 | int i; |
1066 | 0 | for (i = 0; i < h->n_targets; ++i) |
1067 | 0 | if (max_len < h->target_len[i]) max_len = h->target_len[i]; |
1068 | 0 | max_len += 256; |
1069 | 0 | for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3); |
1070 | |
|
1071 | 0 | } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI; |
1072 | |
|
1073 | 0 | fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); |
1074 | 0 | return fp->idx ? 0 : -1; |
1075 | 0 | } |
1076 | | |
1077 | 0 | if (fp->format.format == cram) { |
1078 | 0 | fp->fp.cram->idxfp = bgzf_open(fnidx, "wg"); |
1079 | 0 | return fp->fp.cram->idxfp ? 0 : -1; |
1080 | 0 | } |
1081 | | |
1082 | 0 | return -1; |
1083 | 0 | } |
1084 | | |
1085 | | // Finishes an index. Call after the last record has been written. |
1086 | | // Returns 0 on success, <0 on failure. |
1087 | 0 | int sam_idx_save(htsFile *fp) { |
1088 | 0 | if (fp->format.format == bam || fp->format.format == bcf || |
1089 | 0 | fp->format.format == vcf || fp->format.format == sam) { |
1090 | 0 | int ret; |
1091 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
1092 | 0 | errno = -ret; |
1093 | 0 | return -1; |
1094 | 0 | } |
1095 | 0 | if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) |
1096 | 0 | return -1; |
1097 | 0 | hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); |
1098 | |
|
1099 | 0 | if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0) |
1100 | 0 | return -1; |
1101 | | |
1102 | 0 | return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx)); |
1103 | |
|
1104 | 0 | } else if (fp->format.format == cram) { |
1105 | | // flushed and closed by cram_close |
1106 | 0 | } |
1107 | | |
1108 | 0 | return 0; |
1109 | 0 | } |
1110 | | |
1111 | | static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1112 | 0 | { |
1113 | 0 | htsFile *fp = (htsFile *)fpv; |
1114 | 0 | bam1_t *b = bv; |
1115 | 0 | fp->line.l = 0; |
1116 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1117 | 0 | if (ret >= 0) { |
1118 | 0 | *tid = b->core.tid; |
1119 | 0 | *beg = b->core.pos; |
1120 | 0 | *end = bam_endpos(b); |
1121 | 0 | } |
1122 | 0 | return ret; |
1123 | 0 | } |
1124 | | |
1125 | | // This is used only with read_rest=1 iterators, so need not set tid/beg/end. |
1126 | | static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1127 | 0 | { |
1128 | 0 | htsFile *fp = (htsFile *)fpv; |
1129 | 0 | bam1_t *b = bv; |
1130 | 0 | fp->line.l = 0; |
1131 | 0 | int ret = sam_read1(fp, fp->bam_header, b); |
1132 | 0 | return ret; |
1133 | 0 | } |
1134 | | |
1135 | | // Internal (for now) func used by bam_sym_lookup. This is copied from |
1136 | | // samtools/bam.c. |
1137 | | static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) |
1138 | 0 | { |
1139 | 0 | const char *rg; |
1140 | 0 | kstring_t lib = { 0, 0, NULL }; |
1141 | 0 | rg = (char *)bam_aux_get(b, "RG"); |
1142 | |
|
1143 | 0 | if (!rg) |
1144 | 0 | return NULL; |
1145 | 0 | else |
1146 | 0 | rg++; |
1147 | | |
1148 | 0 | if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) |
1149 | 0 | return NULL; |
1150 | | |
1151 | 0 | static char LB_text[1024]; |
1152 | 0 | int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; |
1153 | |
|
1154 | 0 | memcpy(LB_text, lib.s, len); |
1155 | 0 | LB_text[len] = 0; |
1156 | |
|
1157 | 0 | free(lib.s); |
1158 | |
|
1159 | 0 | return LB_text; |
1160 | 0 | } |
1161 | | |
1162 | | |
1163 | | // Bam record pointer and SAM header combined |
1164 | | typedef struct { |
1165 | | const sam_hdr_t *h; |
1166 | | const bam1_t *b; |
1167 | | } hb_pair; |
1168 | | |
1169 | | // Looks up variable names in str and replaces them with their value. |
1170 | | // Also supports aux tags. |
1171 | | // |
1172 | | // Note the expression parser deliberately overallocates str size so it |
1173 | | // is safe to use memcmp over strcmp. |
1174 | | static int bam_sym_lookup(void *data, char *str, char **end, |
1175 | 0 | hts_expr_val_t *res) { |
1176 | 0 | hb_pair *hb = (hb_pair *)data; |
1177 | 0 | const bam1_t *b = hb->b; |
1178 | |
|
1179 | 0 | res->is_str = 0; |
1180 | 0 | switch(*str) { |
1181 | 0 | case 'c': |
1182 | 0 | if (memcmp(str, "cigar", 5) == 0) { |
1183 | 0 | *end = str+5; |
1184 | 0 | res->is_str = 1; |
1185 | 0 | ks_clear(&res->s); |
1186 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1187 | 0 | int i, n = b->core.n_cigar, r = 0; |
1188 | 0 | if (n) { |
1189 | 0 | for (i = 0; i < n; i++) { |
1190 | 0 | r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; |
1191 | 0 | r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; |
1192 | 0 | } |
1193 | 0 | r |= kputs("", &res->s) < 0; |
1194 | 0 | } else { |
1195 | 0 | r |= kputs("*", &res->s) < 0; |
1196 | 0 | } |
1197 | 0 | return r ? -1 : 0; |
1198 | 0 | } |
1199 | 0 | break; |
1200 | | |
1201 | 0 | case 'e': |
1202 | 0 | if (memcmp(str, "endpos", 6) == 0) { |
1203 | 0 | *end = str+6; |
1204 | 0 | res->d = bam_endpos(b); |
1205 | 0 | return 0; |
1206 | 0 | } |
1207 | 0 | break; |
1208 | | |
1209 | 0 | case 'f': |
1210 | 0 | if (memcmp(str, "flag", 4) == 0) { |
1211 | 0 | str = *end = str+4; |
1212 | 0 | if (*str != '.') { |
1213 | 0 | res->d = b->core.flag; |
1214 | 0 | return 0; |
1215 | 0 | } else { |
1216 | 0 | str++; |
1217 | 0 | if (!memcmp(str, "paired", 6)) { |
1218 | 0 | *end = str+6; |
1219 | 0 | res->d = b->core.flag & BAM_FPAIRED; |
1220 | 0 | return 0; |
1221 | 0 | } else if (!memcmp(str, "proper_pair", 11)) { |
1222 | 0 | *end = str+11; |
1223 | 0 | res->d = b->core.flag & BAM_FPROPER_PAIR; |
1224 | 0 | return 0; |
1225 | 0 | } else if (!memcmp(str, "unmap", 5)) { |
1226 | 0 | *end = str+5; |
1227 | 0 | res->d = b->core.flag & BAM_FUNMAP; |
1228 | 0 | return 0; |
1229 | 0 | } else if (!memcmp(str, "munmap", 6)) { |
1230 | 0 | *end = str+6; |
1231 | 0 | res->d = b->core.flag & BAM_FMUNMAP; |
1232 | 0 | return 0; |
1233 | 0 | } else if (!memcmp(str, "reverse", 7)) { |
1234 | 0 | *end = str+7; |
1235 | 0 | res->d = b->core.flag & BAM_FREVERSE; |
1236 | 0 | return 0; |
1237 | 0 | } else if (!memcmp(str, "mreverse", 8)) { |
1238 | 0 | *end = str+8; |
1239 | 0 | res->d = b->core.flag & BAM_FMREVERSE; |
1240 | 0 | return 0; |
1241 | 0 | } else if (!memcmp(str, "read1", 5)) { |
1242 | 0 | *end = str+5; |
1243 | 0 | res->d = b->core.flag & BAM_FREAD1; |
1244 | 0 | return 0; |
1245 | 0 | } else if (!memcmp(str, "read2", 5)) { |
1246 | 0 | *end = str+5; |
1247 | 0 | res->d = b->core.flag & BAM_FREAD2; |
1248 | 0 | return 0; |
1249 | 0 | } else if (!memcmp(str, "secondary", 9)) { |
1250 | 0 | *end = str+9; |
1251 | 0 | res->d = b->core.flag & BAM_FSECONDARY; |
1252 | 0 | return 0; |
1253 | 0 | } else if (!memcmp(str, "qcfail", 6)) { |
1254 | 0 | *end = str+6; |
1255 | 0 | res->d = b->core.flag & BAM_FQCFAIL; |
1256 | 0 | return 0; |
1257 | 0 | } else if (!memcmp(str, "dup", 3)) { |
1258 | 0 | *end = str+3; |
1259 | 0 | res->d = b->core.flag & BAM_FDUP; |
1260 | 0 | return 0; |
1261 | 0 | } else if (!memcmp(str, "supplementary", 13)) { |
1262 | 0 | *end = str+13; |
1263 | 0 | res->d = b->core.flag & BAM_FSUPPLEMENTARY; |
1264 | 0 | return 0; |
1265 | 0 | } else { |
1266 | 0 | hts_log_error("Unrecognised flag string"); |
1267 | 0 | return -1; |
1268 | 0 | } |
1269 | 0 | } |
1270 | 0 | } |
1271 | 0 | break; |
1272 | | |
1273 | 0 | case 'h': |
1274 | 0 | if (memcmp(str, "hclen", 5) == 0) { |
1275 | 0 | int hclen = 0; |
1276 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1277 | 0 | uint32_t ncigar = b->core.n_cigar; |
1278 | | |
1279 | | // left |
1280 | 0 | if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) |
1281 | 0 | hclen = bam_cigar_oplen(cigar[0]); |
1282 | | |
1283 | | // right |
1284 | 0 | if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP) |
1285 | 0 | hclen += bam_cigar_oplen(cigar[ncigar-1]); |
1286 | |
|
1287 | 0 | *end = str+5; |
1288 | 0 | res->d = hclen; |
1289 | 0 | return 0; |
1290 | 0 | } |
1291 | 0 | break; |
1292 | | |
1293 | 0 | case 'l': |
1294 | 0 | if (memcmp(str, "library", 7) == 0) { |
1295 | 0 | *end = str+7; |
1296 | 0 | res->is_str = 1; |
1297 | 0 | const char *lib = bam_get_library(hb->h, b); |
1298 | 0 | kputs(lib ? lib : "", ks_clear(&res->s)); |
1299 | 0 | return 0; |
1300 | 0 | } |
1301 | 0 | break; |
1302 | | |
1303 | 0 | case 'm': |
1304 | 0 | if (memcmp(str, "mapq", 4) == 0) { |
1305 | 0 | *end = str+4; |
1306 | 0 | res->d = b->core.qual; |
1307 | 0 | return 0; |
1308 | 0 | } else if (memcmp(str, "mpos", 4) == 0) { |
1309 | 0 | *end = str+4; |
1310 | 0 | res->d = b->core.mpos+1; |
1311 | 0 | return 0; |
1312 | 0 | } else if (memcmp(str, "mrname", 6) == 0) { |
1313 | 0 | *end = str+6; |
1314 | 0 | res->is_str = 1; |
1315 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1316 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1317 | 0 | return 0; |
1318 | 0 | } else if (memcmp(str, "mrefid", 6) == 0) { |
1319 | 0 | *end = str+6; |
1320 | 0 | res->d = b->core.mtid; |
1321 | 0 | return 0; |
1322 | 0 | } |
1323 | 0 | break; |
1324 | | |
1325 | 0 | case 'n': |
1326 | 0 | if (memcmp(str, "ncigar", 6) == 0) { |
1327 | 0 | *end = str+6; |
1328 | 0 | res->d = b->core.n_cigar; |
1329 | 0 | return 0; |
1330 | 0 | } |
1331 | 0 | break; |
1332 | | |
1333 | 0 | case 'p': |
1334 | 0 | if (memcmp(str, "pos", 3) == 0) { |
1335 | 0 | *end = str+3; |
1336 | 0 | res->d = b->core.pos+1; |
1337 | 0 | return 0; |
1338 | 0 | } else if (memcmp(str, "pnext", 5) == 0) { |
1339 | 0 | *end = str+5; |
1340 | 0 | res->d = b->core.mpos+1; |
1341 | 0 | return 0; |
1342 | 0 | } |
1343 | 0 | break; |
1344 | | |
1345 | 0 | case 'q': |
1346 | 0 | if (memcmp(str, "qlen", 4) == 0) { |
1347 | 0 | *end = str+4; |
1348 | 0 | res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); |
1349 | 0 | return 0; |
1350 | 0 | } else if (memcmp(str, "qname", 5) == 0) { |
1351 | 0 | *end = str+5; |
1352 | 0 | res->is_str = 1; |
1353 | 0 | kputs(bam_get_qname(b), ks_clear(&res->s)); |
1354 | 0 | return 0; |
1355 | 0 | } else if (memcmp(str, "qual", 4) == 0) { |
1356 | 0 | *end = str+4; |
1357 | 0 | ks_clear(&res->s); |
1358 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1359 | 0 | return -1; |
1360 | 0 | memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); |
1361 | 0 | res->s.l = b->core.l_qseq; |
1362 | 0 | res->is_str = 1; |
1363 | 0 | return 0; |
1364 | 0 | } |
1365 | 0 | break; |
1366 | | |
1367 | 0 | case 'r': |
1368 | 0 | if (memcmp(str, "rlen", 4) == 0) { |
1369 | 0 | *end = str+4; |
1370 | 0 | res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); |
1371 | 0 | return 0; |
1372 | 0 | } else if (memcmp(str, "rname", 5) == 0) { |
1373 | 0 | *end = str+5; |
1374 | 0 | res->is_str = 1; |
1375 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); |
1376 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1377 | 0 | return 0; |
1378 | 0 | } else if (memcmp(str, "rnext", 5) == 0) { |
1379 | 0 | *end = str+5; |
1380 | 0 | res->is_str = 1; |
1381 | 0 | const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); |
1382 | 0 | kputs(rn ? rn : "*", ks_clear(&res->s)); |
1383 | 0 | return 0; |
1384 | 0 | } else if (memcmp(str, "refid", 5) == 0) { |
1385 | 0 | *end = str+5; |
1386 | 0 | res->d = b->core.tid; |
1387 | 0 | return 0; |
1388 | 0 | } |
1389 | 0 | break; |
1390 | | |
1391 | 0 | case 's': |
1392 | 0 | if (memcmp(str, "seq", 3) == 0) { |
1393 | 0 | *end = str+3; |
1394 | 0 | ks_clear(&res->s); |
1395 | 0 | if (ks_resize(&res->s, b->core.l_qseq+1) < 0) |
1396 | 0 | return -1; |
1397 | 0 | nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); |
1398 | 0 | res->s.s[b->core.l_qseq] = 0; |
1399 | 0 | res->s.l = b->core.l_qseq; |
1400 | 0 | res->is_str = 1; |
1401 | 0 | return 0; |
1402 | 0 | } else if (memcmp(str, "sclen", 5) == 0) { |
1403 | 0 | int sclen = 0; |
1404 | 0 | uint32_t *cigar = bam_get_cigar(b); |
1405 | 0 | int ncigar = b->core.n_cigar; |
1406 | 0 | int left = 0; |
1407 | | |
1408 | | // left |
1409 | 0 | if (ncigar > 0 |
1410 | 0 | && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) |
1411 | 0 | left = 0, sclen += bam_cigar_oplen(cigar[0]); |
1412 | 0 | else if (ncigar > 1 |
1413 | 0 | && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP |
1414 | 0 | && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) |
1415 | 0 | left = 1, sclen += bam_cigar_oplen(cigar[1]); |
1416 | | |
1417 | | // right |
1418 | 0 | if (ncigar-1 > left |
1419 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) |
1420 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-1]); |
1421 | 0 | else if (ncigar-2 > left |
1422 | 0 | && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP |
1423 | 0 | && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) |
1424 | 0 | sclen += bam_cigar_oplen(cigar[ncigar-2]); |
1425 | |
|
1426 | 0 | *end = str+5; |
1427 | 0 | res->d = sclen; |
1428 | 0 | return 0; |
1429 | 0 | } |
1430 | 0 | break; |
1431 | | |
1432 | 0 | case 't': |
1433 | 0 | if (memcmp(str, "tlen", 4) == 0) { |
1434 | 0 | *end = str+4; |
1435 | 0 | res->d = b->core.isize; |
1436 | 0 | return 0; |
1437 | 0 | } |
1438 | 0 | break; |
1439 | | |
1440 | 0 | case '[': |
1441 | 0 | if (*str == '[' && str[1] && str[2] && str[3] == ']') { |
1442 | | /* aux tags */ |
1443 | 0 | *end = str+4; |
1444 | |
|
1445 | 0 | uint8_t *aux = bam_aux_get(b, str+1); |
1446 | 0 | if (aux) { |
1447 | | // we define the truth of a tag to be its presence, even if 0. |
1448 | 0 | res->is_true = 1; |
1449 | 0 | switch (*aux) { |
1450 | 0 | case 'Z': |
1451 | 0 | case 'H': |
1452 | 0 | res->is_str = 1; |
1453 | 0 | kputs((char *)aux+1, ks_clear(&res->s)); |
1454 | 0 | break; |
1455 | | |
1456 | 0 | case 'A': |
1457 | 0 | res->is_str = 1; |
1458 | 0 | kputsn((char *)aux+1, 1, ks_clear(&res->s)); |
1459 | 0 | break; |
1460 | | |
1461 | 0 | case 'i': case 'I': |
1462 | 0 | case 's': case 'S': |
1463 | 0 | case 'c': case 'C': |
1464 | 0 | res->is_str = 0; |
1465 | 0 | res->d = bam_aux2i(aux); |
1466 | 0 | break; |
1467 | | |
1468 | 0 | case 'f': |
1469 | 0 | case 'd': |
1470 | 0 | res->is_str = 0; |
1471 | 0 | res->d = bam_aux2f(aux); |
1472 | 0 | break; |
1473 | | |
1474 | 0 | default: |
1475 | 0 | hts_log_error("Aux type '%c not yet supported by filters", |
1476 | 0 | *aux); |
1477 | 0 | return -1; |
1478 | 0 | } |
1479 | 0 | return 0; |
1480 | |
|
1481 | 0 | } else { |
1482 | | // hence absent tags are always false (and strings) |
1483 | 0 | res->is_str = 1; |
1484 | 0 | res->s.l = 0; |
1485 | 0 | res->d = 0; |
1486 | 0 | res->is_true = 0; |
1487 | 0 | return 0; |
1488 | 0 | } |
1489 | 0 | } |
1490 | 0 | break; |
1491 | 0 | } |
1492 | | |
1493 | | // All successful matches in switch should return 0. |
1494 | | // So if we didn't match, it's a parse error. |
1495 | 0 | return -1; |
1496 | 0 | } |
1497 | | |
1498 | | // Returns 1 when accepted by the filter, 0 if not, -1 on error. |
1499 | | int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) |
1500 | 0 | { |
1501 | 0 | hb_pair hb = {h, b}; |
1502 | 0 | hts_expr_val_t res = HTS_EXPR_VAL_INIT; |
1503 | 0 | if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { |
1504 | 0 | hts_log_error("Couldn't process filter expression"); |
1505 | 0 | hts_expr_val_free(&res); |
1506 | 0 | return -1; |
1507 | 0 | } |
1508 | | |
1509 | 0 | int t = res.is_true; |
1510 | 0 | hts_expr_val_free(&res); |
1511 | |
|
1512 | 0 | return t; |
1513 | 0 | } |
1514 | | |
1515 | | static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) |
1516 | 0 | { |
1517 | 0 | htsFile *fp = fpv; |
1518 | 0 | bam1_t *b = bv; |
1519 | 0 | int pass_filter, ret; |
1520 | |
|
1521 | 0 | do { |
1522 | 0 | ret = cram_get_bam_seq(fp->fp.cram, &b); |
1523 | 0 | if (ret < 0) |
1524 | 0 | return cram_eof(fp->fp.cram) ? -1 : -2; |
1525 | | |
1526 | 0 | if (bam_tag2cigar(b, 1, 1) < 0) |
1527 | 0 | return -2; |
1528 | | |
1529 | 0 | *tid = b->core.tid; |
1530 | 0 | *beg = b->core.pos; |
1531 | 0 | *end = bam_endpos(b); |
1532 | |
|
1533 | 0 | if (fp->filter) { |
1534 | 0 | pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); |
1535 | 0 | if (pass_filter < 0) |
1536 | 0 | return -2; |
1537 | 0 | } else { |
1538 | 0 | pass_filter = 1; |
1539 | 0 | } |
1540 | 0 | } while (pass_filter == 0); |
1541 | | |
1542 | 0 | return ret; |
1543 | 0 | } |
1544 | | |
1545 | | static int cram_pseek(void *fp, int64_t offset, int whence) |
1546 | 0 | { |
1547 | 0 | cram_fd *fd = (cram_fd *)fp; |
1548 | |
|
1549 | 0 | if ((0 != cram_seek(fd, offset, SEEK_SET)) |
1550 | 0 | && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR))) |
1551 | 0 | return -1; |
1552 | | |
1553 | 0 | fd->curr_position = offset; |
1554 | |
|
1555 | 0 | if (fd->ctr) { |
1556 | 0 | cram_free_container(fd->ctr); |
1557 | 0 | if (fd->ctr_mt && fd->ctr_mt != fd->ctr) |
1558 | 0 | cram_free_container(fd->ctr_mt); |
1559 | |
|
1560 | 0 | fd->ctr = NULL; |
1561 | 0 | fd->ctr_mt = NULL; |
1562 | 0 | fd->ooc = 0; |
1563 | 0 | } |
1564 | |
|
1565 | 0 | return 0; |
1566 | 0 | } |
1567 | | |
1568 | | /* |
1569 | | * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only |
1570 | | * after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered |
1571 | | * container previously fetched. It was designed like this to integrate with the functionality |
1572 | | * of the iterator stepping logic. |
1573 | | */ |
1574 | | |
1575 | | static int64_t cram_ptell(void *fp) |
1576 | 0 | { |
1577 | 0 | cram_fd *fd = (cram_fd *)fp; |
1578 | 0 | cram_container *c; |
1579 | 0 | cram_slice *s; |
1580 | 0 | int64_t ret = -1L; |
1581 | |
|
1582 | 0 | if (fd) { |
1583 | 0 | if ((c = fd->ctr) != NULL) { |
1584 | 0 | if ((s = c->slice) != NULL && s->max_rec) { |
1585 | 0 | if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1)) |
1586 | 0 | fd->curr_position += c->offset + c->length; |
1587 | 0 | } |
1588 | 0 | } |
1589 | 0 | ret = fd->curr_position; |
1590 | 0 | } |
1591 | |
|
1592 | 0 | return ret; |
1593 | 0 | } |
1594 | | |
1595 | | static int bam_pseek(void *fp, int64_t offset, int whence) |
1596 | 0 | { |
1597 | 0 | BGZF *fd = (BGZF *)fp; |
1598 | |
|
1599 | 0 | return bgzf_seek(fd, offset, whence); |
1600 | 0 | } |
1601 | | |
1602 | | static int64_t bam_ptell(void *fp) |
1603 | 0 | { |
1604 | 0 | BGZF *fd = (BGZF *)fp; |
1605 | 0 | if (!fd) |
1606 | 0 | return -1L; |
1607 | | |
1608 | 0 | return bgzf_tell(fd); |
1609 | 0 | } |
1610 | | |
1611 | | |
1612 | | |
1613 | | static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1614 | 0 | { |
1615 | 0 | switch (fp->format.format) { |
1616 | 0 | case bam: |
1617 | 0 | case sam: |
1618 | 0 | return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags); |
1619 | | |
1620 | 0 | case cram: { |
1621 | 0 | if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL; |
1622 | | |
1623 | | // Cons up a fake "index" just pointing at the associated cram_fd: |
1624 | 0 | hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t)); |
1625 | 0 | if (idx == NULL) return NULL; |
1626 | 0 | idx->fmt = HTS_FMT_CRAI; |
1627 | 0 | idx->cram = fp->fp.cram; |
1628 | 0 | return (hts_idx_t *) idx; |
1629 | 0 | } |
1630 | | |
1631 | 0 | default: |
1632 | 0 | return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t |
1633 | 0 | } |
1634 | 0 | } |
1635 | | |
1636 | | hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags) |
1637 | 0 | { |
1638 | 0 | return index_load(fp, fn, fnidx, flags); |
1639 | 0 | } |
1640 | | |
1641 | 0 | hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) { |
1642 | 0 | return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); |
1643 | 0 | } |
1644 | | |
1645 | | hts_idx_t *sam_index_load(htsFile *fp, const char *fn) |
1646 | 0 | { |
1647 | 0 | return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); |
1648 | 0 | } |
1649 | | |
1650 | | static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) |
1651 | 0 | { |
1652 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1653 | 0 | hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); |
1654 | 0 | if (iter == NULL) return NULL; |
1655 | | |
1656 | | // Cons up a dummy iterator for which hts_itr_next() will simply invoke |
1657 | | // the readrec function: |
1658 | 0 | iter->is_cram = 1; |
1659 | 0 | iter->read_rest = 1; |
1660 | 0 | iter->off = NULL; |
1661 | 0 | iter->bins.a = NULL; |
1662 | 0 | iter->readrec = readrec; |
1663 | |
|
1664 | 0 | if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) { |
1665 | 0 | cram_range r = { tid, beg+1, end }; |
1666 | 0 | int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r); |
1667 | |
|
1668 | 0 | iter->curr_off = 0; |
1669 | | // The following fields are not required by hts_itr_next(), but are |
1670 | | // filled in in case user code wants to look at them. |
1671 | 0 | iter->tid = tid; |
1672 | 0 | iter->beg = beg; |
1673 | 0 | iter->end = end; |
1674 | |
|
1675 | 0 | switch (ret) { |
1676 | 0 | case 0: |
1677 | 0 | break; |
1678 | | |
1679 | 0 | case -2: |
1680 | | // No data vs this ref, so mark iterator as completed. |
1681 | | // Same as HTS_IDX_NONE. |
1682 | 0 | iter->finished = 1; |
1683 | 0 | break; |
1684 | | |
1685 | 0 | default: |
1686 | 0 | free(iter); |
1687 | 0 | return NULL; |
1688 | 0 | } |
1689 | 0 | } |
1690 | 0 | else switch (tid) { |
1691 | 0 | case HTS_IDX_REST: |
1692 | 0 | iter->curr_off = 0; |
1693 | 0 | break; |
1694 | 0 | case HTS_IDX_NONE: |
1695 | 0 | iter->curr_off = 0; |
1696 | 0 | iter->finished = 1; |
1697 | 0 | break; |
1698 | 0 | default: |
1699 | 0 | hts_log_error("Query with tid=%d not implemented for CRAM files", tid); |
1700 | 0 | abort(); |
1701 | 0 | break; |
1702 | 0 | } |
1703 | | |
1704 | 0 | return iter; |
1705 | 0 | } |
1706 | | |
1707 | | hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) |
1708 | 0 | { |
1709 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1710 | 0 | if (idx == NULL) |
1711 | 0 | return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest); |
1712 | 0 | else if (cidx->fmt == HTS_FMT_CRAI) |
1713 | 0 | return cram_itr_query(idx, tid, beg, end, sam_readrec); |
1714 | 0 | else |
1715 | 0 | return hts_itr_query(idx, tid, beg, end, sam_readrec); |
1716 | 0 | } |
1717 | | |
1718 | | static int cram_name2id(void *fdv, const char *ref) |
1719 | 0 | { |
1720 | 0 | cram_fd *fd = (cram_fd *) fdv; |
1721 | 0 | return sam_hdr_name2tid(fd->header, ref); |
1722 | 0 | } |
1723 | | |
1724 | | hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region) |
1725 | 0 | { |
1726 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1727 | 0 | return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr, |
1728 | 0 | cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query, |
1729 | 0 | sam_readrec); |
1730 | 0 | } |
1731 | | |
1732 | | hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount) |
1733 | 0 | { |
1734 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1735 | 0 | hts_reglist_t *r_list = NULL; |
1736 | 0 | int r_count = 0; |
1737 | |
|
1738 | 0 | if (!cidx || !hdr) |
1739 | 0 | return NULL; |
1740 | | |
1741 | 0 | hts_itr_t *itr = NULL; |
1742 | 0 | if (cidx->fmt == HTS_FMT_CRAI) { |
1743 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id); |
1744 | 0 | if (!r_list) |
1745 | 0 | return NULL; |
1746 | 0 | itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram, |
1747 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1748 | 0 | } else { |
1749 | 0 | r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id)); |
1750 | 0 | if (!r_list) |
1751 | 0 | return NULL; |
1752 | 0 | itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr, |
1753 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1754 | 0 | } |
1755 | | |
1756 | 0 | if (!itr) |
1757 | 0 | hts_reglist_free(r_list, r_count); |
1758 | |
|
1759 | 0 | return itr; |
1760 | 0 | } |
1761 | | |
1762 | | hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount) |
1763 | 0 | { |
1764 | 0 | const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; |
1765 | |
|
1766 | 0 | if(!cidx || !hdr || !reglist) |
1767 | 0 | return NULL; |
1768 | | |
1769 | 0 | if (cidx->fmt == HTS_FMT_CRAI) |
1770 | 0 | return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram, |
1771 | 0 | hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell); |
1772 | 0 | else |
1773 | 0 | return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr, |
1774 | 0 | hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell); |
1775 | 0 | } |
1776 | | |
1777 | | /********************** |
1778 | | *** SAM header I/O *** |
1779 | | **********************/ |
1780 | | |
1781 | | #include "htslib/kseq.h" |
1782 | | #include "htslib/kstring.h" |
1783 | | |
1784 | | sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) |
1785 | 0 | { |
1786 | 0 | sam_hdr_t *bh = sam_hdr_init(); |
1787 | 0 | if (!bh) return NULL; |
1788 | | |
1789 | 0 | if (sam_hdr_add_lines(bh, text, l_text) != 0) { |
1790 | 0 | sam_hdr_destroy(bh); |
1791 | 0 | return NULL; |
1792 | 0 | } |
1793 | | |
1794 | 0 | return bh; |
1795 | 0 | } |
1796 | | |
1797 | 563k | static int valid_sam_header_type(const char *s) { |
1798 | 563k | if (s[0] != '@') return 0; |
1799 | 563k | switch (s[1]) { |
1800 | 2.99k | case 'H': |
1801 | 2.99k | return s[2] == 'D' && s[3] == '\t'; |
1802 | 48 | case 'S': |
1803 | 48 | return s[2] == 'Q' && s[3] == '\t'; |
1804 | 545k | case 'R': |
1805 | 548k | case 'P': |
1806 | 548k | return s[2] == 'G' && s[3] == '\t'; |
1807 | 11.5k | case 'C': |
1808 | 11.5k | return s[2] == 'O'; |
1809 | 563k | } |
1810 | 81 | return 0; |
1811 | 563k | } |
1812 | | |
1813 | | // Minimal sanitisation of a header to ensure. |
1814 | | // - null terminated string. |
1815 | | // - all lines start with @ (also implies no blank lines). |
1816 | | // |
1817 | | // Much more could be done, but currently is not, including: |
1818 | | // - checking header types are known (HD, SQ, etc). |
1819 | | // - syntax (eg checking tab separated fields). |
1820 | | // - validating n_targets matches @SQ records. |
1821 | | // - validating target lengths against @SQ records. |
1822 | 32.2k | static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { |
1823 | 32.2k | if (!h) |
1824 | 720 | return NULL; |
1825 | | |
1826 | | // Special case for empty headers. |
1827 | 31.5k | if (h->l_text == 0) |
1828 | 10.2k | return h; |
1829 | | |
1830 | 21.3k | size_t i; |
1831 | 21.3k | unsigned int lnum = 0; |
1832 | 21.3k | char *cp = h->text, last = '\n'; |
1833 | 64.3M | for (i = 0; i < h->l_text; i++) { |
1834 | | // NB: l_text excludes terminating nul. This finds early ones. |
1835 | 64.2M | if (cp[i] == 0) |
1836 | 6.26k | break; |
1837 | | |
1838 | | // Error on \n[^@], including duplicate newlines |
1839 | 64.2M | if (last == '\n') { |
1840 | 364k | lnum++; |
1841 | 364k | if (cp[i] != '@') { |
1842 | 3 | hts_log_error("Malformed SAM header at line %u", lnum); |
1843 | 3 | sam_hdr_destroy(h); |
1844 | 3 | return NULL; |
1845 | 3 | } |
1846 | 364k | } |
1847 | | |
1848 | 64.2M | last = cp[i]; |
1849 | 64.2M | } |
1850 | | |
1851 | 21.3k | if (i < h->l_text) { // Early nul found. Complain if not just padding. |
1852 | 6.26k | size_t j = i; |
1853 | 36.1k | while (j < h->l_text && cp[j] == '\0') j++; |
1854 | 6.26k | if (j < h->l_text) |
1855 | 5.98k | hts_log_warning("Unexpected NUL character in header. Possibly truncated"); |
1856 | 6.26k | } |
1857 | | |
1858 | | // Add trailing newline and/or trailing nul if required. |
1859 | 21.3k | if (last != '\n') { |
1860 | 6.00k | hts_log_warning("Missing trailing newline on SAM header. Possibly truncated"); |
1861 | | |
1862 | 6.00k | if (h->l_text < 2 || i >= h->l_text - 2) { |
1863 | 1.08k | if (h->l_text >= SIZE_MAX - 2) { |
1864 | 0 | hts_log_error("No room for extra newline"); |
1865 | 0 | sam_hdr_destroy(h); |
1866 | 0 | return NULL; |
1867 | 0 | } |
1868 | | |
1869 | 1.08k | cp = realloc(h->text, (size_t) h->l_text+2); |
1870 | 1.08k | if (!cp) { |
1871 | 0 | sam_hdr_destroy(h); |
1872 | 0 | return NULL; |
1873 | 0 | } |
1874 | 1.08k | h->text = cp; |
1875 | 1.08k | } |
1876 | 6.00k | cp[i++] = '\n'; |
1877 | | |
1878 | | // l_text may be larger already due to multiple nul padding |
1879 | 6.00k | if (h->l_text < i) |
1880 | 39 | h->l_text = i; |
1881 | 6.00k | cp[h->l_text] = '\0'; |
1882 | 6.00k | } |
1883 | | |
1884 | 21.3k | return h; |
1885 | 21.3k | } |
1886 | | |
1887 | 2.47k | static void known_stderr(const char *tool, const char *advice) { |
1888 | 2.47k | hts_log_warning("SAM file corrupted by embedded %s error/log message", tool); |
1889 | 2.47k | hts_log_warning("%s", advice); |
1890 | 2.47k | } |
1891 | | |
1892 | 33.2k | static void warn_if_known_stderr(const char *line) { |
1893 | 33.2k | if (strstr(line, "M::bwa_idx_load_from_disk") != NULL) |
1894 | 759 | known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`"); |
1895 | 32.5k | else if (strstr(line, "M::mem_pestat") != NULL) |
1896 | 1.39k | known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`"); |
1897 | 31.1k | else if (strstr(line, "loaded/built the index") != NULL) |
1898 | 321 | known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`"); |
1899 | 33.2k | } |
1900 | | |
1901 | 21.0k | static sam_hdr_t *sam_hdr_create(htsFile* fp) { |
1902 | 21.0k | kstring_t str = { 0, 0, NULL }; |
1903 | 21.0k | khint_t k; |
1904 | 21.0k | sam_hdr_t* h = sam_hdr_init(); |
1905 | 21.0k | const char *q, *r; |
1906 | 21.0k | char* sn = NULL; |
1907 | 21.0k | khash_t(s2i) *d = kh_init(s2i); |
1908 | 21.0k | khash_t(s2i) *long_refs = NULL; |
1909 | 21.0k | if (!h || !d) |
1910 | 0 | goto error; |
1911 | | |
1912 | 21.0k | int ret, has_SQ = 0; |
1913 | 21.0k | int next_c = '@'; |
1914 | 699k | while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) { |
1915 | 678k | if (fp->line.s[0] != '@') |
1916 | 39 | break; |
1917 | | |
1918 | 678k | if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) { |
1919 | 114k | has_SQ = 1; |
1920 | 114k | hts_pos_t ln = -1; |
1921 | 309k | for (q = fp->line.s + 4;; ++q) { |
1922 | 309k | if (strncmp(q, "SN:", 3) == 0) { |
1923 | 111k | q += 3; |
1924 | 1.06G | for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r); |
1925 | | |
1926 | 111k | if (sn) { |
1927 | 22.1k | hts_log_warning("SQ header line has more than one SN: tag"); |
1928 | 22.1k | free(sn); |
1929 | 22.1k | } |
1930 | 111k | sn = (char*)calloc(r - q + 1, 1); |
1931 | 111k | if (!sn) |
1932 | 0 | goto error; |
1933 | | |
1934 | 111k | strncpy(sn, q, r - q); |
1935 | 111k | q = r; |
1936 | 198k | } else { |
1937 | 198k | if (strncmp(q, "LN:", 3) == 0) |
1938 | 102k | ln = strtoll(q + 3, (char**)&q, 10); |
1939 | 198k | } |
1940 | | |
1941 | 37.9M | while (*q != '\t' && *q != '\n' && *q != '\0') |
1942 | 37.6M | ++q; |
1943 | 309k | if (*q == '\0' || *q == '\n') |
1944 | 114k | break; |
1945 | 309k | } |
1946 | 114k | if (sn) { |
1947 | 88.8k | if (ln >= 0) { |
1948 | 81.6k | int absent; |
1949 | 81.6k | k = kh_put(s2i, d, sn, &absent); |
1950 | 81.6k | if (absent < 0) |
1951 | 0 | goto error; |
1952 | | |
1953 | 81.6k | if (!absent) { |
1954 | 36.5k | hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn); |
1955 | 36.5k | free(sn); |
1956 | 45.1k | } else { |
1957 | 45.1k | sn = NULL; |
1958 | 45.1k | if (ln >= UINT32_MAX) { |
1959 | | // Stash away ref length that |
1960 | | // doesn't fit in target_len array |
1961 | 16.3k | int k2; |
1962 | 16.3k | if (!long_refs) { |
1963 | 963 | long_refs = kh_init(s2i); |
1964 | 963 | if (!long_refs) |
1965 | 0 | goto error; |
1966 | 963 | } |
1967 | 16.3k | k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); |
1968 | 16.3k | if (absent < 0) |
1969 | 0 | goto error; |
1970 | 16.3k | kh_val(long_refs, k2) = ln; |
1971 | 16.3k | kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 |
1972 | 16.3k | | UINT32_MAX); |
1973 | 28.7k | } else { |
1974 | 28.7k | kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; |
1975 | 28.7k | } |
1976 | 45.1k | } |
1977 | 81.6k | } else { |
1978 | 7.20k | hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); |
1979 | 7.20k | warn_if_known_stderr(fp->line.s); |
1980 | 7.20k | free(sn); |
1981 | 7.20k | } |
1982 | 88.8k | } else { |
1983 | 25.8k | hts_log_warning("Ignored @SQ line with missing SN: tag"); |
1984 | 25.8k | warn_if_known_stderr(fp->line.s); |
1985 | 25.8k | } |
1986 | 114k | sn = NULL; |
1987 | 114k | } |
1988 | 563k | else if (!valid_sam_header_type(fp->line.s)) { |
1989 | 186 | hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO"); |
1990 | 186 | warn_if_known_stderr(fp->line.s); |
1991 | 186 | goto error; |
1992 | 186 | } |
1993 | | |
1994 | 678k | if (kputsn(fp->line.s, fp->line.l, &str) < 0) |
1995 | 0 | goto error; |
1996 | | |
1997 | 678k | if (kputc('\n', &str) < 0) |
1998 | 0 | goto error; |
1999 | | |
2000 | 678k | if (fp->is_bgzf) { |
2001 | 539k | next_c = bgzf_peek(fp->fp.bgzf); |
2002 | 539k | } else { |
2003 | 138k | unsigned char nc; |
2004 | 138k | ssize_t pret = hpeek(fp->fp.hfile, &nc, 1); |
2005 | 138k | next_c = pret > 0 ? nc : pret - 1; |
2006 | 138k | } |
2007 | 678k | if (next_c < -1) |
2008 | 3 | goto error; |
2009 | 678k | } |
2010 | 20.8k | if (next_c != '@') |
2011 | 20.8k | fp->line.l = 0; |
2012 | | |
2013 | 20.8k | if (ret < -1) |
2014 | 27 | goto error; |
2015 | | |
2016 | 20.8k | if (!has_SQ && fp->fn_aux) { |
2017 | 0 | kstring_t line = { 0, 0, NULL }; |
2018 | | |
2019 | | /* The reference index (.fai) is actually needed here */ |
2020 | 0 | char *fai_fn = fp->fn_aux; |
2021 | 0 | char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM); |
2022 | 0 | if (fn_delim) |
2023 | 0 | fai_fn = fn_delim + strlen(HTS_IDX_DELIM); |
2024 | |
|
2025 | 0 | hFILE* f = hopen(fai_fn, "r"); |
2026 | 0 | int e = 0, absent; |
2027 | 0 | if (f == NULL) |
2028 | 0 | goto error; |
2029 | | |
2030 | 0 | while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) { |
2031 | 0 | char* tab = strchr(line.s, '\t'); |
2032 | 0 | hts_pos_t ln; |
2033 | |
|
2034 | 0 | if (tab == NULL) |
2035 | 0 | continue; |
2036 | | |
2037 | 0 | sn = (char*)calloc(tab-line.s+1, 1); |
2038 | 0 | if (!sn) { |
2039 | 0 | e = 1; |
2040 | 0 | break; |
2041 | 0 | } |
2042 | 0 | memcpy(sn, line.s, tab-line.s); |
2043 | 0 | k = kh_put(s2i, d, sn, &absent); |
2044 | 0 | if (absent < 0) { |
2045 | 0 | e = 1; |
2046 | 0 | break; |
2047 | 0 | } |
2048 | | |
2049 | 0 | ln = strtoll(tab, NULL, 10); |
2050 | |
|
2051 | 0 | if (!absent) { |
2052 | 0 | hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn); |
2053 | 0 | free(sn); |
2054 | 0 | sn = NULL; |
2055 | 0 | } else { |
2056 | 0 | sn = NULL; |
2057 | 0 | if (ln >= UINT32_MAX) { |
2058 | | // Stash away ref length that |
2059 | | // doesn't fit in target_len array |
2060 | 0 | khint_t k2; |
2061 | 0 | int absent = -1; |
2062 | 0 | if (!long_refs) { |
2063 | 0 | long_refs = kh_init(s2i); |
2064 | 0 | if (!long_refs) { |
2065 | 0 | e = 1; |
2066 | 0 | break; |
2067 | 0 | } |
2068 | 0 | } |
2069 | 0 | k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); |
2070 | 0 | if (absent < 0) { |
2071 | 0 | e = 1; |
2072 | 0 | break; |
2073 | 0 | } |
2074 | 0 | kh_val(long_refs, k2) = ln; |
2075 | 0 | kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 |
2076 | 0 | | UINT32_MAX); |
2077 | 0 | } else { |
2078 | 0 | kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; |
2079 | 0 | } |
2080 | 0 | has_SQ = 1; |
2081 | 0 | } |
2082 | | |
2083 | 0 | e |= kputs("@SQ\tSN:", &str) < 0; |
2084 | 0 | e |= kputsn(line.s, tab - line.s, &str) < 0; |
2085 | 0 | e |= kputs("\tLN:", &str) < 0; |
2086 | 0 | e |= kputll(ln, &str) < 0; |
2087 | 0 | e |= kputc('\n', &str) < 0; |
2088 | 0 | if (e) |
2089 | 0 | break; |
2090 | 0 | } |
2091 | |
|
2092 | 0 | ks_free(&line); |
2093 | 0 | if (hclose(f) != 0) { |
2094 | 0 | hts_log_error("Error on closing %s", fai_fn); |
2095 | 0 | e = 1; |
2096 | 0 | } |
2097 | 0 | if (e) |
2098 | 0 | goto error; |
2099 | 0 | } |
2100 | | |
2101 | 20.8k | if (has_SQ) { |
2102 | | // Populate the targets array |
2103 | 12.3k | h->n_targets = kh_size(d); |
2104 | | |
2105 | 12.3k | h->target_name = (char**) malloc(sizeof(char*) * h->n_targets); |
2106 | 12.3k | if (!h->target_name) { |
2107 | 0 | h->n_targets = 0; |
2108 | 0 | goto error; |
2109 | 0 | } |
2110 | | |
2111 | 12.3k | h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets); |
2112 | 12.3k | if (!h->target_len) { |
2113 | 0 | h->n_targets = 0; |
2114 | 0 | goto error; |
2115 | 0 | } |
2116 | | |
2117 | 117k | for (k = kh_begin(d); k != kh_end(d); ++k) { |
2118 | 104k | if (!kh_exist(d, k)) |
2119 | 61.5k | continue; |
2120 | | |
2121 | 43.1k | h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k); |
2122 | 43.1k | h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL; |
2123 | 43.1k | kh_val(d, k) >>= 32; |
2124 | 43.1k | } |
2125 | 12.3k | } |
2126 | | |
2127 | | // Repurpose sdict to hold any references longer than UINT32_MAX |
2128 | 20.8k | h->sdict = long_refs; |
2129 | | |
2130 | 20.8k | kh_destroy(s2i, d); |
2131 | | |
2132 | 20.8k | if (str.l == 0) |
2133 | 39 | kputsn("", 0, &str); |
2134 | 20.8k | h->l_text = str.l; |
2135 | 20.8k | h->text = ks_release(&str); |
2136 | 20.8k | fp->bam_header = sam_hdr_sanitise(h); |
2137 | 20.8k | fp->bam_header->ref_count = 1; |
2138 | | |
2139 | 20.8k | return fp->bam_header; |
2140 | | |
2141 | 216 | error: |
2142 | 216 | if (h && d && (!h->target_name || !h->target_len)) { |
2143 | 3.61k | for (k = kh_begin(d); k != kh_end(d); ++k) |
2144 | 3.39k | if (kh_exist(d, k)) free((void *)kh_key(d, k)); |
2145 | 216 | } |
2146 | 216 | sam_hdr_destroy(h); |
2147 | 216 | ks_free(&str); |
2148 | 216 | kh_destroy(s2i, d); |
2149 | 216 | kh_destroy(s2i, long_refs); |
2150 | 216 | if (sn) free(sn); |
2151 | 216 | return NULL; |
2152 | 20.8k | } |
2153 | | |
2154 | | sam_hdr_t *sam_hdr_read(htsFile *fp) |
2155 | 35.9k | { |
2156 | 35.9k | if (!fp) { |
2157 | 0 | errno = EINVAL; |
2158 | 0 | return NULL; |
2159 | 0 | } |
2160 | | |
2161 | 35.9k | switch (fp->format.format) { |
2162 | 2.91k | case bam: |
2163 | 2.91k | return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf)); |
2164 | | |
2165 | 8.50k | case cram: |
2166 | 8.50k | return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header)); |
2167 | | |
2168 | 21.0k | case sam: |
2169 | 21.0k | return sam_hdr_create(fp); |
2170 | | |
2171 | 396 | case fastq_format: |
2172 | 3.49k | case fasta_format: |
2173 | 3.49k | return sam_hdr_init(); |
2174 | | |
2175 | 0 | case empty_format: |
2176 | 0 | errno = EPIPE; |
2177 | 0 | return NULL; |
2178 | | |
2179 | 0 | default: |
2180 | 0 | errno = EFTYPE; |
2181 | 0 | return NULL; |
2182 | 35.9k | } |
2183 | 35.9k | } |
2184 | | |
2185 | | int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) |
2186 | 35.0k | { |
2187 | 35.0k | if (!fp || !h) { |
2188 | 0 | errno = EINVAL; |
2189 | 0 | return -1; |
2190 | 0 | } |
2191 | | |
2192 | 35.0k | switch (fp->format.format) { |
2193 | 11.6k | case binary_format: |
2194 | 11.6k | fp->format.category = sequence_data; |
2195 | 11.6k | fp->format.format = bam; |
2196 | | /* fall-through */ |
2197 | 11.6k | case bam: |
2198 | 11.6k | if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1; |
2199 | 11.6k | break; |
2200 | | |
2201 | 11.6k | case cram: { |
2202 | 11.6k | cram_fd *fd = fp->fp.cram; |
2203 | 11.6k | if (cram_set_header2(fd, h) < 0) return -1; |
2204 | 10.5k | if (fp->fn_aux) |
2205 | 0 | cram_load_reference(fd, fp->fn_aux); |
2206 | 10.5k | if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1; |
2207 | 10.5k | } |
2208 | 10.5k | break; |
2209 | | |
2210 | 11.6k | case text_format: |
2211 | 11.6k | fp->format.category = sequence_data; |
2212 | 11.6k | fp->format.format = sam; |
2213 | | /* fall-through */ |
2214 | 11.6k | case sam: { |
2215 | 11.6k | if (!h->hrecs && !h->text) |
2216 | 0 | return 0; |
2217 | 11.6k | char *text; |
2218 | 11.6k | kstring_t hdr_ks = { 0, 0, NULL }; |
2219 | 11.6k | size_t l_text; |
2220 | 11.6k | ssize_t bytes; |
2221 | 11.6k | int r = 0, no_sq = 0; |
2222 | | |
2223 | 11.6k | if (h->hrecs) { |
2224 | 10.5k | if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) |
2225 | 0 | return -1; |
2226 | 10.5k | text = hdr_ks.s; |
2227 | 10.5k | l_text = hdr_ks.l; |
2228 | 10.5k | } else { |
2229 | 1.10k | const char *p = NULL; |
2230 | 1.37k | do { |
2231 | 1.37k | const char *q = p == NULL ? h->text : p + 4; |
2232 | 1.37k | p = strstr(q, "@SQ\t"); |
2233 | 1.37k | } while (!(p == NULL || p == h->text || *(p - 1) == '\n')); |
2234 | 1.10k | no_sq = p == NULL; |
2235 | 1.10k | text = h->text; |
2236 | 1.10k | l_text = h->l_text; |
2237 | 1.10k | } |
2238 | | |
2239 | 11.6k | if (fp->is_bgzf) { |
2240 | 0 | bytes = bgzf_write(fp->fp.bgzf, text, l_text); |
2241 | 11.6k | } else { |
2242 | 11.6k | bytes = hwrite(fp->fp.hfile, text, l_text); |
2243 | 11.6k | } |
2244 | 11.6k | free(hdr_ks.s); |
2245 | 11.6k | if (bytes != l_text) |
2246 | 0 | return -1; |
2247 | | |
2248 | 11.6k | if (no_sq) { |
2249 | 653 | int i; |
2250 | 3.33k | for (i = 0; i < h->n_targets; ++i) { |
2251 | 2.68k | fp->line.l = 0; |
2252 | 2.68k | r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0; |
2253 | 2.68k | r |= kputs(h->target_name[i], &fp->line) < 0; |
2254 | 2.68k | r |= kputsn("\tLN:", 4, &fp->line) < 0; |
2255 | 2.68k | r |= kputw(h->target_len[i], &fp->line) < 0; |
2256 | 2.68k | r |= kputc('\n', &fp->line) < 0; |
2257 | 2.68k | if (r != 0) |
2258 | 0 | return -1; |
2259 | | |
2260 | 2.68k | if (fp->is_bgzf) { |
2261 | 0 | bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); |
2262 | 2.68k | } else { |
2263 | 2.68k | bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); |
2264 | 2.68k | } |
2265 | 2.68k | if (bytes != fp->line.l) |
2266 | 0 | return -1; |
2267 | 2.68k | } |
2268 | 653 | } |
2269 | 11.6k | if (fp->is_bgzf) { |
2270 | 0 | if (bgzf_flush(fp->fp.bgzf) != 0) return -1; |
2271 | 11.6k | } else { |
2272 | 11.6k | if (hflush(fp->fp.hfile) != 0) return -1; |
2273 | 11.6k | } |
2274 | 11.6k | } |
2275 | 11.6k | break; |
2276 | | |
2277 | 11.6k | case fastq_format: |
2278 | 0 | case fasta_format: |
2279 | | // Nothing to output; FASTQ has no file headers. |
2280 | 0 | break; |
2281 | | |
2282 | 0 | default: |
2283 | 0 | errno = EBADF; |
2284 | 0 | return -1; |
2285 | 35.0k | } |
2286 | 33.9k | return 0; |
2287 | 35.0k | } |
2288 | | |
2289 | | static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2290 | 0 | { |
2291 | 0 | char *p, *q, *beg = NULL, *end = NULL, *newtext; |
2292 | 0 | size_t new_l_text; |
2293 | 0 | if (!h || !key) |
2294 | 0 | return -1; |
2295 | | |
2296 | 0 | if (h->l_text > 3) { |
2297 | 0 | if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists |
2298 | 0 | if ((p = strchr(h->text, '\n')) == 0) return -1; |
2299 | 0 | *p = '\0'; // for strstr call |
2300 | |
|
2301 | 0 | char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' }; |
2302 | |
|
2303 | 0 | if ((q = strstr(h->text, tmp)) != 0) { // key exists |
2304 | 0 | *p = '\n'; // change back |
2305 | | |
2306 | | // mark the key:val |
2307 | 0 | beg = q; |
2308 | 0 | for (q += 4; *q != '\n' && *q != '\t'; ++q); |
2309 | 0 | end = q; |
2310 | |
|
2311 | 0 | if (val && (strncmp(beg + 4, val, end - beg - 4) == 0) |
2312 | 0 | && strlen(val) == end - beg - 4) |
2313 | 0 | return 0; // val is the same, no need to change |
2314 | |
|
2315 | 0 | } else { |
2316 | 0 | beg = end = p; |
2317 | 0 | *p = '\n'; |
2318 | 0 | } |
2319 | 0 | } |
2320 | 0 | } |
2321 | 0 | if (beg == NULL) { // no @HD |
2322 | 0 | new_l_text = h->l_text; |
2323 | 0 | if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9) |
2324 | 0 | return -1; |
2325 | 0 | new_l_text += strlen(SAM_FORMAT_VERSION) + 8; |
2326 | 0 | if (val) { |
2327 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2328 | 0 | return -1; |
2329 | 0 | new_l_text += strlen(val) + 4; |
2330 | 0 | } |
2331 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2332 | 0 | if (!newtext) return -1; |
2333 | | |
2334 | 0 | if (val) |
2335 | 0 | snprintf(newtext, new_l_text + 1, |
2336 | 0 | "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text); |
2337 | 0 | else |
2338 | 0 | snprintf(newtext, new_l_text + 1, |
2339 | 0 | "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text); |
2340 | 0 | } else { // has @HD but different or no key |
2341 | 0 | new_l_text = (beg - h->text) + (h->text + h->l_text - end); |
2342 | 0 | if (val) { |
2343 | 0 | if (new_l_text > SIZE_MAX - strlen(val) - 5) |
2344 | 0 | return -1; |
2345 | 0 | new_l_text += strlen(val) + 4; |
2346 | 0 | } |
2347 | 0 | newtext = (char*)malloc(new_l_text + 1); |
2348 | 0 | if (!newtext) return -1; |
2349 | | |
2350 | 0 | if (val) { |
2351 | 0 | snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s", |
2352 | 0 | (int) (beg - h->text), h->text, key, val, end); |
2353 | 0 | } else { //delete key |
2354 | 0 | snprintf(newtext, new_l_text + 1, "%.*s%s", |
2355 | 0 | (int) (beg - h->text), h->text, end); |
2356 | 0 | } |
2357 | 0 | } |
2358 | 0 | free(h->text); |
2359 | 0 | h->text = newtext; |
2360 | 0 | h->l_text = new_l_text; |
2361 | 0 | return 0; |
2362 | 0 | } |
2363 | | |
2364 | | |
2365 | | int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val) |
2366 | 0 | { |
2367 | 0 | if (!h || !key) |
2368 | 0 | return -1; |
2369 | | |
2370 | 0 | if (!h->hrecs) |
2371 | 0 | return old_sam_hdr_change_HD(h, key, val); |
2372 | | |
2373 | 0 | if (val) { |
2374 | 0 | if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0) |
2375 | 0 | return -1; |
2376 | 0 | } else { |
2377 | 0 | if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0) |
2378 | 0 | return -1; |
2379 | 0 | } |
2380 | 0 | return sam_hdr_rebuild(h); |
2381 | 0 | } |
2382 | | /********************** |
2383 | | *** SAM record I/O *** |
2384 | | **********************/ |
2385 | | |
2386 | | // The speed of this code can vary considerably depending on minor code |
2387 | | // changes elsewhere as some of the tight loops are particularly prone to |
2388 | | // speed changes when the instruction blocks are split over a 32-byte |
2389 | | // boundary. To protect against this, we explicitly specify an alignment |
2390 | | // for this function. If this is insufficient, we may also wish to |
2391 | | // consider alignment of blocks within this function via |
2392 | | // __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents. |
2393 | | // However it's not very portable. |
2394 | | // Instead we break into separate functions so we can explicitly specify |
2395 | | // use __attribute__((aligned(32))) instead and force consistent loop |
2396 | | // alignment. |
2397 | 101k | static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) { |
2398 | | // Avoid overflow on 32-bit platforms, but it breaks BAM anyway |
2399 | 101k | if (*n > INT32_MAX*0.666) { |
2400 | 0 | errno = ENOMEM; |
2401 | 0 | return -1; |
2402 | 0 | } |
2403 | | |
2404 | 101k | size_t bytes = (size_t)size * (size_t)(*n>>1); |
2405 | 101k | if (possibly_expand_bam_data(b, bytes) < 0) { |
2406 | 0 | hts_log_error("Out of memory"); |
2407 | 0 | return -1; |
2408 | 0 | } |
2409 | | |
2410 | 101k | (*n)+=*n>>1; |
2411 | 101k | return 0; |
2412 | 101k | } |
2413 | | |
2414 | | |
2415 | | // This ensures that q always ends up at the next comma after |
2416 | | // reading a number even if it's followed by junk. It |
2417 | | // prevents the possibility of trying to read more than n items. |
2418 | 26.0M | #define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0) |
2419 | | |
2420 | | HTS_ALIGN32 |
2421 | | static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused, |
2422 | 4.29k | uint32_t *nalloc, int *overflow) { |
2423 | 326k | while (*q == ',') { |
2424 | 321k | if ((*nused)++ >= (*nalloc)) { |
2425 | 8.53k | if (grow_B_array(b, nalloc, 1) < 0) |
2426 | 0 | return NULL; |
2427 | 8.53k | } |
2428 | 321k | *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow); |
2429 | 321k | b->l_data++; |
2430 | 321k | } |
2431 | 4.29k | return q; |
2432 | 4.29k | } |
2433 | | |
2434 | | HTS_ALIGN32 |
2435 | | static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused, |
2436 | 5.74k | uint32_t *nalloc, int *overflow) { |
2437 | 2.53M | while (*q == ',') { |
2438 | 2.52M | if ((*nused)++ >= (*nalloc)) { |
2439 | 17.9k | if (grow_B_array(b, nalloc, 1) < 0) |
2440 | 0 | return NULL; |
2441 | 17.9k | } |
2442 | 2.52M | if (q[1] != '-') { |
2443 | 2.51M | *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow); |
2444 | 2.51M | b->l_data++; |
2445 | 2.51M | } else { |
2446 | 15.4k | *overflow = 1; |
2447 | 15.4k | q++; |
2448 | 15.4k | skip_to_comma_(q); |
2449 | 15.4k | } |
2450 | 2.52M | } |
2451 | 5.74k | return q; |
2452 | 5.74k | } |
2453 | | |
2454 | | HTS_ALIGN32 |
2455 | | static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused, |
2456 | 1.81k | uint32_t *nalloc, int *overflow) { |
2457 | 81.6k | while (*q == ',') { |
2458 | 79.8k | if ((*nused)++ >= (*nalloc)) { |
2459 | 924 | if (grow_B_array(b, nalloc, 2) < 0) |
2460 | 0 | return NULL; |
2461 | 924 | } |
2462 | 79.8k | i16_to_le(hts_str2int(q + 1, &q, 16, overflow), |
2463 | 79.8k | b->data + b->l_data); |
2464 | 79.8k | b->l_data += 2; |
2465 | 79.8k | } |
2466 | 1.81k | return q; |
2467 | 1.81k | } |
2468 | | |
2469 | | HTS_ALIGN32 |
2470 | | static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused, |
2471 | 13.5k | uint32_t *nalloc, int *overflow) { |
2472 | 21.4M | while (*q == ',') { |
2473 | 21.4M | if ((*nused)++ >= (*nalloc)) { |
2474 | 65.8k | if (grow_B_array(b, nalloc, 2) < 0) |
2475 | 0 | return NULL; |
2476 | 65.8k | } |
2477 | 21.4M | if (q[1] != '-') { |
2478 | 21.1M | u16_to_le(hts_str2uint(q + 1, &q, 16, overflow), |
2479 | 21.1M | b->data + b->l_data); |
2480 | 21.1M | b->l_data += 2; |
2481 | 21.1M | } else { |
2482 | 325k | *overflow = 1; |
2483 | 325k | q++; |
2484 | 325k | skip_to_comma_(q); |
2485 | 325k | } |
2486 | 21.4M | } |
2487 | 13.5k | return q; |
2488 | 13.5k | } |
2489 | | |
2490 | | HTS_ALIGN32 |
2491 | | static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused, |
2492 | 10.2k | uint32_t *nalloc, int *overflow) { |
2493 | 23.9M | while (*q == ',') { |
2494 | 23.9M | if ((*nused)++ >= (*nalloc)) { |
2495 | 2.96k | if (grow_B_array(b, nalloc, 4) < 0) |
2496 | 0 | return NULL; |
2497 | 2.96k | } |
2498 | 23.9M | i32_to_le(hts_str2int(q + 1, &q, 32, overflow), |
2499 | 23.9M | b->data + b->l_data); |
2500 | 23.9M | b->l_data += 4; |
2501 | 23.9M | } |
2502 | 10.2k | return q; |
2503 | 10.2k | } |
2504 | | |
2505 | | HTS_ALIGN32 |
2506 | | static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused, |
2507 | 22.8k | uint32_t *nalloc, int *overflow) { |
2508 | 300k | while (*q == ',') { |
2509 | 277k | if ((*nused)++ >= (*nalloc)) { |
2510 | 571 | if (grow_B_array(b, nalloc, 4) < 0) |
2511 | 0 | return NULL; |
2512 | 571 | } |
2513 | 277k | if (q[1] != '-') { |
2514 | 275k | u32_to_le(hts_str2uint(q + 1, &q, 32, overflow), |
2515 | 275k | b->data + b->l_data); |
2516 | 275k | b->l_data += 4; |
2517 | 275k | } else { |
2518 | 1.83k | *overflow = 1; |
2519 | 1.83k | q++; |
2520 | 1.83k | skip_to_comma_(q); |
2521 | 1.83k | } |
2522 | 277k | } |
2523 | 22.8k | return q; |
2524 | 22.8k | } |
2525 | | |
2526 | | HTS_ALIGN32 |
2527 | | static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused, |
2528 | 3.54k | uint32_t *nalloc, int *overflow) { |
2529 | 46.7k | while (*q == ',') { |
2530 | 43.2k | if ((*nused)++ >= (*nalloc)) { |
2531 | 4.48k | if (grow_B_array(b, nalloc, 4) < 0) |
2532 | 0 | return NULL; |
2533 | 4.48k | } |
2534 | 43.2k | float_to_le(strtod(q + 1, &q), b->data + b->l_data); |
2535 | 43.2k | b->l_data += 4; |
2536 | 43.2k | } |
2537 | 3.54k | return q; |
2538 | 3.54k | } |
2539 | | |
2540 | | HTS_ALIGN32 |
2541 | | static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in, |
2542 | | char **end, bam1_t *b, |
2543 | 62.1k | int *ctr) { |
2544 | | // Protect against infinite recursion when dealing with invalid input. |
2545 | | // An example string is "XX:B:C,-". The lack of a number means min=0, |
2546 | | // but it overflowed due to "-" and so we repeat ad-infinitum. |
2547 | | // |
2548 | | // Loop detection is the safest solution incase there are other |
2549 | | // strange corner cases with malformed inputs. |
2550 | 62.1k | if (++(*ctr) > 2) { |
2551 | 142 | hts_log_error("Malformed data in B:%c array", type); |
2552 | 142 | return -1; |
2553 | 142 | } |
2554 | | |
2555 | 62.0k | int orig_l = b->l_data; |
2556 | 62.0k | char *q = in; |
2557 | 62.0k | int32_t size; |
2558 | 62.0k | size_t bytes; |
2559 | 62.0k | int overflow = 0; |
2560 | | |
2561 | 62.0k | size = aux_type2size(type); |
2562 | 62.0k | if (size <= 0 || size > 4) { |
2563 | 33 | hts_log_error("Unrecognized type B:%c", type); |
2564 | 33 | return -1; |
2565 | 33 | } |
2566 | | |
2567 | | // Ensure space for type + values. |
2568 | | // The first pass through here we don't know the number of entries and |
2569 | | // nalloc == 0. We start with a small working set and then parse the |
2570 | | // data, growing as needed. |
2571 | | // |
2572 | | // If we have a second pass through we do know the number of entries |
2573 | | // and nalloc is already known. We have no need to expand the bam data. |
2574 | 62.0k | if (!nalloc) |
2575 | 45.5k | nalloc=7; |
2576 | | |
2577 | | // Ensure allocated memory is big enough (for current nalloc estimate) |
2578 | 62.0k | bytes = (size_t) nalloc * (size_t) size; |
2579 | 62.0k | if (bytes / size != nalloc |
2580 | 62.0k | || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) { |
2581 | 0 | hts_log_error("Out of memory"); |
2582 | 0 | return -1; |
2583 | 0 | } |
2584 | | |
2585 | 62.0k | uint32_t nused = 0; |
2586 | | |
2587 | 62.0k | b->data[b->l_data++] = 'B'; |
2588 | 62.0k | b->data[b->l_data++] = type; |
2589 | | // 32-bit B-array length is inserted later once we know it. |
2590 | 62.0k | int b_len_idx = b->l_data; |
2591 | 62.0k | b->l_data += sizeof(uint32_t); |
2592 | | |
2593 | 62.0k | if (type == 'c') { |
2594 | 4.29k | if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow))) |
2595 | 0 | return -1; |
2596 | 57.7k | } else if (type == 'C') { |
2597 | 5.74k | if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow))) |
2598 | 0 | return -1; |
2599 | 51.9k | } else if (type == 's') { |
2600 | 1.81k | if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow))) |
2601 | 0 | return -1; |
2602 | 50.1k | } else if (type == 'S') { |
2603 | 13.5k | if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow))) |
2604 | 0 | return -1; |
2605 | 36.6k | } else if (type == 'i') { |
2606 | 10.2k | if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow))) |
2607 | 0 | return -1; |
2608 | 26.4k | } else if (type == 'I') { |
2609 | 22.8k | if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow))) |
2610 | 0 | return -1; |
2611 | 22.8k | } else if (type == 'f') { |
2612 | 3.54k | if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow))) |
2613 | 0 | return -1; |
2614 | 3.54k | } |
2615 | 62.0k | if (*q != '\t' && *q != '\0') { |
2616 | | // Unknown B array type or junk in the numbers |
2617 | 393 | hts_log_error("Malformed B:%c", type); |
2618 | 393 | return -1; |
2619 | 393 | } |
2620 | 61.6k | i32_to_le(nused, b->data + b_len_idx); |
2621 | | |
2622 | 61.6k | if (!overflow) { |
2623 | 44.3k | *end = q; |
2624 | 44.3k | return 0; |
2625 | 44.3k | } else { |
2626 | 17.3k | int64_t max = 0, min = 0, val; |
2627 | | // Given type was incorrect. Try to rescue the situation. |
2628 | 17.3k | char *r = q; |
2629 | 17.3k | q = in; |
2630 | 17.3k | overflow = 0; |
2631 | 17.3k | b->l_data = orig_l; |
2632 | | // Find out what range of values is present |
2633 | 24.2M | while (q < r) { |
2634 | 24.2M | val = hts_str2int(q + 1, &q, 64, &overflow); |
2635 | 24.2M | if (max < val) max = val; |
2636 | 24.2M | if (min > val) min = val; |
2637 | 24.2M | skip_to_comma_(q); |
2638 | 24.2M | } |
2639 | | // Retry with appropriate type |
2640 | 17.3k | if (!overflow) { |
2641 | 17.1k | if (min < 0) { |
2642 | 12.1k | if (min >= INT8_MIN && max <= INT8_MAX) { |
2643 | 1.95k | return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr); |
2644 | 10.2k | } else if (min >= INT16_MIN && max <= INT16_MAX) { |
2645 | 1.41k | return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr); |
2646 | 8.82k | } else if (min >= INT32_MIN && max <= INT32_MAX) { |
2647 | 8.49k | return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr); |
2648 | 8.49k | } |
2649 | 12.1k | } else { |
2650 | 4.92k | if (max < UINT8_MAX) { |
2651 | 192 | return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr); |
2652 | 4.73k | } else if (max <= UINT16_MAX) { |
2653 | 342 | return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr); |
2654 | 4.39k | } else if (max <= UINT32_MAX) { |
2655 | 4.21k | return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr); |
2656 | 4.21k | } |
2657 | 4.92k | } |
2658 | 17.1k | } |
2659 | | // If here then at least one of the values is too big to store |
2660 | 690 | hts_log_error("Numeric value in B array out of allowed range"); |
2661 | 690 | return -1; |
2662 | 17.3k | } |
2663 | 61.6k | #undef skip_to_comma_ |
2664 | 61.6k | } |
2665 | | |
2666 | | HTS_ALIGN32 |
2667 | | static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b) |
2668 | 45.5k | { |
2669 | 45.5k | int ctr = 0; |
2670 | 45.5k | uint32_t nalloc = 0; |
2671 | 45.5k | return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr); |
2672 | 45.5k | } |
2673 | | |
2674 | 342k | static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) { |
2675 | 342k | if (*v >= '1' && *v <= '9') { |
2676 | 36.1k | return hts_str2uint(v, rv, 16, overflow); |
2677 | 36.1k | } |
2678 | 306k | else if (*v == '0') { |
2679 | | // handle single-digit "0" directly; otherwise it's hex or octal |
2680 | 24.8k | if (v[1] == '\t') { *rv = v+1; return 0; } |
2681 | 336 | else { |
2682 | 336 | unsigned long val = strtoul(v, rv, 0); |
2683 | 336 | if (val > 65535) { *overflow = 1; return 65535; } |
2684 | 186 | return val; |
2685 | 336 | } |
2686 | 24.8k | } |
2687 | 281k | else { |
2688 | | // TODO implement symbolic flag letters |
2689 | 281k | *rv = v; |
2690 | 281k | return 0; |
2691 | 281k | } |
2692 | 342k | } |
2693 | | |
2694 | | // Parse tag line and append to bam object b. |
2695 | | // Shared by both SAM and FASTQ parsers. |
2696 | | // |
2697 | | // The difference between the two is how lenient we are to recognising |
2698 | | // non-compliant strings. The FASTQ parser glosses over arbitrary |
2699 | | // non-SAM looking strings. |
2700 | | static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, |
2701 | 337k | khash_t(tag) *tag_whitelist) { |
2702 | 337k | int overflow = 0; |
2703 | 337k | int checkpoint; |
2704 | 337k | char logbuf[40]; |
2705 | 337k | char *q = start, *p = end; |
2706 | | |
2707 | 337k | #define _parse_err(cond, ...) \ |
2708 | 20.0M | do { \ |
2709 | 40.7M | if (cond) { \ |
2710 | 766 | if (lenient) { \ |
2711 | 0 | while (q < p && !isspace_c(*q)) \ |
2712 | 0 | q++; \ |
2713 | 0 | while (q < p && isspace_c(*q)) \ |
2714 | 0 | q++; \ |
2715 | 0 | b->l_data = checkpoint; \ |
2716 | 0 | goto loop; \ |
2717 | 766 | } else { \ |
2718 | 766 | hts_log_error(__VA_ARGS__); \ |
2719 | 766 | goto err_ret; \ |
2720 | 766 | } \ |
2721 | 766 | } \ |
2722 | 20.0M | } while (0) |
2723 | | |
2724 | 19.6M | while (q < p) loop: { |
2725 | 19.6M | char type; |
2726 | 19.6M | checkpoint = b->l_data; |
2727 | 19.6M | if (p - q < 5) { |
2728 | 143 | if (lenient) { |
2729 | 0 | break; |
2730 | 143 | } else { |
2731 | 143 | hts_log_error("Incomplete aux field"); |
2732 | 143 | goto err_ret; |
2733 | 143 | } |
2734 | 143 | } |
2735 | 9.84M | _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id"); |
2736 | | |
2737 | 9.84M | if (lenient && (q[2] | q[4]) != ':') { |
2738 | 0 | while (q < p && !isspace_c(*q)) |
2739 | 0 | q++; |
2740 | 0 | while (q < p && isspace_c(*q)) |
2741 | 0 | q++; |
2742 | 0 | continue; |
2743 | 0 | } |
2744 | | |
2745 | 9.84M | if (tag_whitelist) { |
2746 | 0 | int tt = q[0]*256 + q[1]; |
2747 | 0 | if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) { |
2748 | 0 | while (q < p && *q != '\t') |
2749 | 0 | q++; |
2750 | 0 | continue; |
2751 | 0 | } |
2752 | 0 | } |
2753 | | |
2754 | | // Copy over id |
2755 | 9.84M | if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; |
2756 | 9.84M | memcpy(b->data + b->l_data, q, 2); b->l_data += 2; |
2757 | 9.84M | q += 3; type = *q++; ++q; // q points to value |
2758 | 9.84M | if (type != 'Z' && type != 'H') // the only zero length acceptable fields |
2759 | 9.64M | _parse_err(*q <= '\t', "incomplete aux field"); |
2760 | | |
2761 | | // Ensure enough space for a double + type allocated. |
2762 | 9.84M | if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; |
2763 | | |
2764 | 9.84M | if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { |
2765 | 4.13M | b->data[b->l_data++] = 'A'; |
2766 | 4.13M | b->data[b->l_data++] = *q++; |
2767 | 5.70M | } else if (type == 'i' || type == 'I') { |
2768 | 5.23M | if (*q == '-') { |
2769 | 4.38M | int32_t x = hts_str2int(q, &q, 32, &overflow); |
2770 | 4.38M | if (x >= INT8_MIN) { |
2771 | 2.08M | b->data[b->l_data++] = 'c'; |
2772 | 2.08M | b->data[b->l_data++] = x; |
2773 | 2.30M | } else if (x >= INT16_MIN) { |
2774 | 723k | b->data[b->l_data++] = 's'; |
2775 | 723k | i16_to_le(x, b->data + b->l_data); |
2776 | 723k | b->l_data += 2; |
2777 | 1.57M | } else { |
2778 | 1.57M | b->data[b->l_data++] = 'i'; |
2779 | 1.57M | i32_to_le(x, b->data + b->l_data); |
2780 | 1.57M | b->l_data += 4; |
2781 | 1.57M | } |
2782 | 4.38M | } else { |
2783 | 854k | uint32_t x = hts_str2uint(q, &q, 32, &overflow); |
2784 | 854k | if (x <= UINT8_MAX) { |
2785 | 536k | b->data[b->l_data++] = 'C'; |
2786 | 536k | b->data[b->l_data++] = x; |
2787 | 536k | } else if (x <= UINT16_MAX) { |
2788 | 306k | b->data[b->l_data++] = 'S'; |
2789 | 306k | u16_to_le(x, b->data + b->l_data); |
2790 | 306k | b->l_data += 2; |
2791 | 306k | } else { |
2792 | 11.5k | b->data[b->l_data++] = 'I'; |
2793 | 11.5k | u32_to_le(x, b->data + b->l_data); |
2794 | 11.5k | b->l_data += 4; |
2795 | 11.5k | } |
2796 | 854k | } |
2797 | 5.23M | } else if (type == 'f') { |
2798 | 10.3k | b->data[b->l_data++] = 'f'; |
2799 | 10.3k | float_to_le(strtod(q, &q), b->data + b->l_data); |
2800 | 10.3k | b->l_data += sizeof(float); |
2801 | 452k | } else if (type == 'd') { |
2802 | 214k | b->data[b->l_data++] = 'd'; |
2803 | 214k | double_to_le(strtod(q, &q), b->data + b->l_data); |
2804 | 214k | b->l_data += sizeof(double); |
2805 | 237k | } else if (type == 'Z' || type == 'H') { |
2806 | 191k | char *end = strchr(q, '\t'); |
2807 | 191k | if (!end) end = q + strlen(q); |
2808 | 191k | _parse_err(type == 'H' && ((end-q)&1) != 0, |
2809 | 191k | "hex field does not have an even number of digits"); |
2810 | 191k | b->data[b->l_data++] = type; |
2811 | 191k | if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; |
2812 | 191k | memcpy(b->data + b->l_data, q, end - q); |
2813 | 191k | b->l_data += end - q; |
2814 | 191k | b->data[b->l_data++] = '\0'; |
2815 | 191k | q = end; |
2816 | 191k | } else if (type == 'B') { |
2817 | 45.6k | type = *q++; // q points to the first ',' following the typing byte |
2818 | 45.6k | _parse_err(*q && *q != ',' && *q != '\t', |
2819 | 45.6k | "B aux field type not followed by ','"); |
2820 | | |
2821 | 45.5k | if (sam_parse_B_vals(type, q, &q, b) < 0) |
2822 | 1.25k | goto err_ret; |
2823 | 45.5k | } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); |
2824 | | |
2825 | 60.5M | while (*q > '\t') { q++; } // Skip any junk to next tab |
2826 | 9.83M | q++; |
2827 | 9.83M | } |
2828 | | |
2829 | 335k | _parse_err(!lenient && overflow != 0, "numeric value out of allowed range"); |
2830 | 335k | #undef _parse_err |
2831 | | |
2832 | 335k | return 0; |
2833 | | |
2834 | 2.16k | err_ret: |
2835 | 2.16k | return -2; |
2836 | 335k | } |
2837 | | |
2838 | | int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) |
2839 | 343k | { |
2840 | 1.40M | #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) |
2841 | | |
2842 | 343k | #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff |
2843 | | |
2844 | | // Macro that operates on 64-bits at a time. |
2845 | 343k | #define COPY_MINUS_N(to,from,n,l,failed) \ |
2846 | 343k | do { \ |
2847 | 331k | uint64_u *from8 = (uint64_u *)(from); \ |
2848 | 331k | uint64_u *to8 = (uint64_u *)(to); \ |
2849 | 331k | uint64_t uflow = 0; \ |
2850 | 331k | size_t l8 = (l)>>3, i; \ |
2851 | 332k | for (i = 0; i < l8; i++) { \ |
2852 | 366 | to8[i] = from8[i] - (n)*0x0101010101010101UL; \ |
2853 | 366 | uflow |= to8[i]; \ |
2854 | 366 | } \ |
2855 | 333k | for (i<<=3; i < (l); ++i) { \ |
2856 | 1.40k | to[i] = from[i] - (n); \ |
2857 | 1.40k | uflow |= to[i]; \ |
2858 | 1.40k | } \ |
2859 | 331k | failed = (uflow & 0x8080808080808080UL) > 0; \ |
2860 | 331k | } while (0) |
2861 | | |
2862 | | #else |
2863 | | |
2864 | | // Basic version which operates a byte at a time |
2865 | | #define COPY_MINUS_N(to,from,n,l,failed) do { \ |
2866 | | uint8_t uflow = 0; \ |
2867 | | for (i = 0; i < (l); ++i) { \ |
2868 | | (to)[i] = (from)[i] - (n); \ |
2869 | | uflow |= (uint8_t) (to)[i]; \ |
2870 | | } \ |
2871 | | failed = (uflow & 0x80) > 0; \ |
2872 | | } while (0) |
2873 | | |
2874 | | #endif |
2875 | | |
2876 | 635k | #define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l) |
2877 | 4.50M | #define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0) |
2878 | 1.28M | #define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0) |
2879 | | |
2880 | 343k | uint8_t *t; |
2881 | | |
2882 | 343k | char *p = s->s, *q; |
2883 | 343k | int i, overflow = 0; |
2884 | 343k | char logbuf[40]; |
2885 | 343k | hts_pos_t cigreflen; |
2886 | 343k | bam1_core_t *c = &b->core; |
2887 | | |
2888 | 343k | b->l_data = 0; |
2889 | 343k | memset(c, 0, 32); |
2890 | | |
2891 | | // qname |
2892 | 343k | q = _read_token(p); |
2893 | | |
2894 | 342k | _parse_warn(p - q <= 1, "empty query name"); |
2895 | 342k | _parse_err(p - q > 255, "query name too long"); |
2896 | | // resize large enough for name + extranul |
2897 | 342k | if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret; |
2898 | 342k | memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q; |
2899 | | |
2900 | 342k | c->l_extranul = (4 - (b->l_data & 3)) & 3; |
2901 | 342k | memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul); |
2902 | 342k | b->l_data += c->l_extranul; |
2903 | | |
2904 | 342k | c->l_qname = p - q + c->l_extranul; |
2905 | | |
2906 | | // flag |
2907 | 342k | c->flag = parse_sam_flag(p, &p, &overflow); |
2908 | 342k | if (*p++ != '\t') goto err_ret; // malformated flag |
2909 | | |
2910 | | // chr |
2911 | 342k | q = _read_token(p); |
2912 | 341k | if (strcmp(q, "*")) { |
2913 | 312k | _parse_err(h->n_targets == 0, "no SQ lines present in the header"); |
2914 | 312k | c->tid = bam_name2id(h, q); |
2915 | 312k | _parse_err(c->tid < -1, "failed to parse header"); |
2916 | 312k | _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2917 | 312k | } else c->tid = -1; |
2918 | | |
2919 | | // pos |
2920 | 341k | c->pos = hts_str2uint(p, &p, 63, &overflow) - 1; |
2921 | 341k | if (*p++ != '\t') goto err_ret; |
2922 | 340k | if (c->pos < 0 && c->tid >= 0) { |
2923 | 7.35k | _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped"); |
2924 | 7.35k | c->tid = -1; |
2925 | 7.35k | } |
2926 | 340k | if (c->tid < 0) c->flag |= BAM_FUNMAP; |
2927 | | |
2928 | | // mapq |
2929 | 340k | c->qual = hts_str2uint(p, &p, 8, &overflow); |
2930 | 340k | if (*p++ != '\t') goto err_ret; |
2931 | | // cigar |
2932 | 340k | if (*p != '*') { |
2933 | 302k | uint32_t *cigar = NULL; |
2934 | 302k | int old_l_data = b->l_data; |
2935 | 302k | int n_cigar = bam_parse_cigar(p, &p, b); |
2936 | 302k | if (n_cigar < 1 || *p++ != '\t') goto err_ret; |
2937 | 301k | cigar = (uint32_t *)(b->data + old_l_data); |
2938 | | |
2939 | | // can't use bam_endpos() directly as some fields not yet set up |
2940 | 301k | cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; |
2941 | 301k | if (cigreflen == 0) cigreflen = 1; |
2942 | 301k | } else { |
2943 | 37.5k | _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped"); |
2944 | 37.5k | c->flag |= BAM_FUNMAP; |
2945 | 37.5k | q = _read_token(p); |
2946 | 37.5k | cigreflen = 1; |
2947 | 37.5k | } |
2948 | 339k | _parse_err(HTS_POS_MAX - cigreflen <= c->pos, |
2949 | 339k | "read ends beyond highest supported position"); |
2950 | 339k | c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5); |
2951 | | // mate chr |
2952 | 339k | q = _read_token(p); |
2953 | 339k | if (strcmp(q, "=") == 0) { |
2954 | 799 | c->mtid = c->tid; |
2955 | 338k | } else if (strcmp(q, "*") == 0) { |
2956 | 297 | c->mtid = -1; |
2957 | 337k | } else { |
2958 | 337k | c->mtid = bam_name2id(h, q); |
2959 | 337k | _parse_err(c->mtid < -1, "failed to parse header"); |
2960 | 337k | _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX)); |
2961 | 337k | } |
2962 | | // mpos |
2963 | 339k | c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1; |
2964 | 339k | if (*p++ != '\t') goto err_ret; |
2965 | 338k | if (c->mpos < 0 && c->mtid >= 0) { |
2966 | 248k | _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped"); |
2967 | 248k | c->mtid = -1; |
2968 | 248k | } |
2969 | | // tlen |
2970 | 338k | c->isize = hts_str2int(p, &p, 64, &overflow); |
2971 | 338k | if (*p++ != '\t') goto err_ret; |
2972 | | // seq |
2973 | 338k | q = _read_token(p); |
2974 | 337k | if (strcmp(q, "*")) { |
2975 | 298k | _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long"); |
2976 | 298k | c->l_qseq = p - q - 1; |
2977 | 298k | hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname)); |
2978 | 298k | _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length"); |
2979 | 298k | i = (c->l_qseq + 1) >> 1; |
2980 | 298k | _get_mem(uint8_t, &t, b, i); |
2981 | | |
2982 | 298k | unsigned int lqs2 = c->l_qseq&~1, i; |
2983 | 301k | for (i = 0; i < lqs2; i+=2) |
2984 | 3.40k | t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]]; |
2985 | 304k | for (; i < c->l_qseq; ++i) |
2986 | 6.84k | t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2); |
2987 | 298k | } else c->l_qseq = 0; |
2988 | | // qual |
2989 | 675k | _get_mem(uint8_t, &t, b, c->l_qseq); |
2990 | 675k | if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) { |
2991 | 5.91k | memset(t, 0xff, c->l_qseq); |
2992 | 5.91k | p += 2; |
2993 | 331k | } else { |
2994 | 331k | int failed = 0; |
2995 | 331k | _parse_err(s->l - (p - s->s) < c->l_qseq |
2996 | 331k | || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'), |
2997 | 331k | "SEQ and QUAL are of different length"); |
2998 | 331k | COPY_MINUS_N(t, p, 33, c->l_qseq, failed); |
2999 | 331k | _parse_err(failed, "invalid QUAL character"); |
3000 | 331k | p += c->l_qseq + 1; |
3001 | 331k | } |
3002 | | |
3003 | | // aux |
3004 | 337k | if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0) |
3005 | 2.16k | goto err_ret; |
3006 | | |
3007 | 335k | if (bam_tag2cigar(b, 1, 1) < 0) |
3008 | 0 | return -2; |
3009 | 335k | return 0; |
3010 | | |
3011 | 0 | #undef _parse_warn |
3012 | 0 | #undef _parse_err |
3013 | 0 | #undef _get_mem |
3014 | 0 | #undef _read_token |
3015 | 8.31k | err_ret: |
3016 | 8.31k | return -2; |
3017 | 335k | } |
3018 | | |
3019 | 302k | static uint32_t read_ncigar(const char *q) { |
3020 | 302k | uint32_t n_cigar = 0; |
3021 | 1.86M | for (; *q && *q != '\t'; ++q) |
3022 | 1.56M | if (!isdigit_c(*q)) ++n_cigar; |
3023 | 302k | if (!n_cigar) { |
3024 | 279 | hts_log_error("No CIGAR operations"); |
3025 | 279 | return 0; |
3026 | 279 | } |
3027 | 302k | if (n_cigar >= 2147483647) { |
3028 | 0 | hts_log_error("Too many CIGAR operations"); |
3029 | 0 | return 0; |
3030 | 0 | } |
3031 | | |
3032 | 302k | return n_cigar; |
3033 | 302k | } |
3034 | | |
3035 | | /*! @function |
3036 | | @abstract Parse a CIGAR string into preallocated a uint32_t array |
3037 | | @param in [in] pointer to the source string |
3038 | | @param a_cigar [out] address of the destination uint32_t buffer |
3039 | | @return number of processed input characters; 0 on error |
3040 | | */ |
3041 | 302k | static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { |
3042 | 302k | int i, overflow = 0; |
3043 | 302k | const char *p = in; |
3044 | 741k | for (i = 0; i < n_cigar; i++) { |
3045 | 439k | uint32_t len; |
3046 | 439k | int op; |
3047 | 439k | char *q; |
3048 | 439k | len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT; |
3049 | 439k | if (q == p) { |
3050 | 364 | hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p); |
3051 | 364 | return 0; |
3052 | 364 | } |
3053 | 439k | if (overflow) { |
3054 | 81 | hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p); |
3055 | 81 | return 0; |
3056 | 81 | } |
3057 | 439k | p = q; |
3058 | 439k | op = bam_cigar_table[(unsigned char)*p++]; |
3059 | 439k | if (op < 0) { |
3060 | 240 | hts_log_error("Unrecognized CIGAR operator"); |
3061 | 240 | return 0; |
3062 | 240 | } |
3063 | 438k | a_cigar[i] = len; |
3064 | 438k | a_cigar[i] |= op; |
3065 | 438k | } |
3066 | | |
3067 | 301k | return p-in; |
3068 | 302k | } |
3069 | | |
3070 | 0 | ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) { |
3071 | 0 | size_t n_cigar = 0; |
3072 | 0 | int diff; |
3073 | |
|
3074 | 0 | if (!in || !a_cigar || !a_mem) { |
3075 | 0 | hts_log_error("NULL pointer arguments"); |
3076 | 0 | return -1; |
3077 | 0 | } |
3078 | 0 | if (end) *end = (char *)in; |
3079 | |
|
3080 | 0 | if (*in == '*') { |
3081 | 0 | if (end) (*end)++; |
3082 | 0 | return 0; |
3083 | 0 | } |
3084 | 0 | n_cigar = read_ncigar(in); |
3085 | 0 | if (!n_cigar) return 0; |
3086 | 0 | if (n_cigar > *a_mem) { |
3087 | 0 | uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar)); |
3088 | 0 | if (a_tmp) { |
3089 | 0 | *a_cigar = a_tmp; |
3090 | 0 | *a_mem = n_cigar; |
3091 | 0 | } else { |
3092 | 0 | hts_log_error("Memory allocation error"); |
3093 | 0 | return -1; |
3094 | 0 | } |
3095 | 0 | } |
3096 | | |
3097 | 0 | if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1; |
3098 | 0 | if (end) *end = (char *)in+diff; |
3099 | |
|
3100 | 0 | return n_cigar; |
3101 | 0 | } |
3102 | | |
3103 | 302k | ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { |
3104 | 302k | size_t n_cigar = 0; |
3105 | 302k | int diff; |
3106 | | |
3107 | 302k | if (!in || !b) { |
3108 | 0 | hts_log_error("NULL pointer arguments"); |
3109 | 0 | return -1; |
3110 | 0 | } |
3111 | 302k | if (end) *end = (char *)in; |
3112 | | |
3113 | 302k | n_cigar = (*in == '*') ? 0 : read_ncigar(in); |
3114 | 302k | if (!n_cigar && b->core.n_cigar == 0) { |
3115 | 279 | if (end) *end = (char *)in+1; |
3116 | 279 | return 0; |
3117 | 279 | } |
3118 | | |
3119 | 302k | ssize_t cig_diff = n_cigar - b->core.n_cigar; |
3120 | 302k | if (cig_diff > 0 && |
3121 | 302k | possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) { |
3122 | 0 | hts_log_error("Memory allocation error"); |
3123 | 0 | return -1; |
3124 | 0 | } |
3125 | | |
3126 | 302k | uint32_t *cig = bam_get_cigar(b); |
3127 | 302k | if ((uint8_t *)cig != b->data + b->l_data) { |
3128 | | // Modifying an BAM existing BAM record |
3129 | 0 | uint8_t *seq = bam_get_seq(b); |
3130 | 0 | memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq); |
3131 | 0 | } |
3132 | | |
3133 | 302k | if (n_cigar) { |
3134 | 302k | if (!(diff = parse_cigar(in, cig, n_cigar))) |
3135 | 685 | return -1; |
3136 | 302k | } else { |
3137 | 0 | diff = 1; // handle "*" |
3138 | 0 | } |
3139 | | |
3140 | 301k | b->l_data += cig_diff * sizeof(uint32_t); |
3141 | 301k | b->core.n_cigar = n_cigar; |
3142 | 301k | if (end) *end = (char *)in + diff; |
3143 | | |
3144 | 301k | return n_cigar; |
3145 | 302k | } |
3146 | | |
3147 | | /* |
3148 | | * ----------------------------------------------------------------------------- |
3149 | | * SAM threading |
3150 | | */ |
3151 | | // Size of SAM text block (reading) |
3152 | 0 | #define SAM_NBYTES 240000 |
3153 | | |
3154 | | // Number of BAM records (writing, up to NB_mem in size) |
3155 | 0 | #define SAM_NBAM 1000 |
3156 | | |
3157 | | struct SAM_state; |
3158 | | |
3159 | | // Output job - a block of BAM records |
3160 | | typedef struct sp_bams { |
3161 | | struct sp_bams *next; |
3162 | | int serial; |
3163 | | |
3164 | | bam1_t *bams; |
3165 | | int nbams, abams; // used and alloc for bams[] array |
3166 | | size_t bam_mem; // very approximate total size |
3167 | | |
3168 | | struct SAM_state *fd; |
3169 | | } sp_bams; |
3170 | | |
3171 | | // Input job - a block of SAM text |
3172 | | typedef struct sp_lines { |
3173 | | struct sp_lines *next; |
3174 | | int serial; |
3175 | | |
3176 | | char *data; |
3177 | | int data_size; |
3178 | | int alloc; |
3179 | | |
3180 | | struct SAM_state *fd; |
3181 | | sp_bams *bams; |
3182 | | } sp_lines; |
3183 | | |
3184 | | enum sam_cmd { |
3185 | | SAM_NONE = 0, |
3186 | | SAM_CLOSE, |
3187 | | SAM_CLOSE_DONE, |
3188 | | }; |
3189 | | |
3190 | | typedef struct SAM_state { |
3191 | | sam_hdr_t *h; |
3192 | | |
3193 | | hts_tpool *p; |
3194 | | int own_pool; |
3195 | | pthread_mutex_t lines_m; |
3196 | | hts_tpool_process *q; |
3197 | | pthread_t dispatcher; |
3198 | | int dispatcher_set; |
3199 | | |
3200 | | sp_lines *lines; |
3201 | | sp_bams *bams; |
3202 | | |
3203 | | sp_bams *curr_bam; |
3204 | | int curr_idx; |
3205 | | int serial; |
3206 | | |
3207 | | // Be warned: moving these mutexes around in this struct can reduce |
3208 | | // threading performance by up to 70%! |
3209 | | pthread_mutex_t command_m; |
3210 | | pthread_cond_t command_c; |
3211 | | enum sam_cmd command; |
3212 | | |
3213 | | // One of the E* errno codes |
3214 | | int errcode; |
3215 | | |
3216 | | htsFile *fp; |
3217 | | } SAM_state; |
3218 | | |
3219 | | // Returns a SAM_state struct from a generic hFILE. |
3220 | | // |
3221 | | // Returns NULL on failure. |
3222 | 0 | static SAM_state *sam_state_create(htsFile *fp) { |
3223 | | // Ideally sam_open wouldn't be a #define to hts_open but instead would |
3224 | | // be a redirect call with an additional 'S' mode. This in turn would |
3225 | | // correctly set the designed format to sam instead of a generic |
3226 | | // text_format. |
3227 | 0 | if (fp->format.format != sam && fp->format.format != text_format) |
3228 | 0 | return NULL; |
3229 | | |
3230 | 0 | SAM_state *fd = calloc(1, sizeof(*fd)); |
3231 | 0 | if (!fd) |
3232 | 0 | return NULL; |
3233 | | |
3234 | 0 | fp->state = fd; |
3235 | 0 | fd->fp = fp; |
3236 | |
|
3237 | 0 | return fd; |
3238 | 0 | } |
3239 | | |
3240 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str); |
3241 | | static void *sam_format_worker(void *arg); |
3242 | | |
3243 | 0 | static void sam_state_err(SAM_state *fd, int errcode) { |
3244 | 0 | pthread_mutex_lock(&fd->command_m); |
3245 | 0 | if (!fd->errcode) |
3246 | 0 | fd->errcode = errcode; |
3247 | 0 | pthread_mutex_unlock(&fd->command_m); |
3248 | 0 | } |
3249 | | |
3250 | 0 | static void sam_free_sp_bams(sp_bams *b) { |
3251 | 0 | if (!b) |
3252 | 0 | return; |
3253 | | |
3254 | 0 | if (b->bams) { |
3255 | 0 | int i; |
3256 | 0 | for (i = 0; i < b->abams; i++) { |
3257 | 0 | if (b->bams[i].data) |
3258 | 0 | free(b->bams[i].data); |
3259 | 0 | } |
3260 | 0 | free(b->bams); |
3261 | 0 | } |
3262 | 0 | free(b); |
3263 | 0 | } |
3264 | | |
3265 | | // Destroys the state produce by sam_state_create. |
3266 | 39.7k | int sam_state_destroy(htsFile *fp) { |
3267 | 39.7k | int ret = 0; |
3268 | | |
3269 | 39.7k | if (!fp->state) |
3270 | 39.7k | return 0; |
3271 | | |
3272 | 0 | SAM_state *fd = fp->state; |
3273 | 0 | if (fd->p) { |
3274 | 0 | if (fd->h) { |
3275 | | // Notify sam_dispatcher we're closing |
3276 | 0 | pthread_mutex_lock(&fd->command_m); |
3277 | 0 | if (fd->command != SAM_CLOSE_DONE) |
3278 | 0 | fd->command = SAM_CLOSE; |
3279 | 0 | pthread_cond_signal(&fd->command_c); |
3280 | 0 | ret = -fd->errcode; |
3281 | 0 | if (fd->q) |
3282 | 0 | hts_tpool_wake_dispatch(fd->q); // unstick the reader |
3283 | |
|
3284 | 0 | if (!fp->is_write && fd->q && fd->dispatcher_set) { |
3285 | 0 | for (;;) { |
3286 | | // Avoid deadlocks with dispatcher |
3287 | 0 | if (fd->command == SAM_CLOSE_DONE) |
3288 | 0 | break; |
3289 | 0 | hts_tpool_wake_dispatch(fd->q); |
3290 | 0 | pthread_mutex_unlock(&fd->command_m); |
3291 | 0 | usleep(10000); |
3292 | 0 | pthread_mutex_lock(&fd->command_m); |
3293 | 0 | } |
3294 | 0 | } |
3295 | 0 | pthread_mutex_unlock(&fd->command_m); |
3296 | |
|
3297 | 0 | if (fp->is_write) { |
3298 | | // Dispatch the last partial block. |
3299 | 0 | sp_bams *gb = fd->curr_bam; |
3300 | 0 | if (!ret && gb && gb->nbams > 0 && fd->q) |
3301 | 0 | ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb); |
3302 | | |
3303 | | // Flush and drain output |
3304 | 0 | if (fd->q) |
3305 | 0 | hts_tpool_process_flush(fd->q); |
3306 | 0 | pthread_mutex_lock(&fd->command_m); |
3307 | 0 | if (!ret) ret = -fd->errcode; |
3308 | 0 | pthread_mutex_unlock(&fd->command_m); |
3309 | |
|
3310 | 0 | while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) { |
3311 | 0 | usleep(10000); |
3312 | 0 | pthread_mutex_lock(&fd->command_m); |
3313 | 0 | ret = -fd->errcode; |
3314 | | // not empty but shutdown implies error |
3315 | 0 | if (hts_tpool_process_is_shutdown(fd->q) && !ret) |
3316 | 0 | ret = EIO; |
3317 | 0 | pthread_mutex_unlock(&fd->command_m); |
3318 | 0 | } |
3319 | 0 | if (fd->q) |
3320 | 0 | hts_tpool_process_shutdown(fd->q); |
3321 | 0 | } |
3322 | | |
3323 | | // Wait for it to acknowledge |
3324 | 0 | if (fd->dispatcher_set) |
3325 | 0 | pthread_join(fd->dispatcher, NULL); |
3326 | 0 | if (!ret) ret = -fd->errcode; |
3327 | 0 | } |
3328 | | |
3329 | | // Tidy up memory |
3330 | 0 | if (fd->q) |
3331 | 0 | hts_tpool_process_destroy(fd->q); |
3332 | |
|
3333 | 0 | if (fd->own_pool && fp->format.compression == no_compression) { |
3334 | 0 | hts_tpool_destroy(fd->p); |
3335 | 0 | fd->p = NULL; |
3336 | 0 | } |
3337 | 0 | pthread_mutex_destroy(&fd->lines_m); |
3338 | 0 | pthread_mutex_destroy(&fd->command_m); |
3339 | 0 | pthread_cond_destroy(&fd->command_c); |
3340 | |
|
3341 | 0 | sp_lines *l = fd->lines; |
3342 | 0 | while (l) { |
3343 | 0 | sp_lines *n = l->next; |
3344 | 0 | free(l->data); |
3345 | 0 | free(l); |
3346 | 0 | l = n; |
3347 | 0 | } |
3348 | |
|
3349 | 0 | sp_bams *b = fd->bams; |
3350 | 0 | while (b) { |
3351 | 0 | if (fd->curr_bam == b) |
3352 | 0 | fd->curr_bam = NULL; |
3353 | 0 | sp_bams *n = b->next; |
3354 | 0 | sam_free_sp_bams(b); |
3355 | 0 | b = n; |
3356 | 0 | } |
3357 | |
|
3358 | 0 | if (fd->curr_bam) |
3359 | 0 | sam_free_sp_bams(fd->curr_bam); |
3360 | | |
3361 | | // Decrement counter by one, maybe destroying too. |
3362 | | // This is to permit the caller using bam_hdr_destroy |
3363 | | // before sam_close without triggering decode errors |
3364 | | // in the background threads. |
3365 | 0 | bam_hdr_destroy(fd->h); |
3366 | 0 | } |
3367 | |
|
3368 | 0 | free(fp->state); |
3369 | 0 | fp->state = NULL; |
3370 | 0 | return ret; |
3371 | 39.7k | } |
3372 | | |
3373 | | // Cleanup function - job for sam_parse_worker; result for sam_format_worker |
3374 | 0 | static void cleanup_sp_lines(void *arg) { |
3375 | 0 | sp_lines *gl = (sp_lines *)arg; |
3376 | 0 | if (!gl) return; |
3377 | | |
3378 | | // Should always be true for lines passed to / from thread workers. |
3379 | 0 | assert(gl->next == NULL); |
3380 | | |
3381 | 0 | free(gl->data); |
3382 | 0 | sam_free_sp_bams(gl->bams); |
3383 | 0 | free(gl); |
3384 | 0 | } |
3385 | | |
3386 | | // Run from one of the worker threads. |
3387 | | // Convert a passed in array of lines to array of BAMs, returning |
3388 | | // the result back to the thread queue. |
3389 | 0 | static void *sam_parse_worker(void *arg) { |
3390 | 0 | sp_lines *gl = (sp_lines *)arg; |
3391 | 0 | sp_bams *gb = NULL; |
3392 | 0 | char *lines = gl->data; |
3393 | 0 | int i; |
3394 | 0 | bam1_t *b; |
3395 | 0 | SAM_state *fd = gl->fd; |
3396 | | |
3397 | | // Use a block of BAM structs we had earlier if available. |
3398 | 0 | pthread_mutex_lock(&fd->lines_m); |
3399 | 0 | if (fd->bams) { |
3400 | 0 | gb = fd->bams; |
3401 | 0 | fd->bams = gb->next; |
3402 | 0 | } |
3403 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3404 | |
|
3405 | 0 | if (gb == NULL) { |
3406 | 0 | gb = calloc(1, sizeof(*gb)); |
3407 | 0 | if (!gb) { |
3408 | 0 | return NULL; |
3409 | 0 | } |
3410 | 0 | gb->abams = 100; |
3411 | 0 | gb->bams = b = calloc(gb->abams, sizeof(*b)); |
3412 | 0 | if (!gb->bams) { |
3413 | 0 | sam_state_err(fd, ENOMEM); |
3414 | 0 | goto err; |
3415 | 0 | } |
3416 | 0 | gb->nbams = 0; |
3417 | 0 | gb->bam_mem = 0; |
3418 | 0 | } |
3419 | 0 | gb->serial = gl->serial; |
3420 | 0 | gb->next = NULL; |
3421 | |
|
3422 | 0 | b = (bam1_t *)gb->bams; |
3423 | 0 | if (!b) { |
3424 | 0 | sam_state_err(fd, ENOMEM); |
3425 | 0 | goto err; |
3426 | 0 | } |
3427 | | |
3428 | 0 | i = 0; |
3429 | 0 | char *cp = lines, *cp_end = lines + gl->data_size; |
3430 | 0 | while (cp < cp_end) { |
3431 | 0 | if (i >= gb->abams) { |
3432 | 0 | int old_abams = gb->abams; |
3433 | 0 | gb->abams *= 2; |
3434 | 0 | b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t)); |
3435 | 0 | if (!b) { |
3436 | 0 | gb->abams /= 2; |
3437 | 0 | sam_state_err(fd, ENOMEM); |
3438 | 0 | goto err; |
3439 | 0 | } |
3440 | 0 | memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b)); |
3441 | 0 | gb->bams = b; |
3442 | 0 | } |
3443 | | |
3444 | | // Ideally we'd get sam_parse1 to return the number of |
3445 | | // bytes decoded and to be able to stop on newline as |
3446 | | // well as \0. |
3447 | | // |
3448 | | // We can then avoid the additional strchr loop. |
3449 | | // It's around 6% of our CPU cost, albeit threadable. |
3450 | | // |
3451 | | // However this is an API change so for now we copy. |
3452 | | |
3453 | 0 | char *nl = strchr(cp, '\n'); |
3454 | 0 | char *line_end; |
3455 | 0 | if (nl) { |
3456 | 0 | line_end = nl; |
3457 | 0 | if (line_end > cp && *(line_end - 1) == '\r') |
3458 | 0 | line_end--; |
3459 | 0 | nl++; |
3460 | 0 | } else { |
3461 | 0 | nl = line_end = cp_end; |
3462 | 0 | } |
3463 | 0 | *line_end = '\0'; |
3464 | 0 | kstring_t ks = { line_end - cp, gl->alloc, cp }; |
3465 | 0 | if (sam_parse1(&ks, fd->h, &b[i]) < 0) { |
3466 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3467 | 0 | cleanup_sp_lines(gl); |
3468 | 0 | goto err; |
3469 | 0 | } |
3470 | | |
3471 | 0 | cp = nl; |
3472 | 0 | i++; |
3473 | 0 | } |
3474 | 0 | gb->nbams = i; |
3475 | |
|
3476 | 0 | pthread_mutex_lock(&fd->lines_m); |
3477 | 0 | gl->next = fd->lines; |
3478 | 0 | fd->lines = gl; |
3479 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3480 | 0 | return gb; |
3481 | | |
3482 | 0 | err: |
3483 | 0 | sam_free_sp_bams(gb); |
3484 | 0 | return NULL; |
3485 | 0 | } |
3486 | | |
3487 | 0 | static void *sam_parse_eof(void *arg) { |
3488 | 0 | return NULL; |
3489 | 0 | } |
3490 | | |
3491 | | // Cleanup function - result for sam_parse_worker; job for sam_format_worker |
3492 | 0 | static void cleanup_sp_bams(void *arg) { |
3493 | 0 | sam_free_sp_bams((sp_bams *) arg); |
3494 | 0 | } |
3495 | | |
3496 | | // Runs in its own thread. |
3497 | | // Reads a block of text (SAM) and sends a new job to the thread queue to |
3498 | | // translate this to BAM. |
3499 | 0 | static void *sam_dispatcher_read(void *vp) { |
3500 | 0 | htsFile *fp = vp; |
3501 | 0 | kstring_t line = {0}; |
3502 | 0 | int line_frag = 0; |
3503 | 0 | SAM_state *fd = fp->state; |
3504 | 0 | sp_lines *l = NULL; |
3505 | | |
3506 | | // Pre-allocate buffer for left-over bits of line (exact size doesn't |
3507 | | // matter as it will grow if necessary). |
3508 | 0 | if (ks_resize(&line, 1000) < 0) |
3509 | 0 | goto err; |
3510 | | |
3511 | 0 | for (;;) { |
3512 | | // Check for command |
3513 | 0 | pthread_mutex_lock(&fd->command_m); |
3514 | 0 | switch (fd->command) { |
3515 | | |
3516 | 0 | case SAM_CLOSE: |
3517 | 0 | pthread_cond_signal(&fd->command_c); |
3518 | 0 | pthread_mutex_unlock(&fd->command_m); |
3519 | 0 | hts_tpool_process_shutdown(fd->q); |
3520 | 0 | goto tidyup; |
3521 | | |
3522 | 0 | default: |
3523 | 0 | break; |
3524 | 0 | } |
3525 | 0 | pthread_mutex_unlock(&fd->command_m); |
3526 | |
|
3527 | 0 | pthread_mutex_lock(&fd->lines_m); |
3528 | 0 | if (fd->lines) { |
3529 | | // reuse existing line buffer |
3530 | 0 | l = fd->lines; |
3531 | 0 | fd->lines = l->next; |
3532 | 0 | } |
3533 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3534 | |
|
3535 | 0 | if (l == NULL) { |
3536 | | // none to reuse, to create a new one |
3537 | 0 | l = calloc(1, sizeof(*l)); |
3538 | 0 | if (!l) |
3539 | 0 | goto err; |
3540 | 0 | l->alloc = SAM_NBYTES; |
3541 | 0 | l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 |
3542 | 0 | if (!l->data) { |
3543 | 0 | free(l); |
3544 | 0 | l = NULL; |
3545 | 0 | goto err; |
3546 | 0 | } |
3547 | 0 | l->fd = fd; |
3548 | 0 | } |
3549 | 0 | l->next = NULL; |
3550 | |
|
3551 | 0 | if (l->alloc < line_frag+SAM_NBYTES/2) { |
3552 | 0 | char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); |
3553 | 0 | if (!rp) |
3554 | 0 | goto err; |
3555 | 0 | l->alloc = line_frag+SAM_NBYTES/2; |
3556 | 0 | l->data = rp; |
3557 | 0 | } |
3558 | 0 | memcpy(l->data, line.s, line_frag); |
3559 | |
|
3560 | 0 | l->data_size = line_frag; |
3561 | 0 | ssize_t nbytes; |
3562 | 0 | longer_line: |
3563 | 0 | if (fp->is_bgzf) |
3564 | 0 | nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag); |
3565 | 0 | else |
3566 | 0 | nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag); |
3567 | 0 | if (nbytes < 0) { |
3568 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3569 | 0 | goto err; |
3570 | 0 | } else if (nbytes == 0) |
3571 | 0 | break; // EOF |
3572 | 0 | l->data_size += nbytes; |
3573 | | |
3574 | | // trim to last \n. Maybe \r\n, but that's still fine |
3575 | 0 | if (nbytes == l->alloc - line_frag) { |
3576 | 0 | char *cp_end = l->data + l->data_size; |
3577 | 0 | char *cp = cp_end-1; |
3578 | |
|
3579 | 0 | while (cp > (char *)l->data && *cp != '\n') |
3580 | 0 | cp--; |
3581 | | |
3582 | | // entire buffer is part of a single line |
3583 | 0 | if (cp == l->data) { |
3584 | 0 | line_frag = l->data_size; |
3585 | 0 | char *rp = realloc(l->data, l->alloc * 2 + 8); |
3586 | 0 | if (!rp) |
3587 | 0 | goto err; |
3588 | 0 | l->alloc *= 2; |
3589 | 0 | l->data = rp; |
3590 | 0 | assert(l->alloc >= l->data_size); |
3591 | 0 | assert(l->alloc >= line_frag); |
3592 | 0 | assert(l->alloc >= l->alloc - line_frag); |
3593 | 0 | goto longer_line; |
3594 | 0 | } |
3595 | 0 | cp++; |
3596 | | |
3597 | | // line holds the remainder of our line. |
3598 | 0 | if (ks_resize(&line, cp_end - cp) < 0) |
3599 | 0 | goto err; |
3600 | 0 | memcpy(line.s, cp, cp_end - cp); |
3601 | 0 | line_frag = cp_end - cp; |
3602 | 0 | l->data_size = l->alloc - line_frag; |
3603 | 0 | } else { |
3604 | | // out of buffer |
3605 | 0 | line_frag = 0; |
3606 | 0 | } |
3607 | | |
3608 | 0 | l->serial = fd->serial++; |
3609 | | //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial); |
3610 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l, |
3611 | 0 | cleanup_sp_lines, cleanup_sp_bams, 0) < 0) |
3612 | 0 | goto err; |
3613 | 0 | pthread_mutex_lock(&fd->command_m); |
3614 | 0 | if (fd->command == SAM_CLOSE) { |
3615 | 0 | pthread_mutex_unlock(&fd->command_m); |
3616 | 0 | l = NULL; |
3617 | 0 | goto tidyup; |
3618 | 0 | } |
3619 | 0 | l = NULL; // Now "owned" by sam_parse_worker() |
3620 | 0 | pthread_mutex_unlock(&fd->command_m); |
3621 | 0 | } |
3622 | | |
3623 | 0 | if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0) |
3624 | 0 | goto err; |
3625 | | |
3626 | | // At EOF, wait for close request. |
3627 | | // (In future if we add support for seek, this is where we need to catch it.) |
3628 | 0 | for (;;) { |
3629 | 0 | pthread_mutex_lock(&fd->command_m); |
3630 | 0 | if (fd->command == SAM_NONE) |
3631 | 0 | pthread_cond_wait(&fd->command_c, &fd->command_m); |
3632 | 0 | switch (fd->command) { |
3633 | 0 | case SAM_CLOSE: |
3634 | 0 | pthread_cond_signal(&fd->command_c); |
3635 | 0 | pthread_mutex_unlock(&fd->command_m); |
3636 | 0 | hts_tpool_process_shutdown(fd->q); |
3637 | 0 | goto tidyup; |
3638 | | |
3639 | 0 | default: |
3640 | 0 | pthread_mutex_unlock(&fd->command_m); |
3641 | 0 | break; |
3642 | 0 | } |
3643 | 0 | } |
3644 | | |
3645 | 0 | tidyup: |
3646 | 0 | pthread_mutex_lock(&fd->command_m); |
3647 | 0 | fd->command = SAM_CLOSE_DONE; |
3648 | 0 | pthread_cond_signal(&fd->command_c); |
3649 | 0 | pthread_mutex_unlock(&fd->command_m); |
3650 | |
|
3651 | 0 | if (l) { |
3652 | 0 | pthread_mutex_lock(&fd->lines_m); |
3653 | 0 | l->next = fd->lines; |
3654 | 0 | fd->lines = l; |
3655 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3656 | 0 | } |
3657 | 0 | free(line.s); |
3658 | |
|
3659 | 0 | return NULL; |
3660 | | |
3661 | 0 | err: |
3662 | 0 | sam_state_err(fd, errno ? errno : ENOMEM); |
3663 | 0 | hts_tpool_process_shutdown(fd->q); |
3664 | 0 | goto tidyup; |
3665 | 0 | } |
3666 | | |
3667 | | // Runs in its own thread. |
3668 | | // Takes encoded blocks of SAM off the thread results queue and writes them |
3669 | | // to our output stream. |
3670 | 0 | static void *sam_dispatcher_write(void *vp) { |
3671 | 0 | htsFile *fp = vp; |
3672 | 0 | SAM_state *fd = fp->state; |
3673 | 0 | hts_tpool_result *r; |
3674 | | |
3675 | | // Iterates until result queue is shutdown, where it returns NULL. |
3676 | 0 | while ((r = hts_tpool_next_result_wait(fd->q))) { |
3677 | 0 | sp_lines *gl = (sp_lines *)hts_tpool_result_data(r); |
3678 | 0 | if (!gl) { |
3679 | 0 | sam_state_err(fd, ENOMEM); |
3680 | 0 | goto err; |
3681 | 0 | } |
3682 | | |
3683 | 0 | if (fp->idx) { |
3684 | 0 | sp_bams *gb = gl->bams; |
3685 | 0 | int i = 0, count = 0; |
3686 | 0 | while (i < gl->data_size) { |
3687 | 0 | int j = i; |
3688 | 0 | while (i < gl->data_size && gl->data[i] != '\n') |
3689 | 0 | i++; |
3690 | 0 | if (i < gl->data_size) |
3691 | 0 | i++; |
3692 | |
|
3693 | 0 | if (fp->is_bgzf) { |
3694 | 0 | if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0) |
3695 | 0 | goto err; |
3696 | 0 | if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j) |
3697 | 0 | goto err; |
3698 | 0 | } else { |
3699 | 0 | if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j) |
3700 | 0 | goto err; |
3701 | 0 | } |
3702 | | |
3703 | 0 | bam1_t *b = &gb->bams[count++]; |
3704 | 0 | if (fp->format.compression == bgzf) { |
3705 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, |
3706 | 0 | b->core.tid, b->core.pos, bam_endpos(b), |
3707 | 0 | bgzf_tell(fp->fp.bgzf), |
3708 | 0 | !(b->core.flag&BAM_FUNMAP)) < 0) { |
3709 | 0 | sam_state_err(fd, errno ? errno : ENOMEM); |
3710 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
3711 | 0 | bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1); |
3712 | 0 | goto err; |
3713 | 0 | } |
3714 | 0 | } else { |
3715 | 0 | if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
3716 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
3717 | 0 | sam_state_err(fd, errno ? errno : ENOMEM); |
3718 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
3719 | 0 | bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1); |
3720 | 0 | goto err; |
3721 | 0 | } |
3722 | 0 | } |
3723 | 0 | } |
3724 | | |
3725 | 0 | assert(count == gb->nbams); |
3726 | | |
3727 | | // Add bam array to free-list |
3728 | 0 | pthread_mutex_lock(&fd->lines_m); |
3729 | 0 | gb->next = fd->bams; |
3730 | 0 | fd->bams = gl->bams; |
3731 | 0 | gl->bams = NULL; |
3732 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3733 | 0 | } else { |
3734 | 0 | if (fp->is_bgzf) { |
3735 | | // We keep track of how much in the current block we have |
3736 | | // remaining => R. We look for the last newline in input |
3737 | | // [i] to [i+R], backwards => position N. |
3738 | | // |
3739 | | // If we find a newline, we write out bytes i to N. |
3740 | | // We know we cannot fit the next record in this bgzf block, |
3741 | | // so we flush what we have and copy input N to i+R into |
3742 | | // the start of a new block, and recompute a new R for that. |
3743 | | // |
3744 | | // If we don't find a newline (i==N) then we cannot extend |
3745 | | // the current block at all, so flush whatever is in it now |
3746 | | // if it ends on a newline. |
3747 | | // We still copy i(==N) to i+R to the next block and |
3748 | | // continue as before with a new R. |
3749 | | // |
3750 | | // The only exception on the flush is when we run out of |
3751 | | // data in the input. In that case we skip it as we don't |
3752 | | // yet know if the next record will fit. |
3753 | | // |
3754 | | // Both conditions share the same code here: |
3755 | | // - Look for newline (pos N) |
3756 | | // - Write i to N (which maybe 0) |
3757 | | // - Flush if block ends on newline and not end of input |
3758 | | // - write N to i+R |
3759 | |
|
3760 | 0 | int i = 0; |
3761 | 0 | BGZF *fb = fp->fp.bgzf; |
3762 | 0 | while (i < gl->data_size) { |
3763 | | // remaining space in block |
3764 | 0 | int R = BGZF_BLOCK_SIZE - fb->block_offset; |
3765 | 0 | int eod = 0; |
3766 | 0 | if (R > gl->data_size-i) |
3767 | 0 | R = gl->data_size-i, eod = 1; |
3768 | | |
3769 | | // Find last newline in input data |
3770 | 0 | int N = i + R; |
3771 | 0 | while (--N > i) { |
3772 | 0 | if (gl->data[N] == '\n') |
3773 | 0 | break; |
3774 | 0 | } |
3775 | |
|
3776 | 0 | if (N != i) { |
3777 | | // Found a newline |
3778 | 0 | N++; |
3779 | 0 | if (bgzf_write(fb, &gl->data[i], N-i) != N-i) |
3780 | 0 | goto err; |
3781 | 0 | } |
3782 | | |
3783 | | // Flush bgzf block |
3784 | 0 | int b_off = fb->block_offset; |
3785 | 0 | if (!eod && b_off && |
3786 | 0 | ((char *)fb->uncompressed_block)[b_off-1] == '\n') |
3787 | 0 | if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0) |
3788 | 0 | goto err; |
3789 | | |
3790 | | // Copy from N onwards into next block |
3791 | 0 | if (i+R > N) |
3792 | 0 | if (bgzf_write(fb, &gl->data[N], i+R - N) |
3793 | 0 | != i+R - N) |
3794 | 0 | goto err; |
3795 | | |
3796 | 0 | i = i+R; |
3797 | 0 | } |
3798 | 0 | } else { |
3799 | 0 | if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size) |
3800 | 0 | goto err; |
3801 | 0 | } |
3802 | 0 | } |
3803 | | |
3804 | 0 | hts_tpool_delete_result(r, 0); |
3805 | | |
3806 | | // Also updated by main thread |
3807 | 0 | pthread_mutex_lock(&fd->lines_m); |
3808 | 0 | gl->next = fd->lines; |
3809 | 0 | fd->lines = gl; |
3810 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3811 | 0 | } |
3812 | | |
3813 | 0 | sam_state_err(fd, 0); // success |
3814 | 0 | hts_tpool_process_shutdown(fd->q); |
3815 | 0 | return NULL; |
3816 | | |
3817 | 0 | err: |
3818 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3819 | 0 | return (void *)-1; |
3820 | 0 | } |
3821 | | |
3822 | | // Run from one of the worker threads. |
3823 | | // Convert a passed in array of BAMs (sp_bams) and converts to a block |
3824 | | // of text SAM records (sp_lines). |
3825 | 0 | static void *sam_format_worker(void *arg) { |
3826 | 0 | sp_bams *gb = (sp_bams *)arg; |
3827 | 0 | sp_lines *gl = NULL; |
3828 | 0 | int i; |
3829 | 0 | SAM_state *fd = gb->fd; |
3830 | 0 | htsFile *fp = fd->fp; |
3831 | | |
3832 | | // Use a block of SAM strings we had earlier if available. |
3833 | 0 | pthread_mutex_lock(&fd->lines_m); |
3834 | 0 | if (fd->lines) { |
3835 | 0 | gl = fd->lines; |
3836 | 0 | fd->lines = gl->next; |
3837 | 0 | } |
3838 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3839 | |
|
3840 | 0 | if (gl == NULL) { |
3841 | 0 | gl = calloc(1, sizeof(*gl)); |
3842 | 0 | if (!gl) { |
3843 | 0 | sam_state_err(fd, ENOMEM); |
3844 | 0 | return NULL; |
3845 | 0 | } |
3846 | 0 | gl->alloc = gl->data_size = 0; |
3847 | 0 | gl->data = NULL; |
3848 | 0 | } |
3849 | 0 | gl->serial = gb->serial; |
3850 | 0 | gl->next = NULL; |
3851 | |
|
3852 | 0 | kstring_t ks = {0, gl->alloc, gl->data}; |
3853 | |
|
3854 | 0 | for (i = 0; i < gb->nbams; i++) { |
3855 | 0 | if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) { |
3856 | 0 | sam_state_err(fd, errno ? errno : EIO); |
3857 | 0 | goto err; |
3858 | 0 | } |
3859 | 0 | kputc('\n', &ks); |
3860 | 0 | } |
3861 | | |
3862 | 0 | pthread_mutex_lock(&fd->lines_m); |
3863 | 0 | gl->data_size = ks.l; |
3864 | 0 | gl->alloc = ks.m; |
3865 | 0 | gl->data = ks.s; |
3866 | |
|
3867 | 0 | if (fp->idx) { |
3868 | | // Keep hold of the bam array a little longer as |
3869 | | // sam_dispatcher_write needs to use them for building the index. |
3870 | 0 | gl->bams = gb; |
3871 | 0 | } else { |
3872 | | // Add bam array to free-list |
3873 | 0 | gb->next = fd->bams; |
3874 | 0 | fd->bams = gb; |
3875 | 0 | } |
3876 | 0 | pthread_mutex_unlock(&fd->lines_m); |
3877 | |
|
3878 | 0 | return gl; |
3879 | | |
3880 | 0 | err: |
3881 | | // Possible race between this and fd->curr_bam. |
3882 | | // Easier to not free and leave it on the input list so it |
3883 | | // gets freed there instead? |
3884 | | // sam_free_sp_bams(gb); |
3885 | 0 | if (gl) { |
3886 | 0 | free(gl->data); |
3887 | 0 | free(gl); |
3888 | 0 | } |
3889 | 0 | return NULL; |
3890 | 0 | } |
3891 | | |
3892 | 0 | int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { |
3893 | 0 | if (fp->state) |
3894 | 0 | return 0; |
3895 | | |
3896 | 0 | if (!(fp->state = sam_state_create(fp))) |
3897 | 0 | return -1; |
3898 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3899 | |
|
3900 | 0 | pthread_mutex_init(&fd->lines_m, NULL); |
3901 | 0 | pthread_mutex_init(&fd->command_m, NULL); |
3902 | 0 | pthread_cond_init(&fd->command_c, NULL); |
3903 | 0 | fd->p = p->pool; |
3904 | 0 | int qsize = p->qsize; |
3905 | 0 | if (!qsize) |
3906 | 0 | qsize = 2*hts_tpool_size(fd->p); |
3907 | 0 | fd->q = hts_tpool_process_init(fd->p, qsize, 0); |
3908 | 0 | if (!fd->q) { |
3909 | 0 | sam_state_destroy(fp); |
3910 | 0 | return -1; |
3911 | 0 | } |
3912 | | |
3913 | 0 | if (fp->format.compression == bgzf) |
3914 | 0 | return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); |
3915 | | |
3916 | 0 | return 0; |
3917 | 0 | } |
3918 | | |
3919 | 0 | int sam_set_threads(htsFile *fp, int nthreads) { |
3920 | 0 | if (nthreads <= 0) |
3921 | 0 | return 0; |
3922 | | |
3923 | 0 | htsThreadPool p; |
3924 | 0 | p.pool = hts_tpool_init(nthreads); |
3925 | 0 | p.qsize = nthreads*2; |
3926 | |
|
3927 | 0 | int ret = sam_set_thread_pool(fp, &p); |
3928 | 0 | if (ret < 0) |
3929 | 0 | return ret; |
3930 | | |
3931 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
3932 | 0 | fd->own_pool = 1; |
3933 | |
|
3934 | 0 | return 0; |
3935 | 0 | } |
3936 | | |
3937 | | typedef struct { |
3938 | | kstring_t name; |
3939 | | kstring_t comment; // NB: pointer into name, do not free |
3940 | | kstring_t seq; |
3941 | | kstring_t qual; |
3942 | | int casava; |
3943 | | int aux; |
3944 | | int rnum; |
3945 | | char BC[3]; // aux tag ID for barcode |
3946 | | khash_t(tag) *tags; // which aux tags to use (if empty, use all). |
3947 | | char nprefix; |
3948 | | int sra_names; |
3949 | | } fastq_state; |
3950 | | |
3951 | | // Initialise fastq state. |
3952 | | // Name char of '@' or '>' distinguishes fastq vs fasta variant |
3953 | 3.49k | static fastq_state *fastq_state_init(int name_char) { |
3954 | 3.49k | fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); |
3955 | 3.49k | if (!x) |
3956 | 0 | return NULL; |
3957 | 3.49k | strcpy(x->BC, "BC"); |
3958 | 3.49k | x->nprefix = name_char; |
3959 | | |
3960 | 3.49k | return x; |
3961 | 3.49k | } |
3962 | | |
3963 | 4.65k | void fastq_state_destroy(htsFile *fp) { |
3964 | 4.65k | if (fp->state) { |
3965 | 3.49k | fastq_state *x = (fastq_state *)fp->state; |
3966 | 3.49k | if (x->tags) |
3967 | 3.49k | kh_destroy(tag, x->tags); |
3968 | 3.49k | ks_free(&x->name); |
3969 | 3.49k | ks_free(&x->seq); |
3970 | 3.49k | ks_free(&x->qual); |
3971 | 3.49k | free(fp->state); |
3972 | 3.49k | } |
3973 | 4.65k | } |
3974 | | |
3975 | 0 | int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { |
3976 | 0 | va_list args; |
3977 | |
|
3978 | 0 | if (!fp) |
3979 | 0 | return -1; |
3980 | 0 | if (!fp->state) |
3981 | 0 | if (!(fp->state = fastq_state_init(fp->format.format == fastq_format |
3982 | 0 | ? '@' : '>'))) |
3983 | 0 | return -1; |
3984 | | |
3985 | 0 | fastq_state *x = (fastq_state *)fp->state; |
3986 | |
|
3987 | 0 | switch (opt) { |
3988 | 0 | case FASTQ_OPT_CASAVA: |
3989 | 0 | x->casava = 1; |
3990 | 0 | break; |
3991 | | |
3992 | 0 | case FASTQ_OPT_NAME2: |
3993 | 0 | x->sra_names = 1; |
3994 | 0 | break; |
3995 | | |
3996 | 0 | case FASTQ_OPT_AUX: { |
3997 | 0 | va_start(args, opt); |
3998 | 0 | x->aux = 1; |
3999 | 0 | char *tag = va_arg(args, char *); |
4000 | 0 | va_end(args); |
4001 | 0 | if (tag && strcmp(tag, "1") != 0) { |
4002 | 0 | if (!x->tags) |
4003 | 0 | if (!(x->tags = kh_init(tag))) |
4004 | 0 | return -1; |
4005 | | |
4006 | 0 | size_t i, tlen = strlen(tag); |
4007 | 0 | for (i = 0; i+3 <= tlen+1; i += 3) { |
4008 | 0 | if (tag[i+0] == ',' || tag[i+1] == ',' || |
4009 | 0 | !(tag[i+2] == ',' || tag[i+2] == '\0')) { |
4010 | 0 | hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); |
4011 | 0 | break; |
4012 | 0 | } |
4013 | 0 | int ret, tcode = tag[i+0]*256 + tag[i+1]; |
4014 | 0 | kh_put(tag, x->tags, tcode, &ret); |
4015 | 0 | if (ret < 0) |
4016 | 0 | return -1; |
4017 | 0 | } |
4018 | 0 | } |
4019 | 0 | break; |
4020 | 0 | } |
4021 | | |
4022 | 0 | case FASTQ_OPT_BARCODE: { |
4023 | 0 | va_start(args, opt); |
4024 | 0 | char *bc = va_arg(args, char *); |
4025 | 0 | va_end(args); |
4026 | 0 | strncpy(x->BC, bc, 2); |
4027 | 0 | x->BC[2] = 0; |
4028 | 0 | break; |
4029 | 0 | } |
4030 | | |
4031 | 0 | case FASTQ_OPT_RNUM: |
4032 | 0 | x->rnum = 1; |
4033 | 0 | break; |
4034 | | |
4035 | 0 | default: |
4036 | 0 | break; |
4037 | 0 | } |
4038 | 0 | return 0; |
4039 | 0 | } |
4040 | | |
4041 | 55.8M | static int fastq_parse1(htsFile *fp, bam1_t *b) { |
4042 | 55.8M | fastq_state *x = (fastq_state *)fp->state; |
4043 | 55.8M | size_t i, l; |
4044 | 55.8M | int ret = 0; |
4045 | | |
4046 | 55.8M | if (fp->format.format == fasta_format && fp->line.s) { |
4047 | | // For FASTA we've already read the >name line; steal it |
4048 | | // Not the most efficient, but we don't optimise for fasta reading. |
4049 | 55.8M | if (fp->line.l == 0) |
4050 | 1.65k | return -1; // EOF |
4051 | | |
4052 | 55.8M | free(x->name.s); |
4053 | 55.8M | x->name = fp->line; |
4054 | 55.8M | fp->line.l = fp->line.m = 0; |
4055 | 55.8M | fp->line.s = NULL; |
4056 | 55.8M | } else { |
4057 | | // Read a FASTQ format entry. |
4058 | 5.86k | ret = hts_getline(fp, KS_SEP_LINE, &x->name); |
4059 | 5.86k | if (ret == -1) |
4060 | 51 | return -1; // EOF |
4061 | 5.81k | else if (ret < -1) |
4062 | 156 | return ret; // ERR |
4063 | 5.86k | } |
4064 | | |
4065 | | // Name |
4066 | 55.8M | if (*x->name.s != x->nprefix) |
4067 | 78 | return -2; |
4068 | | |
4069 | | // Reverse the SRA strangeness of putting the run_name.number before |
4070 | | // the read name. |
4071 | 55.8M | i = 0; |
4072 | 55.8M | char *name = x->name.s+1; |
4073 | 55.8M | if (x->sra_names) { |
4074 | 0 | char *cp = strpbrk(x->name.s, " \t"); |
4075 | 0 | if (cp) { |
4076 | 0 | while (*cp == ' ' || *cp == '\t') |
4077 | 0 | cp++; |
4078 | 0 | *--cp = '@'; |
4079 | 0 | i = cp - x->name.s; |
4080 | 0 | name = cp+1; |
4081 | 0 | } |
4082 | 0 | } |
4083 | | |
4084 | 55.8M | l = x->name.l; |
4085 | 55.8M | char *s = x->name.s; |
4086 | 147M | while (i < l && !isspace_c(s[i])) |
4087 | 92.0M | i++; |
4088 | 55.8M | if (i < l) { |
4089 | 455k | s[i] = 0; |
4090 | 455k | x->name.l = i++; |
4091 | 455k | } |
4092 | | |
4093 | | // Comment; a kstring struct, but pointer into name line. (Do not free) |
4094 | 58.3M | while (i < l && isspace_c(s[i])) |
4095 | 2.44M | i++; |
4096 | 55.8M | x->comment.s = s+i; |
4097 | 55.8M | x->comment.l = l - i; |
4098 | | |
4099 | | // Seq |
4100 | 55.8M | x->seq.l = 0; |
4101 | 184M | for (;;) { |
4102 | 184M | if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) |
4103 | 3.02k | if (fp->format.format == fastq_format || ret < -1) |
4104 | 1.34k | return -2; |
4105 | 184M | if (ret == -1 || |
4106 | 184M | *fp->line.s == (fp->format.format == fastq_format ? '+' : '>')) |
4107 | 55.8M | break; |
4108 | 128M | if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) |
4109 | 0 | return -2; |
4110 | 128M | } |
4111 | | |
4112 | | // Qual |
4113 | 55.8M | if (fp->format.format == fastq_format) { |
4114 | 2.47k | size_t remainder = x->seq.l; |
4115 | 2.47k | x->qual.l = 0; |
4116 | 12.0k | do { |
4117 | 12.0k | if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) |
4118 | 45 | return -2; |
4119 | 12.0k | if (fp->line.l > remainder) |
4120 | 75 | return -2; |
4121 | 11.9k | if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0) |
4122 | 0 | return -2; |
4123 | 11.9k | remainder -= fp->line.l; |
4124 | 11.9k | } while (remainder > 0); |
4125 | | |
4126 | | // Decr qual |
4127 | 9.06k | for (i = 0; i < x->qual.l; i++) |
4128 | 6.70k | x->qual.s[i] -= '!'; |
4129 | 2.35k | } |
4130 | | |
4131 | 55.8M | int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; |
4132 | 55.8M | if (x->name.l > 2 && |
4133 | 55.8M | x->name.s[x->name.l-2] == '/' && |
4134 | 55.8M | isdigit_c(x->name.s[x->name.l-1])) { |
4135 | 8.99k | switch(x->name.s[x->name.l-1]) { |
4136 | 1.74k | case '1': flag |= BAM_FREAD1 | pflag; break; |
4137 | 2.52k | case '2': flag |= BAM_FREAD2 | pflag; break; |
4138 | 4.73k | default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; |
4139 | 8.99k | } |
4140 | 8.99k | x->name.s[x->name.l-=2] = 0; |
4141 | 8.99k | } |
4142 | | |
4143 | | // Convert to BAM |
4144 | 55.8M | ret = bam_set1(b, |
4145 | 55.8M | x->name.s + x->name.l - name, name, |
4146 | 55.8M | flag, |
4147 | 55.8M | -1, -1, 0, // ref '*', pos, mapq, |
4148 | 55.8M | 0, NULL, // no cigar, |
4149 | 55.8M | -1, -1, 0, // mate |
4150 | 55.8M | x->seq.l, x->seq.s, x->qual.s, |
4151 | 55.8M | 0); |
4152 | | |
4153 | | // Identify Illumina CASAVA strings. |
4154 | | // <read>:<is_filtered>:<control_bits>:<barcode_sequence> |
4155 | 55.8M | char *barcode = NULL; |
4156 | 55.8M | int barcode_len = 0; |
4157 | 55.8M | kstring_t *kc = &x->comment; |
4158 | 55.8M | char *endptr; |
4159 | 55.8M | if (x->casava && |
4160 | | // \d:[YN]:\d+:[ACGTN]+ |
4161 | 55.8M | kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) && |
4162 | 55.8M | strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4 |
4163 | 55.8M | && *endptr == ':') { |
4164 | | |
4165 | | // read num |
4166 | 0 | switch(kc->s[0]) { |
4167 | 0 | case '1': b->core.flag |= BAM_FREAD1 | pflag; break; |
4168 | 0 | case '2': b->core.flag |= BAM_FREAD2 | pflag; break; |
4169 | 0 | default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; |
4170 | 0 | } |
4171 | | |
4172 | 0 | if (kc->s[2] == 'Y') |
4173 | 0 | b->core.flag |= BAM_FQCFAIL; |
4174 | | |
4175 | | // Barcode, maybe numeric in which case we skip it |
4176 | 0 | if (!isdigit_c(endptr[1])) { |
4177 | 0 | barcode = endptr+1; |
4178 | 0 | for (i = barcode - kc->s; i < kc->l; i++) |
4179 | 0 | if (isspace_c(kc->s[i])) |
4180 | 0 | break; |
4181 | |
|
4182 | 0 | kc->s[i] = 0; |
4183 | 0 | barcode_len = i+1-(barcode - kc->s); |
4184 | 0 | } |
4185 | 0 | } |
4186 | | |
4187 | 55.8M | if (ret >= 0 && barcode_len) |
4188 | 0 | if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0) |
4189 | 0 | ret = -2; |
4190 | | |
4191 | 55.8M | if (!x->aux) |
4192 | 55.8M | return ret; |
4193 | | |
4194 | | // Identify any SAM style aux tags in comments too. |
4195 | 0 | if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0) |
4196 | 0 | ret = -2; |
4197 | |
|
4198 | 0 | return ret; |
4199 | 55.8M | } |
4200 | | |
4201 | | // Internal component of sam_read1 below |
4202 | 4.56k | static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
4203 | 4.56k | int ret = bam_read1(fp->fp.bgzf, b); |
4204 | 4.56k | if (h && ret >= 0) { |
4205 | 3.12k | if (b->core.tid >= h->n_targets || b->core.tid < -1 || |
4206 | 3.12k | b->core.mtid >= h->n_targets || b->core.mtid < -1) { |
4207 | 316 | errno = ERANGE; |
4208 | 316 | return -3; |
4209 | 316 | } |
4210 | 3.12k | } |
4211 | 4.25k | return ret; |
4212 | 4.56k | } |
4213 | | |
4214 | | // Internal component of sam_read1 below |
4215 | 8.50k | static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { |
4216 | 8.50k | int ret = cram_get_bam_seq(fp->fp.cram, b); |
4217 | 8.50k | if (ret < 0) |
4218 | 8.50k | return cram_eof(fp->fp.cram) ? -1 : -2; |
4219 | | |
4220 | 0 | if (bam_tag2cigar(*b, 1, 1) < 0) |
4221 | 0 | return -2; |
4222 | | |
4223 | 0 | return ret; |
4224 | 0 | } |
4225 | | |
4226 | | // Internal component of sam_read1 below |
4227 | 354k | static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { |
4228 | 354k | int ret; |
4229 | | |
4230 | | // Consume 1st line after header parsing as it wasn't using peek |
4231 | 354k | if (fp->line.l != 0) { |
4232 | 39 | ret = sam_parse1(&fp->line, h, b); |
4233 | 39 | fp->line.l = 0; |
4234 | 39 | return ret; |
4235 | 39 | } |
4236 | | |
4237 | 354k | if (fp->state) { |
4238 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4239 | |
|
4240 | 0 | if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { |
4241 | | // We don't support multi-threaded SAM parsing with seeks yet. |
4242 | 0 | int ret; |
4243 | 0 | if ((ret = sam_state_destroy(fp)) < 0) { |
4244 | 0 | errno = -ret; |
4245 | 0 | return -2; |
4246 | 0 | } |
4247 | 0 | if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) |
4248 | 0 | return -1; |
4249 | 0 | fp->fp.bgzf->seeked = 0; |
4250 | 0 | goto err_recover; |
4251 | 0 | } |
4252 | | |
4253 | 0 | if (!fd->h) { |
4254 | 0 | fd->h = h; |
4255 | 0 | fd->h->ref_count++; |
4256 | | // Ensure hrecs is initialised now as we don't want multiple |
4257 | | // threads trying to do this simultaneously. |
4258 | 0 | if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) |
4259 | 0 | return -2; |
4260 | | |
4261 | | // We can only do this once we've got a header |
4262 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, |
4263 | 0 | fp) != 0) |
4264 | 0 | return -2; |
4265 | 0 | fd->dispatcher_set = 1; |
4266 | 0 | } |
4267 | | |
4268 | 0 | if (fd->h != h) { |
4269 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4270 | 0 | return -1; |
4271 | 0 | } |
4272 | | |
4273 | 0 | sp_bams *gb = fd->curr_bam; |
4274 | 0 | if (!gb) { |
4275 | 0 | if (fd->errcode) { |
4276 | | // In case reader failed |
4277 | 0 | errno = fd->errcode; |
4278 | 0 | return -2; |
4279 | 0 | } |
4280 | 0 | hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); |
4281 | 0 | if (!r) |
4282 | 0 | return -2; |
4283 | 0 | fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); |
4284 | 0 | hts_tpool_delete_result(r, 0); |
4285 | 0 | } |
4286 | 0 | if (!gb) |
4287 | 0 | return fd->errcode ? -2 : -1; |
4288 | 0 | bam1_t *b_array = (bam1_t *)gb->bams; |
4289 | 0 | if (fd->curr_idx < gb->nbams) |
4290 | 0 | if (!bam_copy1(b, &b_array[fd->curr_idx++])) |
4291 | 0 | return -2; |
4292 | 0 | if (fd->curr_idx == gb->nbams) { |
4293 | 0 | pthread_mutex_lock(&fd->lines_m); |
4294 | 0 | gb->next = fd->bams; |
4295 | 0 | fd->bams = gb; |
4296 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4297 | |
|
4298 | 0 | fd->curr_bam = NULL; |
4299 | 0 | fd->curr_idx = 0; |
4300 | 0 | } |
4301 | |
|
4302 | 0 | ret = 0; |
4303 | |
|
4304 | 354k | } else { |
4305 | 354k | err_recover: |
4306 | 354k | ret = hts_getline(fp, KS_SEP_LINE, &fp->line); |
4307 | 354k | if (ret < 0) return ret; |
4308 | | |
4309 | 343k | ret = sam_parse1(&fp->line, h, b); |
4310 | 343k | fp->line.l = 0; |
4311 | 343k | if (ret < 0) { |
4312 | 8.30k | hts_log_warning("Parse error at line %lld", (long long)fp->lineno); |
4313 | 8.30k | if (h && h->ignore_sam_err) goto err_recover; |
4314 | 8.30k | } |
4315 | 343k | } |
4316 | | |
4317 | 343k | return ret; |
4318 | 354k | } |
4319 | | |
4320 | | // Returns 0 on success, |
4321 | | // -1 on EOF, |
4322 | | // <-1 on error |
4323 | | int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) |
4324 | 56.2M | { |
4325 | 56.2M | int ret, pass_filter; |
4326 | | |
4327 | 56.2M | do { |
4328 | 56.2M | switch (fp->format.format) { |
4329 | 4.56k | case bam: |
4330 | 4.56k | ret = sam_read1_bam(fp, h, b); |
4331 | 4.56k | break; |
4332 | | |
4333 | 8.50k | case cram: |
4334 | 8.50k | ret = sam_read1_cram(fp, h, &b); |
4335 | 8.50k | break; |
4336 | | |
4337 | 354k | case sam: |
4338 | 354k | ret = sam_read1_sam(fp, h, b); |
4339 | 354k | break; |
4340 | | |
4341 | 55.8M | case fasta_format: |
4342 | 55.8M | case fastq_format: { |
4343 | 55.8M | fastq_state *x = (fastq_state *)fp->state; |
4344 | 55.8M | if (!x) { |
4345 | 3.49k | if (!(fp->state = fastq_state_init(fp->format.format |
4346 | 3.49k | == fastq_format ? '@' : '>'))) |
4347 | 0 | return -2; |
4348 | 3.49k | } |
4349 | | |
4350 | 55.8M | return fastq_parse1(fp, b); |
4351 | 55.8M | } |
4352 | | |
4353 | 0 | case empty_format: |
4354 | 0 | errno = EPIPE; |
4355 | 0 | return -3; |
4356 | | |
4357 | 0 | default: |
4358 | 0 | errno = EFTYPE; |
4359 | 0 | return -3; |
4360 | 56.2M | } |
4361 | | |
4362 | 367k | pass_filter = (ret >= 0 && fp->filter) |
4363 | 367k | ? sam_passes_filter(h, b, fp->filter) |
4364 | 367k | : 1; |
4365 | 367k | } while (pass_filter == 0); |
4366 | | |
4367 | 367k | return pass_filter < 0 ? -2 : ret; |
4368 | 56.2M | } |
4369 | | |
4370 | | // With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise |
4371 | | // this code isn't vectorised and runs far slower than is necessary (even |
4372 | | // with the restrict keyword being used). |
4373 | | static inline void HTS_OPT3 |
4374 | 1.23k | add33(uint8_t *a, const uint8_t * b, int32_t len) { |
4375 | 1.23k | uint32_t i; |
4376 | 4.91k | for (i = 0; i < len; i++) |
4377 | 3.67k | a[i] = b[i]+33; |
4378 | 1.23k | } |
4379 | | |
4380 | | static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4381 | 18.7M | { |
4382 | 18.7M | int i, r = 0; |
4383 | 18.7M | uint8_t *s, *end; |
4384 | 18.7M | const bam1_core_t *c = &b->core; |
4385 | | |
4386 | 18.7M | if (c->l_qname == 0) |
4387 | 0 | return -1; |
4388 | 18.7M | r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str); |
4389 | 18.7M | r |= kputc_('\t', str); // query name |
4390 | 18.7M | r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag |
4391 | 18.7M | if (c->tid >= 0) { // chr |
4392 | 60.4k | r |= kputs(h->target_name[c->tid] , str); |
4393 | 60.4k | r |= kputc_('\t', str); |
4394 | 18.6M | } else r |= kputsn_("*\t", 2, str); |
4395 | 18.7M | r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos |
4396 | 18.7M | r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual |
4397 | 18.7M | if (c->n_cigar) { // cigar |
4398 | 102k | uint32_t *cigar = bam_get_cigar(b); |
4399 | 7.48M | for (i = 0; i < c->n_cigar; ++i) { |
4400 | 7.37M | r |= kputw(bam_cigar_oplen(cigar[i]), str); |
4401 | 7.37M | r |= kputc_(bam_cigar_opchr(cigar[i]), str); |
4402 | 7.37M | } |
4403 | 18.6M | } else r |= kputc_('*', str); |
4404 | 18.7M | r |= kputc_('\t', str); |
4405 | 18.7M | if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr |
4406 | 4.26k | else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str); |
4407 | 2.63k | else { |
4408 | 2.63k | r |= kputs(h->target_name[c->mtid], str); |
4409 | 2.63k | r |= kputc_('\t', str); |
4410 | 2.63k | } |
4411 | 18.7M | r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos |
4412 | 18.7M | r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len |
4413 | 18.7M | if (c->l_qseq) { // seq and qual |
4414 | 729k | uint8_t *s = bam_get_seq(b); |
4415 | 729k | if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; |
4416 | 729k | char *cp = str->s + str->l; |
4417 | | |
4418 | | // Sequence, 2 bases at a time |
4419 | 729k | nibble2base(s, cp, c->l_qseq); |
4420 | 729k | cp[c->l_qseq] = '\t'; |
4421 | 729k | cp += c->l_qseq+1; |
4422 | | |
4423 | | // Quality |
4424 | 729k | s = bam_get_qual(b); |
4425 | 729k | i = 0; |
4426 | 729k | if (s[0] == 0xff) { |
4427 | 728k | cp[i++] = '*'; |
4428 | 728k | } else { |
4429 | 1.23k | add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33; |
4430 | 1.23k | i = c->l_qseq; |
4431 | 1.23k | } |
4432 | 729k | cp[i] = 0; |
4433 | 729k | cp += i; |
4434 | 729k | str->l = cp - str->s; |
4435 | 18.0M | } else r |= kputsn_("*\t*", 3, str); |
4436 | | |
4437 | 18.7M | s = bam_get_aux(b); // aux |
4438 | 18.7M | end = b->data + b->l_data; |
4439 | | |
4440 | 21.9M | while (end - s >= 4) { |
4441 | 3.23M | r |= kputc_('\t', str); |
4442 | 3.23M | if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL) |
4443 | 287 | goto bad_aux; |
4444 | 3.23M | } |
4445 | 18.7M | r |= kputsn("", 0, str); // nul terminate |
4446 | 18.7M | if (r < 0) goto mem_err; |
4447 | | |
4448 | 18.7M | return str->l; |
4449 | | |
4450 | 287 | bad_aux: |
4451 | 287 | hts_log_error("Corrupted aux data for read %.*s flag %d", |
4452 | 287 | b->core.l_qname, bam_get_qname(b), b->core.flag); |
4453 | 287 | errno = EINVAL; |
4454 | 287 | return -1; |
4455 | | |
4456 | 0 | mem_err: |
4457 | 0 | hts_log_error("Out of memory"); |
4458 | 0 | errno = ENOMEM; |
4459 | 0 | return -1; |
4460 | 18.7M | } |
4461 | | |
4462 | | int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) |
4463 | 18.7M | { |
4464 | 18.7M | str->l = 0; |
4465 | 18.7M | return sam_format1_append(h, b, str); |
4466 | 18.7M | } |
4467 | | |
4468 | | static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); |
4469 | | int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) |
4470 | 0 | { |
4471 | 0 | unsigned flag = b->core.flag; |
4472 | 0 | int i, e = 0, len = b->core.l_qseq; |
4473 | 0 | uint8_t *seq, *qual; |
4474 | |
|
4475 | 0 | str->l = 0; |
4476 | | |
4477 | | // Name |
4478 | 0 | if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) |
4479 | 0 | return -1; |
4480 | | |
4481 | | // /1 or /2 suffix |
4482 | 0 | if (x && x->rnum && (flag & BAM_FPAIRED)) { |
4483 | 0 | int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); |
4484 | 0 | if (r12 == BAM_FREAD1) { |
4485 | 0 | if (kputs("/1", str) == EOF) |
4486 | 0 | return -1; |
4487 | 0 | } else if (r12 == BAM_FREAD2) { |
4488 | 0 | if (kputs("/2", str) == EOF) |
4489 | 0 | return -1; |
4490 | 0 | } |
4491 | 0 | } |
4492 | | |
4493 | | // Illumina CASAVA tag. |
4494 | | // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero> |
4495 | 0 | if (x && x->casava) { |
4496 | 0 | int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; |
4497 | 0 | char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; |
4498 | 0 | uint8_t *bc = bam_aux_get(b, x->BC); |
4499 | 0 | if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, |
4500 | 0 | bc ? (char *)bc+1 : "0") < 0) |
4501 | 0 | return -1; |
4502 | | |
4503 | 0 | if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { |
4504 | 0 | hts_log_warning("BC tag starts with non-sequence base; using '0'"); |
4505 | 0 | str->l -= strlen((char *)bc)-2; // limit to 1 char |
4506 | 0 | str->s[str->l-1] = '0'; |
4507 | 0 | str->s[str->l] = 0; |
4508 | 0 | bc = NULL; |
4509 | 0 | } |
4510 | | |
4511 | | // Replace any non-alpha with '+'. Ie seq-seq to seq+seq |
4512 | 0 | if (bc) { |
4513 | 0 | int l = strlen((char *)bc+1); |
4514 | 0 | char *c = (char *)str->s + str->l - l; |
4515 | 0 | for (i = 0; i < l; i++) { |
4516 | 0 | if (!isalpha_c(c[i])) |
4517 | 0 | c[i] = '+'; |
4518 | 0 | else if (islower_c(c[i])) |
4519 | 0 | c[i] = toupper_c(c[i]); |
4520 | 0 | } |
4521 | 0 | } |
4522 | 0 | } |
4523 | | |
4524 | | // Aux tags |
4525 | 0 | if (x && x->aux) { |
4526 | 0 | uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; |
4527 | 0 | while (s && end - s >= 4) { |
4528 | 0 | int tt = s[0]*256 + s[1]; |
4529 | 0 | if (x->tags == NULL || |
4530 | 0 | kh_get(tag, x->tags, tt) != kh_end(x->tags)) { |
4531 | 0 | e |= kputc_('\t', str) < 0; |
4532 | 0 | if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) |
4533 | 0 | return -1; |
4534 | 0 | } else { |
4535 | 0 | s = skip_aux(s+2, end); |
4536 | 0 | } |
4537 | 0 | } |
4538 | 0 | e |= kputsn("", 0, str) < 0; // nul terminate |
4539 | 0 | } |
4540 | | |
4541 | 0 | if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; |
4542 | 0 | e |= kputc_('\n', str) < 0; |
4543 | | |
4544 | | // Seq line |
4545 | 0 | seq = bam_get_seq(b); |
4546 | 0 | if (flag & BAM_FREVERSE) |
4547 | 0 | for (i = len-1; i >= 0; i--) |
4548 | 0 | e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; |
4549 | 0 | else |
4550 | 0 | for (i = 0; i < len; i++) |
4551 | 0 | e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; |
4552 | | |
4553 | | |
4554 | | // Qual line |
4555 | 0 | if (x->nprefix == '@') { |
4556 | 0 | kputsn("\n+\n", 3, str); |
4557 | 0 | qual = bam_get_qual(b); |
4558 | 0 | if (qual[0] == 0xff) |
4559 | 0 | for (i = 0; i < len; i++) |
4560 | 0 | e |= kputc_('B', str) < 0; |
4561 | 0 | else if (flag & BAM_FREVERSE) |
4562 | 0 | for (i = len-1; i >= 0; i--) |
4563 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4564 | 0 | else |
4565 | 0 | for (i = 0; i < len; i++) |
4566 | 0 | e |= kputc_(33 + qual[i], str) < 0; |
4567 | |
|
4568 | 0 | } |
4569 | 0 | e |= kputc('\n', str) < 0; |
4570 | |
|
4571 | 0 | return e ? -1 : str->l; |
4572 | 0 | } |
4573 | | |
4574 | | // Sadly we need to be able to modify the bam_hdr here so we can |
4575 | | // reference count the structure. |
4576 | | int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) |
4577 | 56.2M | { |
4578 | 56.2M | switch (fp->format.format) { |
4579 | 0 | case binary_format: |
4580 | 0 | fp->format.category = sequence_data; |
4581 | 0 | fp->format.format = bam; |
4582 | | /* fall-through */ |
4583 | 18.7M | case bam: |
4584 | 18.7M | return bam_write_idx1(fp, h, b); |
4585 | | |
4586 | 18.7M | case cram: |
4587 | 18.7M | return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b); |
4588 | | |
4589 | 0 | case text_format: |
4590 | 0 | fp->format.category = sequence_data; |
4591 | 0 | fp->format.format = sam; |
4592 | | /* fall-through */ |
4593 | 18.7M | case sam: |
4594 | 18.7M | if (fp->state) { |
4595 | 0 | SAM_state *fd = (SAM_state *)fp->state; |
4596 | | |
4597 | | // Threaded output |
4598 | 0 | if (!fd->h) { |
4599 | | // NB: discard const. We don't actually modify sam_hdr_t here, |
4600 | | // just data pointed to by it (which is a bit weasely still), |
4601 | | // but out cached pointer must be non-const as we want to |
4602 | | // destroy it later on and sam_hdr_destroy takes non-const. |
4603 | | // |
4604 | | // We do this because some tools do sam_hdr_destroy; sam_close |
4605 | | // while others do sam_close; sam_hdr_destroy. The former is |
4606 | | // an issue as we need the header still when flushing. |
4607 | 0 | fd->h = (sam_hdr_t *)h; |
4608 | 0 | fd->h->ref_count++; |
4609 | |
|
4610 | 0 | if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, |
4611 | 0 | fp) != 0) |
4612 | 0 | return -2; |
4613 | 0 | fd->dispatcher_set = 1; |
4614 | 0 | } |
4615 | | |
4616 | 0 | if (fd->h != h) { |
4617 | 0 | hts_log_error("SAM multi-threaded decoding does not support changing header"); |
4618 | 0 | return -2; |
4619 | 0 | } |
4620 | | |
4621 | | // Find a suitable BAM array to copy to |
4622 | 0 | sp_bams *gb = fd->curr_bam; |
4623 | 0 | if (!gb) { |
4624 | 0 | pthread_mutex_lock(&fd->lines_m); |
4625 | 0 | if (fd->bams) { |
4626 | 0 | fd->curr_bam = gb = fd->bams; |
4627 | 0 | fd->bams = gb->next; |
4628 | 0 | gb->next = NULL; |
4629 | 0 | gb->nbams = 0; |
4630 | 0 | gb->bam_mem = 0; |
4631 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4632 | 0 | } else { |
4633 | 0 | pthread_mutex_unlock(&fd->lines_m); |
4634 | 0 | if (!(gb = calloc(1, sizeof(*gb)))) return -1; |
4635 | 0 | if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { |
4636 | 0 | free(gb); |
4637 | 0 | return -1; |
4638 | 0 | } |
4639 | 0 | gb->nbams = 0; |
4640 | 0 | gb->abams = SAM_NBAM; |
4641 | 0 | gb->bam_mem = 0; |
4642 | 0 | gb->fd = fd; |
4643 | 0 | fd->curr_idx = 0; |
4644 | 0 | fd->curr_bam = gb; |
4645 | 0 | } |
4646 | 0 | } |
4647 | | |
4648 | 0 | if (!bam_copy1(&gb->bams[gb->nbams++], b)) |
4649 | 0 | return -2; |
4650 | 0 | gb->bam_mem += b->l_data + sizeof(*b); |
4651 | | |
4652 | | // Dispatch if full |
4653 | 0 | if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { |
4654 | 0 | gb->serial = fd->serial++; |
4655 | 0 | pthread_mutex_lock(&fd->command_m); |
4656 | 0 | if (fd->errcode != 0) { |
4657 | 0 | pthread_mutex_unlock(&fd->command_m); |
4658 | 0 | return -fd->errcode; |
4659 | 0 | } |
4660 | 0 | if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb, |
4661 | 0 | cleanup_sp_bams, |
4662 | 0 | cleanup_sp_lines, 0) < 0) { |
4663 | 0 | pthread_mutex_unlock(&fd->command_m); |
4664 | 0 | return -1; |
4665 | 0 | } |
4666 | 0 | pthread_mutex_unlock(&fd->command_m); |
4667 | 0 | fd->curr_bam = NULL; |
4668 | 0 | } |
4669 | | |
4670 | | // Dummy value as we don't know how long it really is. |
4671 | | // We could track file sizes via a SAM_state field, but I don't think |
4672 | | // it is necessary. |
4673 | 0 | return 1; |
4674 | 18.7M | } else { |
4675 | 18.7M | if (sam_format1(h, b, &fp->line) < 0) return -1; |
4676 | 18.7M | kputc('\n', &fp->line); |
4677 | 18.7M | if (fp->is_bgzf) { |
4678 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4679 | 0 | return -1; |
4680 | 0 | if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4681 | 18.7M | } else { |
4682 | 18.7M | if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; |
4683 | 18.7M | } |
4684 | | |
4685 | 18.7M | if (fp->idx) { |
4686 | 0 | if (fp->format.compression == bgzf) { |
4687 | 0 | if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4688 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4689 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4690 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4691 | 0 | return -1; |
4692 | 0 | } |
4693 | 0 | } else { |
4694 | 0 | if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b), |
4695 | 0 | bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) { |
4696 | 0 | hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", |
4697 | 0 | bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1); |
4698 | 0 | return -1; |
4699 | 0 | } |
4700 | 0 | } |
4701 | 0 | } |
4702 | | |
4703 | 18.7M | return fp->line.l; |
4704 | 18.7M | } |
4705 | | |
4706 | | |
4707 | 0 | case fasta_format: |
4708 | 0 | case fastq_format: { |
4709 | 0 | fastq_state *x = (fastq_state *)fp->state; |
4710 | 0 | if (!x) { |
4711 | 0 | if (!(fp->state = fastq_state_init(fp->format.format |
4712 | 0 | == fastq_format ? '@' : '>'))) |
4713 | 0 | return -2; |
4714 | 0 | } |
4715 | | |
4716 | 0 | if (fastq_format1(fp->state, b, &fp->line) < 0) |
4717 | 0 | return -1; |
4718 | 0 | if (fp->is_bgzf) { |
4719 | 0 | if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) |
4720 | 0 | return -1; |
4721 | 0 | if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) |
4722 | 0 | return -1; |
4723 | 0 | } else { |
4724 | 0 | if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) |
4725 | 0 | return -1; |
4726 | 0 | } |
4727 | 0 | return fp->line.l; |
4728 | 0 | } |
4729 | | |
4730 | 0 | default: |
4731 | 0 | errno = EBADF; |
4732 | 0 | return -1; |
4733 | 56.2M | } |
4734 | 56.2M | } |
4735 | | |
4736 | | /************************ |
4737 | | *** Auxiliary fields *** |
4738 | | ************************/ |
4739 | | #ifndef HTS_LITTLE_ENDIAN |
4740 | | static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) { |
4741 | | int tsz = aux_type2size(type); |
4742 | | |
4743 | | if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1; |
4744 | | |
4745 | | switch (tsz) { |
4746 | | case 'H': case 'Z': case 1: // Trivial |
4747 | | memcpy(out, in, len); |
4748 | | break; |
4749 | | |
4750 | | #define aux_val_to_le(type_t, store_le) do { \ |
4751 | | type_t v; \ |
4752 | | size_t i; \ |
4753 | | for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \ |
4754 | | memcpy(&v, in + i, sizeof(type_t)); \ |
4755 | | store_le(v, out); \ |
4756 | | } \ |
4757 | | } while (0) |
4758 | | |
4759 | | case 2: aux_val_to_le(uint16_t, u16_to_le); break; |
4760 | | case 4: aux_val_to_le(uint32_t, u32_to_le); break; |
4761 | | case 8: aux_val_to_le(uint64_t, u64_to_le); break; |
4762 | | |
4763 | | #undef aux_val_to_le |
4764 | | |
4765 | | case 'B': { // Recurse! |
4766 | | uint32_t n; |
4767 | | if (len < 5) return -1; |
4768 | | memcpy(&n, in + 1, 4); |
4769 | | out[0] = in[0]; |
4770 | | u32_to_le(n, out + 1); |
4771 | | return aux_to_le(in[0], out + 5, in + 5, len - 5); |
4772 | | } |
4773 | | |
4774 | | default: // Unknown type code |
4775 | | return -1; |
4776 | | } |
4777 | | |
4778 | | |
4779 | | |
4780 | | return 0; |
4781 | | } |
4782 | | #endif |
4783 | | |
4784 | | int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data) |
4785 | 0 | { |
4786 | 0 | uint32_t new_len; |
4787 | |
|
4788 | 0 | assert(b->l_data >= 0); |
4789 | 0 | new_len = b->l_data + 3 + len; |
4790 | 0 | if (new_len > INT32_MAX || new_len < b->l_data) goto nomem; |
4791 | | |
4792 | 0 | if (realloc_bam_data(b, new_len) < 0) return -1; |
4793 | | |
4794 | 0 | b->data[b->l_data] = tag[0]; |
4795 | 0 | b->data[b->l_data + 1] = tag[1]; |
4796 | 0 | b->data[b->l_data + 2] = type; |
4797 | |
|
4798 | 0 | #ifdef HTS_LITTLE_ENDIAN |
4799 | 0 | memcpy(b->data + b->l_data + 3, data, len); |
4800 | | #else |
4801 | | if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) { |
4802 | | errno = EINVAL; |
4803 | | return -1; |
4804 | | } |
4805 | | #endif |
4806 | |
|
4807 | 0 | b->l_data = new_len; |
4808 | |
|
4809 | 0 | return 0; |
4810 | | |
4811 | 0 | nomem: |
4812 | 0 | errno = ENOMEM; |
4813 | 0 | return -1; |
4814 | 0 | } |
4815 | | |
4816 | | static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) |
4817 | 3.79M | { |
4818 | 3.79M | int size; |
4819 | 3.79M | uint32_t n; |
4820 | 3.79M | if (s >= end) return end; |
4821 | 3.79M | size = aux_type2size(*s); ++s; // skip type |
4822 | 3.79M | switch (size) { |
4823 | 134k | case 'Z': |
4824 | 163k | case 'H': |
4825 | 12.4M | while (s < end && *s) ++s; |
4826 | 163k | return s < end ? s + 1 : end; |
4827 | 45.8k | case 'B': |
4828 | 45.8k | if (end - s < 5) return NULL; |
4829 | 45.8k | size = aux_type2size(*s); ++s; |
4830 | 45.8k | n = le_to_u32(s); |
4831 | 45.8k | s += 4; |
4832 | 45.8k | if (size == 0 || end - s < size * n) return NULL; |
4833 | 45.8k | return s + size * n; |
4834 | 350 | case 0: |
4835 | 350 | return NULL; |
4836 | 3.58M | default: |
4837 | 3.58M | if (end - s < size) return NULL; |
4838 | 3.58M | return s + size; |
4839 | 3.79M | } |
4840 | 3.79M | } |
4841 | | |
4842 | | uint8_t *bam_aux_first(const bam1_t *b) |
|