Coverage Report

Created: 2026-02-11 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/sam.c
Line
Count
Source
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
#include <regex.h>
40
41
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
42
#include "fuzz_settings.h"
43
#endif
44
45
// Suppress deprecation message for cigar_tab, which we initialise
46
#include "htslib/hts_defs.h"
47
#undef HTS_DEPRECATED
48
#define HTS_DEPRECATED(message)
49
50
#include "htslib/sam.h"
51
#include "htslib/bgzf.h"
52
#include "cram/cram.h"
53
#include "hts_internal.h"
54
#include "sam_internal.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "htslib/hts_expr.h"
58
#include "header.h"
59
60
#include "htslib/khash.h"
61
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
62
KHASH_SET_INIT_INT(tag)
63
64
#ifndef EFTYPE
65
0
#define EFTYPE ENOEXEC
66
#endif
67
#ifndef EOVERFLOW
68
#define EOVERFLOW ERANGE
69
#endif
70
71
/**********************
72
 *** BAM header I/O ***
73
 **********************/
74
75
HTSLIB_EXPORT
76
const int8_t bam_cigar_table[256] = {
77
    // 0 .. 47
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
81
82
    // 48 .. 63  (including =)
83
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
84
85
    // 64 .. 79  (including MIDNHB)
86
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
87
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
88
89
    // 80 .. 95  (including SPX)
90
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
91
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
92
93
    // 96 .. 127
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96
97
    // 128 .. 255
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
105
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
106
};
107
108
sam_hdr_t *sam_hdr_init(void)
109
10.9k
{
110
10.9k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
111
10.9k
    if (bh == NULL) return NULL;
112
113
10.9k
    bh->cigar_tab = bam_cigar_table;
114
10.9k
    return bh;
115
10.9k
}
116
117
/*
 * Release a header.
 *
 * A NULL pointer is ignored.  If the header's ref_count is above zero the
 * count is decremented and nothing is freed; otherwise the target name and
 * length arrays, the text, the parsed records and the sdict hash are all
 * released, followed by the structure itself.
 */
void sam_hdr_destroy(sam_hdr_t *bh)
{
    int32_t i;

    if (bh == NULL) return;

    if (bh->ref_count > 0) {
        --bh->ref_count;
        return;
    }

    if (bh->target_name) {
        for (i = 0; i < bh->n_targets; ++i)
            free(bh->target_name[i]);
        free(bh->target_name);
    }
    // Freed independently of target_name so that a header abandoned after
    // allocating only one of the two arrays does not leak the other.
    free(bh->target_len);
    free(bh->text);
    if (bh->hrecs)
        sam_hrecs_free(bh->hrecs);
    if (bh->sdict)
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
    free(bh);
}
141
142
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
143
// references before sam_hdr_t::hrecs is populated
144
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
145
0
{
146
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
147
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
148
0
    int i;
149
0
    if (!dest_long_refs) return -1;
150
151
0
    for (i = 0; i < h->n_targets; i++) {
152
0
        int ret;
153
0
        khiter_t ksrc, kdest;
154
0
        if (h->target_len[i] < UINT32_MAX) continue;
155
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
156
0
        if (ksrc == kh_end(src_long_refs)) continue;
157
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
158
0
        if (ret < 0) {
159
0
            kh_destroy(s2i, dest_long_refs);
160
0
            return -1;
161
0
        }
162
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
163
0
    }
164
165
0
    h->sdict = dest_long_refs;
166
0
    return 0;
167
0
}
168
169
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
170
6.30k
{
171
6.30k
    if (h0 == NULL) return NULL;
172
6.30k
    sam_hdr_t *h;
173
6.30k
    if ((h = sam_hdr_init()) == NULL) return NULL;
174
    // copy the simple data
175
6.30k
    h->n_targets = 0;
176
6.30k
    h->ignore_sam_err = h0->ignore_sam_err;
177
6.30k
    h->l_text = 0;
178
179
    // Then the pointery stuff
180
181
6.30k
    if (!h0->hrecs) {
182
3
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
183
3
        if (!h->target_len) goto fail;
184
3
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
185
3
        if (!h->target_name) goto fail;
186
187
3
        int i;
188
3
        for (i = 0; i < h0->n_targets; ++i) {
189
0
            h->target_len[i] = h0->target_len[i];
190
0
            h->target_name[i] = strdup(h0->target_name[i]);
191
0
            if (!h->target_name[i]) break;
192
0
        }
193
3
        h->n_targets = i;
194
3
        if (i < h0->n_targets) goto fail;
195
196
3
        if (h0->sdict) {
197
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
198
0
        }
199
3
    }
200
201
6.30k
    if (h0->hrecs) {
202
6.30k
        kstring_t tmp = { 0, 0, NULL };
203
6.30k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
204
0
            free(ks_release(&tmp));
205
0
            goto fail;
206
0
        }
207
208
6.30k
        h->l_text = tmp.l;
209
6.30k
        h->text   = ks_release(&tmp);
210
211
6.30k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
212
0
            goto fail;
213
6.30k
    } else {
214
3
        h->l_text = h0->text ? h0->l_text : 0;
215
3
        h->text = malloc(h->l_text + 1);
216
3
        if (!h->text) goto fail;
217
3
        if (h0->text)
218
3
            memcpy(h->text, h0->text, h->l_text);
219
3
        h->text[h->l_text] = '\0';
220
3
    }
221
222
6.30k
    return h;
223
224
0
 fail:
225
0
    sam_hdr_destroy(h);
226
0
    return NULL;
227
6.30k
}
228
229
/*
 * Read a BAM header from a BGZF stream.
 *
 * Reads the "BAM\1" magic, the header text and the table of reference
 * names and lengths.  Returns a newly allocated header on success.  On any
 * failure an error is logged, whatever was built so far is released and
 * NULL is returned.
 */
sam_hdr_t *bam_hdr_read(BGZF *fp)
{
    sam_hdr_t *hdr;
    uint8_t word[4];
    int magic_len, eof_state;
    int32_t idx, name_len, names_allocated = 0;
    size_t text_alloc;
    ssize_t got;

    // A missing or unreadable EOF marker only produces a message
    eof_state = bgzf_check_EOF(fp);
    if (eof_state < 0) {
        perror("[W::bam_hdr_read] bgzf_check_EOF");
    } else if (eof_state == 0) {
        hts_log_warning("EOF marker is absent. The input is probably truncated");
    }

    // Magic number
    magic_len = bgzf_read(fp, word, 4);
    if (magic_len != 4 || memcmp(word, "BAM\1", 4) != 0) {
        hts_log_error("Invalid BAM binary header");
        return NULL;
    }

    hdr = sam_hdr_init();
    if (hdr == NULL) goto nomem;

    // Length of the plain text, then the text itself
    got = bgzf_read(fp, word, 4);
    if (got != 4) goto read_err;
    hdr->l_text = le_to_u32(word);

    text_alloc = hdr->l_text + 1;
    if (text_alloc < hdr->l_text) goto nomem; // adding 1 wrapped around
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (text_alloc > FUZZ_ALLOC_LIMIT) goto nomem;
#endif
    hdr->text = (char *) malloc(text_alloc);
    if (hdr->text == NULL) goto nomem;
    hdr->text[hdr->l_text] = 0; // guarantee NUL termination
    got = bgzf_read(fp, hdr->text, hdr->l_text);
    if (got != hdr->l_text) goto read_err;

    // Number of reference sequences
    got = bgzf_read(fp, &hdr->n_targets, 4);
    if (got != 4) goto read_err;
    if (fp->is_be) ed_swap_4p(&hdr->n_targets);

    if (hdr->n_targets < 0) goto invalid;

    // Reference sequence names and lengths
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (hdr->n_targets > (FUZZ_ALLOC_LIMIT - text_alloc)/(sizeof(char*)+sizeof(uint32_t)))
        goto nomem;
#endif
    if (hdr->n_targets > 0) {
        hdr->target_name = (char **) calloc(hdr->n_targets, sizeof(char *));
        if (hdr->target_name == NULL) goto nomem;
        hdr->target_len = (uint32_t *) calloc(hdr->n_targets, sizeof(uint32_t));
        if (hdr->target_len == NULL) goto nomem;
    } else {
        hdr->target_name = NULL;
        hdr->target_len = NULL;
    }

    for (idx = 0; idx != hdr->n_targets; ++idx) {
        got = bgzf_read(fp, &name_len, 4);
        if (got != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&name_len);
        if (name_len <= 0) goto invalid;

        hdr->target_name[idx] = (char *) malloc(name_len);
        if (hdr->target_name[idx] == NULL) goto nomem;
        names_allocated++;

        got = bgzf_read(fp, hdr->target_name[idx], name_len);
        if (got != name_len) goto read_err;

        if (hdr->target_name[idx][name_len - 1] != '\0') {
            /* The stored name has no NUL terminator: append one rather
               than rejecting the file. */
            char *longer;
            if (name_len == INT32_MAX) goto invalid;
            longer = realloc(hdr->target_name[idx], name_len + 1);
            if (longer == NULL) goto nomem;
            longer[name_len] = '\0';
            hdr->target_name[idx] = longer;
        }

        got = bgzf_read(fp, &hdr->target_len[idx], 4);
        if (got != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&hdr->target_len[idx]);
    }
    return hdr;

 nomem:
    hts_log_error("Out of memory");
    goto clean;

 read_err:
    if (got < 0) {
        hts_log_error("Error reading BGZF stream");
    } else {
        hts_log_error("Truncated BAM header");
    }
    goto clean;

 invalid:
    hts_log_error("Invalid BAM binary header");

 clean:
    if (hdr != NULL) {
        // Only the names allocated so far may be freed
        hdr->n_targets = names_allocated;
        sam_hdr_destroy(hdr);
    }
    return NULL;
}
343
344
/*
 * Write a BAM header to a BGZF stream.
 *
 * The text written is the output of sam_hrecs_rebuild_text() when the
 * header has parsed records (h->hrecs), otherwise h->text.  It is followed
 * by the number of targets and each target's name (including its NUL) and
 * length.  All 32-bit values are byte-swapped first when fp->is_be is set.
 * The stream is flushed at the end.
 *
 * Returns 0 on success, -1 on failure (including h == NULL or header text
 * longer than UINT32_MAX bytes).
 */
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
{
    kstring_t hdr_ks = { 0, 0, NULL };
    const char *text;
    uint32_t l_text, word;
    int32_t i, name_len;
    int ret = -1;

    if (!h) return -1;

    if (h->hrecs) {
        // On failure the kstring may already own memory; the common exit
        // path below releases it.
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) goto done;
        if (hdr_ks.l > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            goto done;
        } else if (hdr_ks.l > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = hdr_ks.s;
        l_text = hdr_ks.l;
    } else {
        if (h->l_text > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            goto done;
        } else if (h->l_text > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = h->text;
        l_text = h->l_text;
    }

    // write "BAM1"
    if (bgzf_write(fp, "BAM\1", 4) < 0) goto done;

    // write plain text and the number of reference sequences
    word = fp->is_be ? ed_swap_4(l_text) : l_text;
    if (bgzf_write(fp, &word, 4) < 0) goto done;
    if (l_text > 0 && bgzf_write(fp, text, l_text) < 0) goto done;
    word = fp->is_be ? ed_swap_4(h->n_targets) : (uint32_t) h->n_targets;
    if (bgzf_write(fp, &word, 4) < 0) goto done;

    // write sequence names and lengths; "<" rather than "!=" so that a
    // negative n_targets cannot run off the arrays
    for (i = 0; i < h->n_targets; ++i) {
        const char *name = h->target_name[i];

        name_len = strlen(name) + 1;
        word = fp->is_be ? ed_swap_4(name_len) : (uint32_t) name_len;
        if (bgzf_write(fp, &word, 4) < 0) goto done;
        if (bgzf_write(fp, name, name_len) < 0) goto done;

        word = fp->is_be ? ed_swap_4(h->target_len[i]) : h->target_len[i];
        if (bgzf_write(fp, &word, 4) < 0) goto done;
    }

    if (bgzf_flush(fp) < 0) goto done;
    ret = 0;

 done:
    free(hdr_ks.s);
    return ret;
}
416
417
/*
 * Parse the region string s by delegating to hts_parse_region(), using
 * bam_name2id() with header h as the name-to-id callback.  The return
 * value and the tid/beg/end outputs are exactly those of
 * hts_parse_region().
 */
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
                             hts_pos_t *beg, hts_pos_t *end, int flags)
{
    hts_name2id_f name_to_id = (hts_name2id_f) bam_name2id;

    return hts_parse_region(s, tid, beg, end, name_to_id, h, flags);
}
421
422
/*************************
423
 *** BAM alignment I/O ***
424
 *************************/
425
426
bam1_t *bam_init1(void)
427
632k
{
428
632k
    return (bam1_t*)calloc(1, sizeof(bam1_t));
429
632k
}
430
431
/*
 * Give b a data buffer able to hold at least `desired` bytes.
 *
 * The new capacity is `desired` rounded up by kroundup32() plus 32.  When
 * the record's memory policy has BAM_USER_OWNS_DATA set, a fresh buffer is
 * malloc'd, the existing contents are copied across and the flag is
 * cleared; otherwise b->data is realloc'd.
 *
 * Returns 0 on success and -1 on failure.  errno is set to ENOMEM here
 * when the capacity cannot be represented in 32 bits.
 */
int sam_realloc_bam_data(bam1_t *b, size_t desired)
{
    uint32_t capacity = desired;   // may truncate; detected just below
    uint8_t *buffer;

    kroundup32(capacity); // next power of 2
    capacity += 32; // reduces malloc arena migrations?
    if (capacity < desired) {
        errno = ENOMEM; // Not strictly true but we can't store the size
        return -1;
    }
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (capacity > FUZZ_ALLOC_LIMIT) {
        errno = ENOMEM;
        return -1;
    }
#endif

    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0) {
        // The caller's buffer must not be realloc'd: move to our own
        buffer = malloc(capacity);
        if (buffer != NULL) {
            if (b->l_data > 0)
                memcpy(buffer, b->data,
                       b->l_data < b->m_data ? b->l_data : b->m_data);
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
        }
    } else {
        buffer = realloc(b->data, capacity);
    }

    if (buffer == NULL)
        return -1;

    b->data = buffer;
    b->m_data = capacity;
    return 0;
}
463
464
/*
 * Release an alignment record according to its memory policy.
 *
 * b->data is freed unless BAM_USER_OWNS_DATA is set, and the structure
 * itself is freed unless BAM_USER_OWNS_STRUCT is set.  When the data is
 * freed but the structure is kept, the data fields are reset so the
 * structure can be reused.  A NULL pointer is ignored.
 */
void bam_destroy1(bam1_t *b)
{
    int keep_data, keep_struct;

    if (b == NULL)
        return;

    keep_data   = (bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0;
    keep_struct = (bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0;

    if (!keep_data) {
        free(b->data);
        if (keep_struct) {
            // In case of reuse
            b->data = NULL;
            b->m_data = 0;
            b->l_data = 0;
        }
    }

    if (!keep_struct)
        free(b);
}
480
481
/*
 * Copy the alignment record bsrc into bdst.
 *
 * bdst's data buffer is grown via realloc_bam_data() if needed, then the
 * variable-length data, the core fields, l_data and id are copied.
 *
 * Returns bdst on success, or NULL if the buffer could not be grown.
 */
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
{
    // Copying a record onto itself is a no-op; without this guard the
    // memcpy() calls below would be given identical (overlapping) ranges.
    if (bdst == bsrc) return bdst;

    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;

    // An empty record may have a NULL data pointer, which must not be
    // handed to memcpy() even with a length of zero.
    if (bsrc->l_data > 0)
        memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
    bdst->l_data = bsrc->l_data;
    bdst->id = bsrc->id;
    return bdst;
}
490
491
/*
 * Create a newly allocated copy of bsrc.
 * Returns NULL if bsrc is NULL or if allocation or copying fails.
 */
bam1_t *bam_dup1(const bam1_t *bsrc)
{
    bam1_t *copy;

    if (!bsrc)
        return NULL;

    copy = bam_init1();
    if (!copy)
        return NULL;

    if (!bam_copy1(copy, bsrc)) {
        bam_destroy1(copy);
        copy = NULL;
    }

    return copy;
}
502
503
/*
 * Sum the operation lengths of a CIGAR in a single pass.
 *
 * An operation's length is added to *qlen when bit 0 of its
 * bam_cigar_type() is set and to *rlen when bit 1 is set.  Both outputs
 * are reset to zero first.
 */
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
                             hts_pos_t *rlen, hts_pos_t *qlen)
{
    int idx;

    *rlen = *qlen = 0;
    for (idx = 0; idx < n_cigar; idx++) {
        const uint32_t elem = cigar[idx];
        const int consumes = bam_cigar_type(bam_cigar_op(elem));
        const int span = bam_cigar_oplen(elem);

        if (consumes & 1)
            *qlen += span;
        if (consumes & 2)
            *rlen += span;
    }
}
515
516
/*
 * Deduct `length` from the running budget *limit.
 *
 * Returns 0 after subtracting when the budget is large enough, or -1
 * (leaving *limit untouched) when the subtraction would underflow.
 */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
525
526
/*
 * Fill an alignment record from individual field values.
 *
 * The query name, CIGAR, packed sequence and qualities are written to
 * bam->data in that order, and the core fields are set from the arguments.
 * Room for l_aux further bytes is reserved in the buffer, but they are not
 * counted in bam->l_data.  A zero-length qname is stored as "*", and a
 * NULL qual is stored as l_seq bytes of 0xff.
 *
 * Returns the number of data bytes written (the new bam->l_data), or -1 on
 * error.  errno is set to EINVAL when an argument is rejected.
 */
int bam_set1(bam1_t *bam,
             size_t l_qname, const char *qname,
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
             size_t n_cigar, const uint32_t *cigar,
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
             size_t l_seq, const char *seq, const char *qual,
             size_t l_aux)
{
    // A missing query name is stored as "*"
    if (l_qname == 0) {
        l_qname = 1;
        qname = "*";
    }

    // Between one and four NULs follow the name, making the stored name
    // length a multiple of four (see the bam1_t documentation).
    size_t qname_nuls = 4 - l_qname % 4;

    // The alignment length, needed for bam_reg2bin(), is worked out in the
    // same way as bam_endpos(); that function cannot be called yet because
    // the record is not set up.
    hts_pos_t rlen = 0, qlen = 0;
    if (!(flag & BAM_FUNMAP))
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
    if (rlen == 0)
        rlen = 1;

    // Validate the arguments
    if (l_qname > 254) {
        hts_log_error("Query name too long");
        errno = EINVAL;
        return -1;
    }
    if (HTS_POS_MAX - rlen <= pos) {
        hts_log_error("Read ends beyond highest supported position");
        errno = EINVAL;
        return -1;
    }
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
        hts_log_error("Mapped query must have a CIGAR");
        errno = EINVAL;
        return -1;
    }
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
        hts_log_error("CIGAR and query sequence are of different length");
        errno = EINVAL;
        return -1;
    }

    // The total record size must fit within INT32_MAX; every component is
    // charged against the same budget.
    size_t limit = INT32_MAX;
    int overflow = 0;
    overflow += subtract_check_underflow(l_qname + qname_nuls, &limit);
    overflow += subtract_check_underflow(n_cigar * 4, &limit);
    overflow += subtract_check_underflow((l_seq + 1) / 2, &limit);
    overflow += subtract_check_underflow(l_seq, &limit);
    overflow += subtract_check_underflow(l_aux, &limit);
    if (overflow != 0) {
        hts_log_error("Size overflow");
        errno = EINVAL;
        return -1;
    }

    // Grow the data buffer if necessary
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
    if (realloc_bam_data(bam, data_len + l_aux) < 0)
        return -1;

    bam->l_data = (int)data_len;
    bam->core.tid = tid;
    bam->core.pos = pos;
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
    bam->core.qual = mapq;
    bam->core.flag = flag;
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
    bam->core.n_cigar = (uint32_t)n_cigar;
    bam->core.l_qseq = (int32_t)l_seq;
    bam->core.mtid = mtid;
    bam->core.mpos = mpos;
    bam->core.isize = isize;

    // Query name followed by its NUL padding
    uint8_t *cp = bam->data;
    strncpy((char *)cp, qname, l_qname);
    memset(cp + l_qname, '\0', qname_nuls);
    cp += l_qname + qname_nuls;

    // CIGAR
    if (n_cigar > 0)
        memcpy(cp, cigar, n_cigar * 4);
    cp += n_cigar * 4;

    // Sequence, two bases per byte: first in blocks of NN bases, then the
    // remaining pairs, then a final odd base in the high nibble.
#define NN 16
    const uint8_t *useq = (uint8_t *)seq;
    int i;
    for (i = 0; i + NN < l_seq; i += NN) {
        const uint8_t *block = useq + i;
        int j;
        for (j = 0; j < NN/2; j++)
            cp[j] = (seq_nt16_table[block[j*2]]<<4) | seq_nt16_table[block[j*2+1]];
        cp += NN/2;
    }
    for (; i + 1 < l_seq; i += 2)
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];

    for (; i < l_seq; i++)
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;

    // Qualities, or 0xff filler when none were supplied
    if (qual)
        memcpy(cp, qual, l_seq);
    else
        memset(cp, '\xff', l_seq);

    return (int)data_len;
}
647
648
/*
 * Return the sum of the lengths of the CIGAR operations whose
 * bam_cigar_type() has bit 0 set.
 */
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; idx++) {
        if (bam_cigar_type(bam_cigar_op(cigar[idx])) & 1)
            total += bam_cigar_oplen(cigar[idx]);
    }

    return total;
}
657
658
/*
 * Return the sum of the lengths of the CIGAR operations whose
 * bam_cigar_type() has bit 1 set.
 */
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; idx++) {
        if (bam_cigar_type(bam_cigar_op(cigar[idx])) & 2)
            total += bam_cigar_oplen(cigar[idx]);
    }

    return total;
}
667
668
hts_pos_t bam_endpos(const bam1_t *b)
669
1.02k
{
670
1.02k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
671
1.02k
    if (rlen == 0) rlen = 1;
672
1.02k
    return b->core.pos + rlen;
673
1.02k
}
674
675
/*
 * Replace a placeholder CIGAR with the real one held in a CG:B,I aux tag.
 *
 * The record is a candidate when its first CIGAR element is a soft clip of
 * l_qseq bases, it has a non-negative tid and pos, and it carries a CG tag
 * of type B with subtype 'I' or 'i' holding at least n_cigar (and fewer
 * than 2^29) elements.  The tag's array is then moved into the CIGAR slot
 * and the tag is removed.
 *
 * recal_bin:    if non-zero, recompute core.bin after the change.
 * give_warning: if non-zero, log a warning naming the record.
 *
 * Returns 0 if the CIGAR is untouched, 1 if it was replaced from CG, and
 * -1 on bad aux data or if the data buffer could not be enlarged.
 */
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning)
{
    bam1_core_t *c = &b->core;

    // Bail out as fast as possible for the easy case.  The shift is done
    // on an unsigned value so that a large l_qseq cannot overflow a signed
    // int.
    uint32_t test_CG = BAM_CSOFT_CLIP | ((uint32_t) c->l_qseq << BAM_CIGAR_SHIFT);
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
        return 0;

    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
    // but this is much less likely so do as a secondary check.
    if (c->tid < 0 || c->pos < 0)
        return 0;

    // Do we have a CG tag?  errno is captured before the lookup so that
    // the expected "no such tag" outcome can put the caller's value back.
    int saved_errno = errno;
    uint8_t *CG = bam_aux_get(b, "CG");
    if (!CG) {
        if (errno != ENOENT) return -1;  // Bad aux data
        errno = saved_errno; // restore errno on expected no-CG-tag case
        return 0;
    }

    // Now we start with the serious work migrating CG to CIGAR
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
        *cigar0, CG_len, fake_bytes;
    cigar0 = bam_get_cigar(b);
    fake_bytes = c->n_cigar * 4;
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
        return 0; // not of type B,I
    CG_len = le_to_u32(CG + 2);
    // don't move if the real CIGAR length is shorter than the fake cigar length
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;

    // move from the CG tag to the right position
    cigar_st = (uint8_t*)cigar0 - b->data;
    c->n_cigar = CG_len;
    n_cigar4 = c->n_cigar * 4;
    CG_st = CG - b->data - 2;       // offset of the two-byte tag name
    CG_en = CG_st + 8 + n_cigar4;   // offset just past the tag's array
    // we need n_cigar4 - fake_bytes extra bytes to swap CIGAR to the right place
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
    b->l_data = b->l_data - fake_bytes + n_cigar4;
    // open a gap of n_cigar4 - fake_bytes bytes after the fake CIGAR
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
    // copy the real CIGAR to the right place; the tag has shifted by the
    // size of the gap just opened
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
    if (ori_len > CG_en) // move data after the CG tag
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
    if (recal_bin)
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
    if (give_warning)
        hts_log_warning("%s encodes a CIGAR with %" PRIu32 " operators at the CG tag", bam_get_qname(b), c->n_cigar);
    return 1;
}
731
732
/*
 * Map an aux type code to a size.
 *
 * Returns 1, 2, 4 or 8 for the fixed-width codes handled below, the type
 * code itself for 'Z', 'H' and 'B', and 0 for any other code.
 */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'A' || type == 'c' || type == 'C')
        return 1;
    if (type == 's' || type == 'S')
        return 2;
    if (type == 'i' || type == 'I' || type == 'f')
        return 4;
    if (type == 'd')
        return 8;
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;

    return 0;
}
749
750
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
751
0
{
752
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
753
0
    uint32_t i;
754
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
755
0
}
756
757
// Fix bad records where qname is not terminated correctly.
758
123
static int fixup_missing_qname_nul(bam1_t *b) {
759
123
    bam1_core_t *c = &b->core;
760
761
    // Note this is called before c->l_extranul is added to c->l_qname
762
123
    if (c->l_extranul > 0) {
763
112
        b->data[c->l_qname++] = '\0';
764
112
        c->l_extranul--;
765
112
    } else {
766
11
        if (b->l_data > INT_MAX - 4) return -1;
767
11
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
768
11
        b->l_data += 4;
769
11
        b->data[c->l_qname++] = '\0';
770
11
        c->l_extranul = 3;
771
11
    }
772
123
    return 0;
773
123
}
774
775
/*
776
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
777
 * in multi-threaded handling.  This may be worth considering for htslib2.
778
 */
779
int bam_read1(BGZF *fp, bam1_t *b)
780
193
{
781
193
    bam1_core_t *c = &b->core;
782
193
    int32_t block_len, ret, i;
783
193
    uint32_t new_l_data;
784
193
    uint8_t tmp[32], *x;
785
786
193
    b->l_data = 0;
787
788
193
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
789
0
        if (ret == 0) return -1; // normal end-of-file
790
0
        else return -2; // truncated
791
0
    }
792
193
    if (fp->is_be)
793
0
        ed_swap_4p(&block_len);
794
193
    if (block_len < 32) return -4;  // block_len includes core data
795
189
    if (fp->block_length - fp->block_offset > 32) {
796
        // Avoid bgzf_read and a temporary copy to a local buffer
797
189
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
798
189
        fp->block_offset += 32;
799
189
    } else {
800
0
        x = tmp;
801
0
        if (bgzf_read(fp, x, 32) != 32) return -3;
802
0
    }
803
804
189
    c->tid        = le_to_u32(x);
805
189
    c->pos        = le_to_i32(x+4);
806
189
    uint32_t x2   = le_to_u32(x+8);
807
189
    c->bin        = x2>>16;
808
189
    c->qual       = x2>>8&0xff;
809
189
    c->l_qname    = x2&0xff;
810
189
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
811
189
    uint32_t x3   = le_to_u32(x+12);
812
189
    c->flag       = x3>>16;
813
189
    c->n_cigar    = x3&0xffff;
814
189
    c->l_qseq     = le_to_u32(x+16);
815
189
    c->mtid       = le_to_u32(x+20);
816
189
    c->mpos       = le_to_i32(x+24);
817
189
    c->isize      = le_to_i32(x+28);
818
819
189
    new_l_data = block_len - 32 + c->l_extranul;
820
189
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
821
188
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
822
188
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
823
8
        return -4;
824
180
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
825
180
    b->l_data = new_l_data;
826
827
180
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
828
178
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
829
123
        if (fixup_missing_qname_nul(b) < 0) return -4;
830
123
    }
831
316
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
832
178
    c->l_qname += c->l_extranul;
833
178
    if (b->l_data < c->l_qname ||
834
178
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
835
5
        return -4;
836
173
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
837
173
    if (bam_tag2cigar(b, 0, 0) < 0)
838
0
        return -4;
839
840
    // TODO: consider making this conditional
841
173
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
842
160
        hts_pos_t rlen, qlen;
843
160
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
844
160
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
845
160
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
846
        // Sanity check for broken CIGAR alignments
847
160
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
848
6
            hts_log_error("CIGAR and query sequence lengths differ for %s",
849
6
                    bam_get_qname(b));
850
6
            return -4;
851
6
        }
852
160
    }
853
854
167
    return 4 + block_len;
855
173
}
856
857
/*
 * Write one alignment record to a BAM stream.
 *
 * The record is emitted as a 4-byte block length, the 32-byte fixed core,
 * the query name (without the in-memory padding NULs) and the remaining
 * variable-length data.  Records with more than 65535 CIGAR operations
 * are written with a two-operation placeholder CIGAR (<l_qseq>S<reflen>N)
 * and the real CIGAR appended as a CG:B,I tag.
 *
 * All validation is done before any byte is written, so a rejected record
 * leaves the output stream untouched and b->data in host byte order.
 *
 * Returns 4 + block_len (the number of bytes making up the record) on
 * success, or -1 on failure.  errno is set to EOVERFLOW when the query
 * name is too long.
 */
int bam_write1(BGZF *fp, const bam1_t *b)
{
    const bam1_core_t *c = &b->core;
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
    const int long_cigar = c->n_cigar > 0xffff;
    hts_pos_t cigreflen = 0;
    int i, ok;

    if (c->l_qname - c->l_extranul > 255) {
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
        errno = EOVERFLOW;
        return -1;
    }
    if (long_cigar) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
    if (c->pos > INT_MAX ||
        c->mpos > INT_MAX ||
        c->isize < INT_MIN || c->isize > INT_MAX) {
        hts_log_error("Positional data is too large for BAM format");
        return -1;
    }
    if (long_cigar) {
        // Checked up front: the placeholder CIGAR stores the reference
        // length in a single operation, and failing after the header has
        // been written would leave a truncated record in the stream.
        cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
        if (cigreflen >= (1<<28)) {
            // Length of reference covered is greater than the biggest
            // CIGAR operation currently allowed.
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
                          bam_get_qname(b), c->n_cigar, cigreflen);
            return -1;
        }
    }

    // Fixed-size core fields, in on-disk order
    x[0] = c->tid;
    x[1] = c->pos;
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
    if (long_cigar) x[3] = (uint32_t)c->flag << 16 | 2;
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
    x[4] = c->l_qseq;
    x[5] = c->mtid;
    x[6] = c->mpos;
    x[7] = c->isize;

    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
    if (fp->is_be) {
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
        y = block_len;
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
        // Swap the record data in place for output; undone before returning
        swap_data(c, b->l_data, b->data, 1);
    } else {
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
    }
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
    if (!long_cigar) { // no long CIGAR; write normally
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
        uint8_t buf[8];
        uint32_t cigar_st, cigar_en, cigar[2];
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
        cigar_en = cigar_st + c->n_cigar * 4;
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
        u32_to_le(cigar[0], buf);
        u32_to_le(cigar[1], buf + 4);
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
        u32_to_le(c->n_cigar, buf);
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
    }
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    return ok? 4 + block_len : -1;
}
924
925
/*
926
 * Write a BAM file and append to the in-memory index simultaneously.
927
 */
928
2.68M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
929
2.68M
    BGZF *bfp = fp->fp.bgzf;
930
931
2.68M
    if (!fp->idx)
932
2.68M
        return bam_write1(bfp, b);
933
934
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
935
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
936
0
        return -1;
937
0
    if (!bfp->mt)
938
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
939
940
0
    int ret = bam_write1(bfp, b);
941
0
    if (ret < 0)
942
0
        return -1;
943
944
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
945
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
946
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
947
0
        ret = -1;
948
0
    }
949
950
0
    return ret;
951
0
}
952
953
/*
954
 * Set the qname in a BAM record
955
 */
956
int bam_set_qname(bam1_t *rec, const char *qname)
957
0
{
958
0
    if (!rec) return -1;
959
0
    if (!qname || !*qname) return -1;
960
961
0
    size_t old_len = rec->core.l_qname;
962
0
    size_t new_len = strlen(qname) + 1;
963
0
    if (new_len < 1 || new_len > 255) return -1;
964
965
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
966
967
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
968
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
969
970
    // Make room
971
0
    if (new_len + extranul != rec->core.l_qname)
972
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
973
    // Copy in new name and pad if needed
974
0
    memcpy(rec->data, qname, new_len);
975
0
    int n;
976
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
977
978
0
    rec->l_data = new_data_len;
979
0
    rec->core.l_qname = new_len + extranul;
980
0
    rec->core.l_extranul = extranul;
981
982
0
    return 0;
983
0
}
984
985
/********************
986
 *** BAM indexing ***
987
 ********************/
988
989
/*
 * Build an in-memory index by reading every record from fp.
 *
 * The header is read from fp first.  With min_shift > 0 a CSI index is
 * built, with its settings adjusted to the longest reference in the
 * header; otherwise a BAI index (min_shift 14, 5 levels) is built.
 *
 * Returns the finished index, or NULL on failure (header unreadable,
 * allocation failure, a record that cannot be indexed, a read error, or
 * failure to finish the index).  All temporaries, including the header,
 * are released on every path.
 */
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
{
    int n_lvls, i, fmt, ret;
    bam1_t *b = NULL;
    hts_idx_t *idx = NULL;
    sam_hdr_t *h;

    h = sam_hdr_read(fp);
    if (h == NULL) return NULL;

    if (min_shift > 0) {
        hts_pos_t max_len = 0;
        for (i = 0; i < h->n_targets; ++i) {
            hts_pos_t len = sam_hdr_tid2len(h, i);
            if (max_len < len) max_len = len;
        }
        n_lvls = 0;
        hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
        fmt = HTS_FMT_CSI;
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;

    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
    if (idx == NULL) goto err;
    b = bam_init1();
    if (b == NULL) goto err;

    while ((ret = sam_read1(fp, h, b)) >= 0) {
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
        if (ret < 0) { // unsorted or doesn't fit
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
            goto err;
        }
    }
    if (ret < -1) goto err; // corrupted BAM file

    if (hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)) < 0) goto err;
    sam_hdr_destroy(h);
    bam_destroy1(b);
    return idx;

err:
    if (b) bam_destroy1(b);
    if (idx) hts_idx_destroy(idx);
    sam_hdr_destroy(h);  // h is known to be non-NULL here
    return NULL;
}
1028
1029
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1030
0
{
1031
0
    hts_idx_t *idx;
1032
0
    htsFile *fp;
1033
0
    int ret = 0;
1034
1035
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1036
0
    if (nthreads)
1037
0
        hts_set_threads(fp, nthreads);
1038
1039
0
    switch (fp->format.format) {
1040
0
    case cram:
1041
1042
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1043
0
        break;
1044
1045
0
    case bam:
1046
0
    case sam:
1047
0
        if (fp->format.compression != bgzf) {
1048
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1049
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1050
0
            ret = -1;
1051
0
            break;
1052
0
        }
1053
0
        idx = sam_index(fp, min_shift);
1054
0
        if (idx) {
1055
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1056
0
            if (ret < 0) ret = -4;
1057
0
            hts_idx_destroy(idx);
1058
0
        }
1059
0
        else ret = -1;
1060
0
        break;
1061
1062
0
    default:
1063
0
        ret = -3;
1064
0
        break;
1065
0
    }
1066
0
    hts_close(fp);
1067
1068
0
    return ret;
1069
0
}
1070
1071
// Same as sam_index_build3() with a thread count of 0, which makes
// sam_index_build3() skip its hts_set_threads() call.
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int nthreads = 0;
    return sam_index_build3(fn, fnidx, min_shift, nthreads);
}
1075
1076
// Same as sam_index_build3() with no explicit index file name (NULL) and
// a thread count of 0.
int sam_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;
    const int nthreads = 0;
    return sam_index_build3(fn, fnidx, min_shift, nthreads);
}
1080
1081
// Real bam_index_build() symbol, kept so that binaries linked against
// earlier HTSlib releases still resolve it.  Any macro of the same name
// is removed first so that the function itself can be defined.
#undef bam_index_build
int bam_index_build(const char *fn, int min_shift)
{
    const char *fnidx = NULL;
    return sam_index_build2(fn, fnidx, min_shift);
}
1087
1088
// Initialise fp->idx for the current format type.
1089
// This must be called after the header has been written but no other data.
1090
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1091
0
    fp->fnidx = fnidx;
1092
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1093
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1094
0
        int n_lvls, fmt = HTS_FMT_CSI;
1095
0
        if (min_shift > 0) {
1096
0
            int64_t max_len = 0;
1097
0
            int i;
1098
0
            for (i = 0; i < h->n_targets; ++i)
1099
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1100
0
            n_lvls = 0;
1101
0
            hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1102
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1103
1104
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1105
0
        return fp->idx ? 0 : -1;
1106
0
    }
1107
1108
0
    if (fp->format.format == cram) {
1109
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1110
0
        return fp->fp.cram->idxfp ? 0 : -1;
1111
0
    }
1112
1113
0
    return -1;
1114
0
}
1115
1116
// Finishes an index. Call after the last record has been written.
1117
// Returns 0 on success, <0 on failure.
1118
0
int sam_idx_save(htsFile *fp) {
1119
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1120
0
        fp->format.format == vcf || fp->format.format == sam) {
1121
0
        int ret;
1122
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1123
0
            errno = -ret;
1124
0
            return -1;
1125
0
        }
1126
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1127
0
            return -1;
1128
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1129
1130
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1131
0
            return -1;
1132
1133
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1134
1135
0
    } else if (fp->format.format == cram) {
1136
        // flushed and closed by cram_close
1137
0
    }
1138
1139
0
    return 0;
1140
0
}
1141
1142
// Record reader taking the generic (BGZF, file, record, tid, beg, end)
// argument list: reads the next record from the htsFile passed as fpv
// into the bam1_t passed as bv, and on success reports its reference id,
// start position and end position.  The BGZF argument is unused.
// Returns the sam_read1() result; tid/beg/end are only written when that
// result is non-negative.
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = (htsFile *)fpv;
    bam1_t *rec = bv;

    file->line.l = 0;

    const int status = sam_read1(file, file->bam_header, rec);
    if (status < 0)
        return status;

    *tid = rec->core.tid;
    *beg = rec->core.pos;
    *end = bam_endpos(rec);
    return status;
}
1155
1156
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1157
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1158
0
{
1159
0
    htsFile *fp = (htsFile *)fpv;
1160
0
    bam1_t *b = bv;
1161
0
    fp->line.l = 0;
1162
0
    int ret = sam_read1(fp, fp->bam_header, b);
1163
0
    return ret;
1164
0
}
1165
1166
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1167
// samtools/bam.c.
1168
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1169
0
{
1170
0
    const char *rg;
1171
0
    kstring_t lib = { 0, 0, NULL };
1172
0
    rg = (char *)bam_aux_get(b, "RG");
1173
1174
0
    if (!rg)
1175
0
        return NULL;
1176
0
    else
1177
0
        rg++;
1178
1179
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1180
0
        return NULL;
1181
1182
0
    static char LB_text[1024];
1183
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1184
1185
0
    memcpy(LB_text, lib.s, len);
1186
0
    LB_text[len] = 0;
1187
1188
0
    free(lib.s);
1189
1190
0
    return LB_text;
1191
0
}
1192
1193
1194
// Bam record pointer and SAM header combined
1195
typedef struct {
1196
    const sam_hdr_t *h;
1197
    const bam1_t *b;
1198
} hb_pair;
1199
1200
// Looks up variable names in str and replaces them with their value.
1201
// Also supports aux tags.
1202
//
1203
// Note the expression parser deliberately overallocates str size so it
1204
// is safe to use memcmp over strcmp.
1205
static int bam_sym_lookup(void *data, char *str, char **end,
1206
0
                          hts_expr_val_t *res) {
1207
0
    hb_pair *hb = (hb_pair *)data;
1208
0
    const bam1_t *b = hb->b;
1209
1210
0
    res->is_str = 0;
1211
0
    switch(*str) {
1212
0
    case 'c':
1213
0
        if (memcmp(str, "cigar", 5) == 0) {
1214
0
            *end = str+5;
1215
0
            res->is_str = 1;
1216
0
            ks_clear(&res->s);
1217
0
            uint32_t *cigar = bam_get_cigar(b);
1218
0
            int i, n = b->core.n_cigar, r = 0;
1219
0
            if (n) {
1220
0
                for (i = 0; i < n; i++) {
1221
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1222
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1223
0
                }
1224
0
                r |= kputs("", &res->s) < 0;
1225
0
            } else {
1226
0
                r |= kputs("*", &res->s) < 0;
1227
0
            }
1228
0
            return r ? -1 : 0;
1229
0
        }
1230
0
        break;
1231
1232
0
    case 'e':
1233
0
        if (memcmp(str, "endpos", 6) == 0) {
1234
0
            *end = str+6;
1235
0
            res->d = bam_endpos(b);
1236
0
            return 0;
1237
0
        }
1238
0
        break;
1239
1240
0
    case 'f':
1241
0
        if (memcmp(str, "flag", 4) == 0) {
1242
0
            str = *end = str+4;
1243
0
            if (*str != '.') {
1244
0
                res->d = b->core.flag;
1245
0
                return 0;
1246
0
            } else {
1247
0
                str++;
1248
0
                if (!memcmp(str, "paired", 6)) {
1249
0
                    *end = str+6;
1250
0
                    res->d = b->core.flag & BAM_FPAIRED;
1251
0
                    return 0;
1252
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1253
0
                    *end = str+11;
1254
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1255
0
                    return 0;
1256
0
                } else if (!memcmp(str, "unmap", 5)) {
1257
0
                    *end = str+5;
1258
0
                    res->d = b->core.flag & BAM_FUNMAP;
1259
0
                    return 0;
1260
0
                } else if (!memcmp(str, "munmap", 6)) {
1261
0
                    *end = str+6;
1262
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1263
0
                    return 0;
1264
0
                } else if (!memcmp(str, "reverse", 7)) {
1265
0
                    *end = str+7;
1266
0
                    res->d = b->core.flag & BAM_FREVERSE;
1267
0
                    return 0;
1268
0
                } else if (!memcmp(str, "mreverse", 8)) {
1269
0
                    *end = str+8;
1270
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1271
0
                    return 0;
1272
0
                } else if (!memcmp(str, "read1", 5)) {
1273
0
                    *end = str+5;
1274
0
                    res->d = b->core.flag & BAM_FREAD1;
1275
0
                    return 0;
1276
0
                } else if (!memcmp(str, "read2", 5)) {
1277
0
                    *end = str+5;
1278
0
                    res->d = b->core.flag & BAM_FREAD2;
1279
0
                    return 0;
1280
0
                } else if (!memcmp(str, "secondary", 9)) {
1281
0
                    *end = str+9;
1282
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1283
0
                    return 0;
1284
0
                } else if (!memcmp(str, "qcfail", 6)) {
1285
0
                    *end = str+6;
1286
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1287
0
                    return 0;
1288
0
                } else if (!memcmp(str, "dup", 3)) {
1289
0
                    *end = str+3;
1290
0
                    res->d = b->core.flag & BAM_FDUP;
1291
0
                    return 0;
1292
0
                } else if (!memcmp(str, "supplementary", 13)) {
1293
0
                    *end = str+13;
1294
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1295
0
                    return 0;
1296
0
                } else {
1297
0
                    hts_log_error("Unrecognised flag string");
1298
0
                    return -1;
1299
0
                }
1300
0
            }
1301
0
        }
1302
0
        break;
1303
1304
0
    case 'h':
1305
0
        if (memcmp(str, "hclen", 5) == 0) {
1306
0
            int hclen = 0;
1307
0
            uint32_t *cigar = bam_get_cigar(b);
1308
0
            uint32_t ncigar = b->core.n_cigar;
1309
1310
            // left
1311
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1312
0
                hclen = bam_cigar_oplen(cigar[0]);
1313
1314
            // right
1315
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1316
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1317
1318
0
            *end = str+5;
1319
0
            res->d = hclen;
1320
0
            return 0;
1321
0
        }
1322
0
        break;
1323
1324
0
    case 'l':
1325
0
        if (memcmp(str, "library", 7) == 0) {
1326
0
            *end = str+7;
1327
0
            res->is_str = 1;
1328
0
            const char *lib = bam_get_library(hb->h, b);
1329
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1330
0
            return 0;
1331
0
        }
1332
0
        break;
1333
1334
0
    case 'm':
1335
0
        if (memcmp(str, "mapq", 4) == 0) {
1336
0
            *end = str+4;
1337
0
            res->d = b->core.qual;
1338
0
            return 0;
1339
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1340
0
            *end = str+4;
1341
0
            res->d = b->core.mpos+1;
1342
0
            return 0;
1343
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1344
0
            *end = str+6;
1345
0
            res->is_str = 1;
1346
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1347
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1348
0
            return 0;
1349
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1350
0
            *end = str+6;
1351
0
            res->d = b->core.mtid;
1352
0
            return 0;
1353
0
        }
1354
0
        break;
1355
1356
0
    case 'n':
1357
0
        if (memcmp(str, "ncigar", 6) == 0) {
1358
0
            *end = str+6;
1359
0
            res->d = b->core.n_cigar;
1360
0
            return 0;
1361
0
        }
1362
0
        break;
1363
1364
0
    case 'p':
1365
0
        if (memcmp(str, "pos", 3) == 0) {
1366
0
            *end = str+3;
1367
0
            res->d = b->core.pos+1;
1368
0
            return 0;
1369
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1370
0
            *end = str+5;
1371
0
            res->d = b->core.mpos+1;
1372
0
            return 0;
1373
0
        }
1374
0
        break;
1375
1376
0
    case 'q':
1377
0
        if (memcmp(str, "qlen", 4) == 0) {
1378
0
            *end = str+4;
1379
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1380
0
            return 0;
1381
0
        } else if (memcmp(str, "qname", 5) == 0) {
1382
0
            *end = str+5;
1383
0
            res->is_str = 1;
1384
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1385
0
            return 0;
1386
0
        } else if (memcmp(str, "qual", 4) == 0) {
1387
0
            *end = str+4;
1388
0
            ks_clear(&res->s);
1389
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1390
0
                return -1;
1391
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1392
0
            res->s.l = b->core.l_qseq;
1393
0
            res->is_str = 1;
1394
0
            return 0;
1395
0
        }
1396
0
        break;
1397
1398
0
    case 'r':
1399
0
        if (memcmp(str, "rlen", 4) == 0) {
1400
0
            *end = str+4;
1401
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1402
0
            return 0;
1403
0
        } else if (memcmp(str, "rname", 5) == 0) {
1404
0
            *end = str+5;
1405
0
            res->is_str = 1;
1406
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1407
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1408
0
            return 0;
1409
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1410
0
            *end = str+5;
1411
0
            res->is_str = 1;
1412
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1413
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1414
0
            return 0;
1415
0
        } else if (memcmp(str, "refid", 5) == 0) {
1416
0
            *end = str+5;
1417
0
            res->d = b->core.tid;
1418
0
            return 0;
1419
0
        }
1420
0
        break;
1421
1422
0
    case 's':
1423
0
        if (memcmp(str, "seq", 3) == 0) {
1424
0
            *end = str+3;
1425
0
            ks_clear(&res->s);
1426
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1427
0
                return -1;
1428
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1429
0
            res->s.s[b->core.l_qseq] = 0;
1430
0
            res->s.l = b->core.l_qseq;
1431
0
            res->is_str = 1;
1432
0
            return 0;
1433
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1434
0
            int sclen = 0;
1435
0
            uint32_t *cigar = bam_get_cigar(b);
1436
0
            int ncigar = b->core.n_cigar;
1437
0
            int left = 0;
1438
1439
            // left
1440
0
            if (ncigar > 0
1441
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1442
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1443
0
            else if (ncigar > 1
1444
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1445
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1446
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1447
1448
            // right
1449
0
            if (ncigar-1 > left
1450
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1451
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1452
0
            else if (ncigar-2 > left
1453
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1454
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1455
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1456
1457
0
            *end = str+5;
1458
0
            res->d = sclen;
1459
0
            return 0;
1460
0
        }
1461
0
        break;
1462
1463
0
    case 't':
1464
0
        if (memcmp(str, "tlen", 4) == 0) {
1465
0
            *end = str+4;
1466
0
            res->d = b->core.isize;
1467
0
            return 0;
1468
0
        }
1469
0
        break;
1470
1471
0
    case '[':
1472
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1473
            /* aux tags */
1474
0
            *end = str+4;
1475
1476
0
            uint8_t *aux = bam_aux_get(b, str+1);
1477
0
            if (aux) {
1478
                // we define the truth of a tag to be its presence, even if 0.
1479
0
                res->is_true = 1;
1480
0
                switch (*aux) {
1481
0
                case 'Z':
1482
0
                case 'H':
1483
0
                    res->is_str = 1;
1484
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1485
0
                    break;
1486
1487
0
                case 'A':
1488
0
                    res->is_str = 1;
1489
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1490
0
                    break;
1491
1492
0
                case 'i': case 'I':
1493
0
                case 's': case 'S':
1494
0
                case 'c': case 'C':
1495
0
                    res->is_str = 0;
1496
0
                    res->d = bam_aux2i(aux);
1497
0
                    break;
1498
1499
0
                case 'f':
1500
0
                case 'd':
1501
0
                    res->is_str = 0;
1502
0
                    res->d = bam_aux2f(aux);
1503
0
                    break;
1504
1505
0
                default:
1506
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1507
0
                                  *aux);
1508
0
                    return -1;
1509
0
                }
1510
0
                return 0;
1511
1512
0
            } else {
1513
                // hence absent tags are always false (and strings)
1514
0
                res->is_str = 1;
1515
0
                res->s.l = 0;
1516
0
                res->d = 0;
1517
0
                res->is_true = 0;
1518
0
                return 0;
1519
0
            }
1520
0
        }
1521
0
        break;
1522
0
    }
1523
1524
    // All successful matches in switch should return 0.
1525
    // So if we didn't match, it's a parse error.
1526
0
    return -1;
1527
0
}
1528
1529
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1530
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1531
0
{
1532
0
    hb_pair hb = {h, b};
1533
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1534
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1535
0
        hts_log_error("Couldn't process filter expression");
1536
0
        hts_expr_val_free(&res);
1537
0
        return -1;
1538
0
    }
1539
1540
0
    int t = res.is_true;
1541
0
    hts_expr_val_free(&res);
1542
1543
0
    return t;
1544
0
}
1545
1546
// Record reader for CRAM input.  Fetches records from the cram_fd held by
// the htsFile passed as fpv until one passes fp->filter (or takes the
// first one when no filter is set), running bam_tag2cigar() on each and
// reporting its reference id, start and end through tid/beg/end.
// The BGZF argument is unused.
// Returns the cram_get_bam_seq() result for the accepted record, -1 when
// the read failed at end of file, or -2 on any other failure.
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = fpv;
    bam1_t *rec = bv;

    for (;;) {
        const int status = cram_get_bam_seq(file->fp.cram, &rec);
        if (status < 0)
            return cram_eof(file->fp.cram) ? -1 : -2;

        if (bam_tag2cigar(rec, 1, 1) < 0)
            return -2;

        *tid = rec->core.tid;
        *beg = rec->core.pos;
        *end = bam_endpos(rec);

        // No filter: every record is accepted
        if (!file->filter)
            return status;

        const int verdict = sam_passes_filter(file->bam_header, rec, file->filter);
        if (verdict < 0)
            return -2;
        if (verdict != 0)
            return status;
        // Rejected by the filter: try the next record
    }
}
1575
1576
static int cram_pseek(void *fp, int64_t offset, int whence)
1577
0
{
1578
0
    cram_fd *fd =  (cram_fd *)fp;
1579
1580
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1581
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1582
0
        return -1;
1583
1584
0
    fd->curr_position = offset;
1585
1586
0
    if (fd->ctr) {
1587
0
        cram_free_container(fd->ctr);
1588
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1589
0
            cram_free_container(fd->ctr_mt);
1590
1591
0
        fd->ctr = NULL;
1592
0
        fd->ctr_mt = NULL;
1593
0
        fd->ooc = 0;
1594
0
    }
1595
1596
0
    return 0;
1597
0
}
1598
1599
/*
1600
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1601
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1602
 *   container previously fetched. It was designed like this to integrate with the functionality
1603
 *   of the iterator stepping logic.
1604
 */
1605
1606
static int64_t cram_ptell(void *fp)
1607
0
{
1608
0
    cram_fd *fd = (cram_fd *)fp;
1609
0
    cram_container *c;
1610
0
    cram_slice *s;
1611
0
    int64_t ret = -1L;
1612
1613
0
    if (fd) {
1614
0
        if ((c = fd->ctr) != NULL) {
1615
0
            if ((s = c->slice) != NULL && s->max_rec) {
1616
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1617
0
                    fd->curr_position += c->offset + c->length;
1618
0
            }
1619
0
        }
1620
0
        ret = fd->curr_position;
1621
0
    }
1622
1623
0
    return ret;
1624
0
}
1625
1626
static int bam_pseek(void *fp, int64_t offset, int whence)
1627
0
{
1628
0
    BGZF *fd = (BGZF *)fp;
1629
1630
0
    return bgzf_seek(fd, offset, whence);
1631
0
}
1632
1633
static int64_t bam_ptell(void *fp)
1634
0
{
1635
0
    BGZF *fd = (BGZF *)fp;
1636
0
    if (!fd)
1637
0
        return -1L;
1638
1639
0
    return bgzf_tell(fd);
1640
0
}
1641
1642
1643
1644
// Loads the index belonging to fp.
// BAM and SAM use hts_idx_load3() with HTS_FMT_BAI as the format argument.
// For CRAM the index is loaded into the cram_fd and a small placeholder
// object that just points at that cram_fd is returned in its place.
// Returns NULL on failure or for any other file format.
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    if (fp->format.format == bam || fp->format.format == sam)
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);

    if (fp->format.format != cram)
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t

    if (cram_index_load(fp->fp.cram, fn, fnidx) < 0)
        return NULL;

    // Cons up a fake "index" just pointing at the associated cram_fd:
    hts_cram_idx_t *fake = malloc(sizeof *fake);
    if (fake == NULL)
        return NULL;

    fake->fmt = HTS_FMT_CRAI;
    fake->cram = fp->fp.cram;
    return (hts_idx_t *) fake;
}
1666
1667
// Loads the index for fp, passing the data file name, the index file name
// and the caller's flags straight to index_load().
// Returns the loaded index, or NULL on failure.
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    hts_idx_t *idx = index_load(fp, fn, fnidx, flags);
    return idx;
}
1671
1672
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1673
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1674
0
}
1675
1676
// Loads the index for fp with no explicit index file name (NULL) and
// HTS_IDX_SAVE_REMOTE as the flags.
// Returns the loaded index, or NULL on failure.
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
{
    const char *fnidx = NULL;
    const int flags = HTS_IDX_SAVE_REMOTE;
    return index_load(fp, fn, fnidx, flags);
}
1680
1681
/* Build an iterator for a CRAM file.
 *
 * The iterator is a dummy for which hts_itr_next() will simply invoke the
 * readrec function; the range restriction itself is installed on the
 * cram_fd via CRAM_OPT_RANGE.
 *
 * tid may be a reference id (>= 0), HTS_IDX_NOCOOR, HTS_IDX_START,
 * HTS_IDX_REST or HTS_IDX_NONE.  Any other negative value is logged and
 * the process is aborted.
 *
 * Returns the new iterator, or NULL on allocation failure or when
 * cram_set_option() reports an error other than -2. */
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    hts_itr_t *itr = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
    if (!itr)
        return NULL;

    itr->is_cram = 1;
    itr->read_rest = 1;
    itr->off = NULL;
    itr->bins.a = NULL;
    itr->readrec = readrec;

    if (tid < 0 && tid != HTS_IDX_NOCOOR && tid != HTS_IDX_START) {
        // Special requests that do not set a range on the cram_fd.
        if (tid != HTS_IDX_REST && tid != HTS_IDX_NONE) {
            hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
            abort();
        }

        itr->curr_off = 0;
        if (tid == HTS_IDX_NONE)
            itr->finished = 1;
        return itr;
    }

    // beg is passed on as beg+1 for the CRAM range.
    cram_range range = { tid, beg+1, end };
    int rc = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &range);

    itr->curr_off = 0;
    // The following fields are not required by hts_itr_next(), but are
    // filled in in case user code wants to look at them.
    itr->tid = tid;
    itr->beg = beg;
    itr->end = end;

    if (rc == -2) {
        // No data vs this ref, so mark iterator as completed.
        // Same as HTS_IDX_NONE.
        itr->finished = 1;
    } else if (rc != 0) {
        free(itr);
        return NULL;
    }

    return itr;
}
1737
1738
/* Create an iterator over [beg,end) on reference tid.
 *
 * With no index, hts_itr_query() is called with sam_readrec_rest as the
 * record reader.  With a CRAM pseudo-index (fmt == HTS_FMT_CRAI) the
 * query goes to cram_itr_query(); otherwise to hts_itr_query(). */
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
{
    if (idx == NULL)
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);

    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    return cidx->fmt == HTS_FMT_CRAI
        ? cram_itr_query(idx, tid, beg, end, sam_readrec)
        : hts_itr_query(idx, tid, beg, end, sam_readrec);
}
1748
1749
static int cram_name2id(void *fdv, const char *ref)
1750
0
{
1751
0
    cram_fd *fd = (cram_fd *) fdv;
1752
0
    return sam_hdr_name2tid(fd->header, ref);
1753
0
}
1754
1755
/* Create an iterator from a textual region specification.
 *
 * idx     Index (or CRAM pseudo-index) for the file; must not be NULL.
 * hdr     Header used to translate reference names via bam_name2id.
 * region  Region string, passed through to hts_itr_querys().
 *
 * Returns the iterator produced by hts_itr_querys(), or NULL if idx is
 * NULL or the underlying query fails. */
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    // The index format is read below to choose the query function, so a
    // NULL index cannot be serviced; fail cleanly as sam_itr_regarray()
    // and sam_itr_regions() do rather than dereferencing it.
    if (cidx == NULL)
        return NULL;

    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
                          sam_readrec);
}
1762
1763
/* Create a multi-region iterator from an array of region strings.
 *
 * The strings are converted to a region list with hts_reglist_create()
 * and then handed to hts_itr_regions().  For a CRAM pseudo-index the
 * cram_fd and cram_* callbacks are used; otherwise the header and the
 * bam_* / sam_readrec callbacks.
 *
 * Returns NULL if idx or hdr is NULL, if the region list cannot be built,
 * or if hts_itr_regions() fails (the region list is freed in that case). */
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
{
    if (idx == NULL || hdr == NULL)
        return NULL;

    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    const int via_cram = (cidx->fmt == HTS_FMT_CRAI);
    int n_regs = 0;

    hts_reglist_t *regs = via_cram
        ? hts_reglist_create(regarray, regcount, &n_regs, cidx->cram, cram_name2id)
        : hts_reglist_create(regarray, regcount, &n_regs, hdr, (hts_name2id_f)(bam_name2id));
    if (regs == NULL)
        return NULL;

    hts_itr_t *result = via_cram
        ? hts_itr_regions(idx, regs, n_regs, cram_name2id, cidx->cram,
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell)
        : hts_itr_regions(idx, regs, n_regs, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);

    if (result == NULL)
        hts_reglist_free(regs, n_regs);

    return result;
}
1792
1793
/* Create a multi-region iterator from an already-built region list.
 *
 * Returns NULL when idx, hdr or reglist is NULL.  Otherwise forwards to
 * hts_itr_regions() with the CRAM callbacks for a CRAM pseudo-index, or
 * the BAM/SAM callbacks for any other index. */
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
{
    if (idx == NULL || hdr == NULL || reglist == NULL)
        return NULL;

    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    if (cidx->fmt != HTS_FMT_CRAI) {
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
    }

    return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
               hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
}
1807
1808
/**********************
1809
 *** SAM header I/O ***
1810
 **********************/
1811
1812
#include "htslib/kseq.h"
1813
#include "htslib/kstring.h"
1814
1815
/* Build a new header from l_text bytes of header text.
 *
 * Returns the new header, or NULL if the header could not be allocated
 * or sam_hdr_add_lines() rejected the text (the partially built header
 * is destroyed in that case). */
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
{
    sam_hdr_t *hdr = sam_hdr_init();

    if (hdr != NULL && sam_hdr_add_lines(hdr, text, l_text) != 0) {
        sam_hdr_destroy(hdr);
        hdr = NULL;
    }

    return hdr;
}
1827
1828
// Minimal sanitisation of a header to ensure:
1829
// - null terminated string.
1830
// - all lines start with @ (also implies no blank lines).
1831
//
1832
// Much more could be done, but currently is not, including:
1833
// - checking header types are known (HD, SQ, etc).
1834
// - syntax (eg checking tab separated fields).
1835
// - validating n_targets matches @SQ records.
1836
// - validating target lengths against @SQ records.
1837
3.23k
/* Check and lightly repair the text of header h.
 *
 * Every line must start with '@'; otherwise the header is destroyed and
 * NULL is returned.  A NUL found before l_text ends the scan (a warning is
 * logged unless everything after it is NUL padding).  If the scanned text
 * does not end in a newline, one is appended (reallocating if needed) and
 * the text is NUL terminated.
 *
 * Takes ownership of h on failure: NULL in gives NULL out, and any error
 * destroys h before returning NULL. */
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
    if (h == NULL)
        return NULL;

    // Special case for empty headers.
    if (h->l_text == 0)
        return h;

    char *text = h->text;
    char prev = '\n';           // pretend a newline precedes the first byte
    unsigned int line_no = 0;
    size_t pos = 0;

    // NB: l_text excludes terminating nul.  This finds early ones.
    while (pos < h->l_text && text[pos] != '\0') {
        // Error on \n[^@], including duplicate newlines
        if (prev == '\n') {
            line_no++;
            if (text[pos] != '@') {
                hts_log_error("Malformed SAM header at line %u", line_no);
                sam_hdr_destroy(h);
                return NULL;
            }
        }

        prev = text[pos];
        pos++;
    }

    if (pos < h->l_text) { // Early nul found.  Complain if not just padding.
        size_t k = pos;
        while (k < h->l_text && text[k] == '\0')
            k++;
        if (k < h->l_text)
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
    }

    // Nothing more to do when the text already ends with a newline.
    if (prev == '\n')
        return h;

    // Add trailing newline and/or trailing nul if required.
    hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");

    if (h->l_text < 2 || pos >= h->l_text - 2) {
        if (h->l_text >= SIZE_MAX - 2) {
            hts_log_error("No room for extra newline");
            sam_hdr_destroy(h);
            return NULL;
        }

        text = realloc(h->text, (size_t) h->l_text+2);
        if (text == NULL) {
            // h->text is untouched by the failed realloc.
            sam_hdr_destroy(h);
            return NULL;
        }
        h->text = text;
    }
    text[pos++] = '\n';

    // l_text may be larger already due to multiple nul padding
    if (h->l_text < pos)
        h->l_text = pos;
    text[h->l_text] = '\0';

    return h;
}
1901
1902
2.60k
/* Read the header of a SAM text file and install it on fp.
 *
 * A fresh header is built with sam_hdr_build_from_sam_file(), sanitised,
 * and stored in fp->bam_header (any header previously stored there is
 * destroyed first).  Its ref_count is set to 1 before it is returned.
 *
 * Returns the header now held in fp->bam_header, or NULL on failure.
 * If sanitising fails, fp->bam_header is left NULL. */
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
    sam_hdr_t* h = sam_hdr_init();
    if (!h)
        return NULL;

    if (sam_hdr_build_from_sam_file(h, fp) != 0) {
        sam_hdr_destroy(h);
        return NULL;
    }

    if (fp->bam_header)
        sam_hdr_destroy(fp->bam_header);

    // sam_hdr_sanitise() destroys h and returns NULL when the text is
    // malformed or cannot be extended, so check before touching the result.
    fp->bam_header = sam_hdr_sanitise(h);
    if (!fp->bam_header)
        return NULL;

    fp->bam_header->ref_count = 1;

    return fp->bam_header;
}
1919
1920
/* Read the header from an open file.
 *
 * BAM and CRAM headers are read (or duplicated from the cram_fd) and then
 * sanitised; SAM headers go through sam_hdr_create().  FASTA/FASTQ files
 * have no header, so a newly initialised one is returned.
 *
 * Returns the header, or NULL on error.  errno is set to EINVAL for a
 * NULL fp, EPIPE for an empty file and EFTYPE for an unsupported format. */
sam_hdr_t *sam_hdr_read(htsFile *fp)
{
    if (fp == NULL) {
        errno = EINVAL;
        return NULL;
    }

    sam_hdr_t *hdr = NULL;

    switch (fp->format.format) {
    case fastq_format:
    case fasta_format:
        return sam_hdr_init();

    case empty_format:
        errno = EPIPE;
        return NULL;

    case bam:
        hdr = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
        break;

    case cram:
        hdr = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
        break;

    case sam:
        hdr = sam_hdr_create(fp);
        break;

    default:
        errno = EFTYPE;
        return NULL;
    }

    // Only sam, bam and cram reach this point.  Record the header on the
    // file unless one is already set (the sam path has stored it already).
    // For cram this is the output header, not the cram_fd's internal one.
    if (hdr != NULL && fp->bam_header == NULL) {
        fp->bam_header = hdr;
        sam_hdr_incr_ref(fp->bam_header);
    }

    return hdr;
}
1961
1962
/* Write header h to fp in the format fp was opened with.
 *
 * binary_format and text_format are resolved to bam and sam respectively
 * before writing.  For SAM output the text is rebuilt from h->hrecs when
 * present; otherwise h->text is written as-is and, if it contains no
 * "@SQ" line, one @SQ line per target is synthesised from target_name and
 * target_len.  FASTA/FASTQ output has no header and succeeds trivially.
 *
 * On success for sam/bam/cram, fp->bam_header is replaced by a duplicate
 * of h.
 *
 * Returns 0 on success, -1 on failure (errno is EINVAL for NULL
 * arguments and EBADF for an unsupported format). */
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
{
    if (!fp || !h) {
        errno = EINVAL;
        return -1;
    }

    switch (fp->format.format) {
    case binary_format:
        fp->format.category = sequence_data;
        fp->format.format = bam;
        /* fall-through */
    case bam:
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
        break;

    case cram: {
        cram_fd *fd = fp->fp.cram;
        if (cram_set_header2(fd, h) < 0) return -1;
        if (fp->fn_aux)
            cram_load_reference(fd, fp->fn_aux);
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
        }
        break;

    case text_format:
        fp->format.category = sequence_data;
        fp->format.format = sam;
        /* fall-through */
    case sam: {
        if (!h->hrecs && !h->text)
            return 0;
        char *text;
        kstring_t hdr_ks = { 0, 0, NULL };
        size_t l_text;
        ssize_t bytes;
        int r = 0, no_sq = 0;

        if (h->hrecs) {
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) {
                // Release whatever was built before the failure.
                free(hdr_ks.s);
                return -1;
            }
            text = hdr_ks.s;
            l_text = hdr_ks.l;
        } else {
            // Look for "@SQ\t" at the start of any line of the raw text.
            const char *p = NULL;
            do {
                const char *q = p == NULL ? h->text : p + 4;
                p = strstr(q, "@SQ\t");
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
            no_sq = p == NULL;
            text = h->text;
            l_text = h->l_text;
        }

        if (fp->is_bgzf) {
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
        } else {
            bytes = hwrite(fp->fp.hfile, text, l_text);
        }
        free(hdr_ks.s);
        if (bytes < 0 || (size_t) bytes != l_text)
            return -1;

        if (no_sq) {
            int i;
            for (i = 0; i < h->n_targets; ++i) {
                fp->line.l = 0;
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
                r |= kputs(h->target_name[i], &fp->line) < 0;
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
                // Target lengths are unsigned; format them as such so
                // values above INT_MAX are not printed as negatives.
                r |= kputuw(h->target_len[i], &fp->line) < 0;
                r |= kputc('\n', &fp->line) < 0;
                if (r != 0)
                    return -1;

                if (fp->is_bgzf) {
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
                } else {
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
                }
                if (bytes < 0 || (size_t) bytes != fp->line.l)
                    return -1;
            }
        }
        if (fp->is_bgzf) {
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
        } else {
            if (hflush(fp->fp.hfile) != 0) return -1;
        }
        }
        break;

    case fastq_format:
    case fasta_format:
        // Nothing to output; FASTQ has no file headers.
        return 0;

    default:
        errno = EBADF;
        return -1;
    }

    // Only sam, bam and cram reach here.  Remember a private copy of the
    // header just written; h is known to be non-NULL at this point.
    sam_hdr_t *tmp = fp->bam_header;
    fp->bam_header = sam_hdr_dup(h);
    sam_hdr_destroy(tmp);
    if (!fp->bam_header)
        return -1;  //failed to duplicate

    return 0;
}
2074
2075
/* Text-only implementation of sam_hdr_change_HD(), used when the header
 * has no parsed records.
 *
 * Sets (val != NULL) or removes (val == NULL) the tag named by key on the
 * @HD line of h->text.  If the text does not start with "@HD", a new
 * "@HD\tVN:<SAM_FORMAT_VERSION>" line (with the tag, if val is given) is
 * prepended.  h->text is replaced by a newly allocated string and
 * h->l_text updated.  The size arithmetic assumes a two-character key.
 *
 * Returns 0 on success (including when the tag already has value val),
 * -1 on error. */
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
    size_t new_l_text;
    if (!h || !key)
        return -1;

    if (h->l_text > 3) {
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
            if ((p = strchr(h->text, '\n')) == 0) return -1;
            *p = '\0'; // for strstr call

            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };

            if ((q = strstr(h->text, tmp)) != 0) { // key exists
                *p = '\n'; // change back

                // mark the key:val
                beg = q;
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
                end = q;

                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
                    && strlen(val) == end - beg - 4)
                     return 0; // val is the same, no need to change

            } else {
                beg = end = p;
                *p = '\n';
            }
        }
    }
    if (beg == NULL) { // no @HD
        // A header that has never had text attached has text == NULL;
        // passing that to %s is undefined, so substitute an empty string.
        const char *old_text = h->text ? h->text : "";

        new_l_text = h->l_text;
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
            return -1;
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val)
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, old_text);
        else
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, old_text);
    } else { // has @HD but different or no key
        // Keep everything before beg and from end onwards.
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val) {
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
                    (int) (beg - h->text), h->text, key, val, end);
        } else { //delete key
            snprintf(newtext, new_l_text + 1, "%.*s%s",
                    (int) (beg - h->text), h->text, end);
        }
    }
    free(h->text);
    h->text = newtext;
    h->l_text = new_l_text;
    return 0;
}
2149
2150
2151
/* Set or remove a tag on the @HD header line.
 *
 * With val non-NULL the tag named key is updated to val; with val NULL
 * the tag is removed.  Headers without parsed records are handled by the
 * text-based old_sam_hdr_change_HD().
 *
 * Returns -1 for NULL h or key or when the update/removal fails;
 * otherwise the result of sam_hdr_rebuild() (or of the text-based path). */
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    if (h == NULL || key == NULL)
        return -1;

    if (h->hrecs == NULL)
        return old_sam_hdr_change_HD(h, key, val);

    int rc = val
        ? sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL)
        : sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key);
    if (rc != 0)
        return -1;

    return sam_hdr_rebuild(h);
}
2168
2169
/* releases existing header and sets new one; increments ref count if not
2170
duplicating */
2171
int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate)
2172
0
{
2173
0
    if (!fp)
2174
0
        return -1;
2175
2176
0
    if (duplicate) {
2177
0
        sam_hdr_t *tmp = fp->bam_header;
2178
0
        fp->bam_header = sam_hdr_dup(h);
2179
0
        sam_hdr_destroy(tmp);
2180
0
        if (!fp->bam_header && h)
2181
0
            return -1;  //duplicate failed
2182
0
    } else {
2183
0
        if (fp->bam_header != h) {  //if not the same
2184
0
            sam_hdr_destroy(fp->bam_header);
2185
0
            fp->bam_header = h;
2186
0
            sam_hdr_incr_ref(fp->bam_header);
2187
0
        }
2188
0
    }
2189
2190
0
    return 0;
2191
0
}
2192
2193
//return the bam_header, user has to use sam_hdr_incr_ref where ever required
2194
sam_hdr_t* sam_hdr_get(samFile* fp)
2195
0
{
2196
0
    if (!fp)
2197
0
        return NULL;
2198
0
    return fp->bam_header;
2199
0
}
2200
2201
/**********************
2202
 *** SAM record I/O ***
2203
 **********************/
2204
2205
// The speed of this code can vary considerably depending on minor code
2206
// changes elsewhere as some of the tight loops are particularly prone to
2207
// speed changes when the instruction blocks are split over a 32-byte
2208
// boundary.  To protect against this, we explicitly specify an alignment
2209
// for this function.  If this is insufficient, we may also wish to
2210
// consider alignment of blocks within this function via
2211
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2212
// However it's not very portable.
2213
// Instead we break into separate functions so we can explicitly specify
2214
// __attribute__((aligned(32))) and force consistent loop
2215
// alignment.
2216
239k
/* Grow the element estimate *n for a B array by half, making sure the
 * record's data buffer can take the extra elements of the given size.
 *
 * Returns 0 on success.  Returns -1 with errno set to ENOMEM if *n is
 * already too large, or -1 after logging if the buffer cannot be
 * expanded. */
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
    if (*n > INT32_MAX*0.666) {
        errno = ENOMEM;
        return -1;
    }

    const uint32_t extra = *n>>1;
    size_t bytes = (size_t)size * (size_t)extra;

    if (possibly_expand_bam_data(b, bytes) < 0) {
        hts_log_error("Out of memory");
        return -1;
    }

    *n += extra;
    return 0;
}
2232
2233
2234
// Advance q until it points at a ',' or at any character <= '\t'
// (which includes the terminating NUL).  Used after reading a number so
// that q always ends up at the next comma even if the number is followed
// by junk; this prevents the possibility of trying to read more than n
// items.
#define skip_to_comma_(q)                           \
    do {                                            \
        while (*(q) > '\t' && *(q) != ',')          \
            (q)++;                                  \
    } while (0)
2238
2239
HTS_ALIGN32
2240
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2241
22.0k
                               uint32_t *nalloc, int *overflow) {
2242
1.46M
    while (*q == ',') {
2243
1.43M
        if ((*nused)++ >= (*nalloc)) {
2244
414
            if (grow_B_array(b, nalloc, 1) < 0)
2245
0
                return NULL;
2246
414
        }
2247
1.43M
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2248
1.43M
        b->l_data++;
2249
1.43M
    }
2250
22.0k
    return q;
2251
22.0k
}
2252
2253
HTS_ALIGN32
2254
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2255
19.0k
                               uint32_t *nalloc, int *overflow) {
2256
622k
    while (*q == ',') {
2257
603k
        if ((*nused)++ >= (*nalloc)) {
2258
1.52k
            if (grow_B_array(b, nalloc, 1) < 0)
2259
0
                return NULL;
2260
1.52k
        }
2261
603k
        if (q[1] != '-') {
2262
590k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2263
590k
            b->l_data++;
2264
590k
        } else {
2265
12.8k
            *overflow = 1;
2266
12.8k
            q++;
2267
12.8k
            skip_to_comma_(q);
2268
12.8k
        }
2269
603k
    }
2270
19.0k
    return q;
2271
19.0k
}
2272
2273
HTS_ALIGN32
2274
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2275
8.18k
                               uint32_t *nalloc, int *overflow) {
2276
860k
    while (*q == ',') {
2277
852k
        if ((*nused)++ >= (*nalloc)) {
2278
3.32k
            if (grow_B_array(b, nalloc, 2) < 0)
2279
0
                return NULL;
2280
3.32k
        }
2281
852k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2282
852k
                  b->data + b->l_data);
2283
852k
        b->l_data += 2;
2284
852k
    }
2285
8.18k
    return q;
2286
8.18k
}
2287
2288
HTS_ALIGN32
2289
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2290
4.84k
                               uint32_t *nalloc, int *overflow) {
2291
2.90M
    while (*q == ',') {
2292
2.89M
        if ((*nused)++ >= (*nalloc)) {
2293
6.90k
            if (grow_B_array(b, nalloc, 2) < 0)
2294
0
                return NULL;
2295
6.90k
        }
2296
2.89M
        if (q[1] != '-') {
2297
2.85M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2298
2.85M
                      b->data + b->l_data);
2299
2.85M
            b->l_data += 2;
2300
2.85M
        } else {
2301
44.2k
            *overflow = 1;
2302
44.2k
            q++;
2303
44.2k
            skip_to_comma_(q);
2304
44.2k
        }
2305
2.89M
    }
2306
4.84k
    return q;
2307
4.84k
}
2308
2309
HTS_ALIGN32
2310
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2311
22.5k
                               uint32_t *nalloc, int *overflow) {
2312
6.16M
    while (*q == ',') {
2313
6.14M
        if ((*nused)++ >= (*nalloc)) {
2314
78
            if (grow_B_array(b, nalloc, 4) < 0)
2315
0
                return NULL;
2316
78
        }
2317
6.14M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2318
6.14M
                  b->data + b->l_data);
2319
6.14M
        b->l_data += 4;
2320
6.14M
    }
2321
22.5k
    return q;
2322
22.5k
}
2323
2324
HTS_ALIGN32
2325
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2326
67.8k
                               uint32_t *nalloc, int *overflow) {
2327
1.83M
    while (*q == ',') {
2328
1.76M
        if ((*nused)++ >= (*nalloc)) {
2329
188k
            if (grow_B_array(b, nalloc, 4) < 0)
2330
0
                return NULL;
2331
188k
        }
2332
1.76M
        if (q[1] != '-') {
2333
1.66M
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2334
1.66M
                      b->data + b->l_data);
2335
1.66M
            b->l_data += 4;
2336
1.66M
        } else {
2337
92.7k
            *overflow = 1;
2338
92.7k
            q++;
2339
92.7k
            skip_to_comma_(q);
2340
92.7k
        }
2341
1.76M
    }
2342
67.8k
    return q;
2343
67.8k
}
2344
2345
HTS_ALIGN32
2346
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2347
13.2k
                               uint32_t *nalloc, int *overflow) {
2348
312k
    while (*q == ',') {
2349
299k
        if ((*nused)++ >= (*nalloc)) {
2350
38.8k
            if (grow_B_array(b, nalloc, 4) < 0)
2351
0
                return NULL;
2352
38.8k
        }
2353
299k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2354
299k
        b->l_data += 4;
2355
299k
    }
2356
13.2k
    return q;
2357
13.2k
}
2358
2359
HTS_ALIGN32
2360
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2361
                              char **end, bam1_t *b,
2362
158k
                              int *ctr) {
2363
    // Protect against infinite recursion when dealing with invalid input.
2364
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2365
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2366
    //
2367
    // Loop detection is the safest solution incase there are other
2368
    // strange corner cases with malformed inputs.
2369
158k
    if (++(*ctr) > 2) {
2370
3
        hts_log_error("Malformed data in B:%c array", type);
2371
3
        return -1;
2372
3
    }
2373
2374
158k
    int orig_l = b->l_data;
2375
158k
    char *q = in;
2376
158k
    int32_t size;
2377
158k
    size_t bytes;
2378
158k
    int overflow = 0;
2379
2380
158k
    size = aux_type2size(type);
2381
158k
    if (size <= 0 || size > 4) {
2382
0
        hts_log_error("Unrecognized type B:%c", type);
2383
0
        return -1;
2384
0
    }
2385
2386
    // Ensure space for type + values.
2387
    // The first pass through here we don't know the number of entries and
2388
    // nalloc == 0.  We start with a small working set and then parse the
2389
    // data, growing as needed.
2390
    //
2391
    // If we have a second pass through we do know the number of entries
2392
    // and nalloc is already known.  We have no need to expand the bam data.
2393
158k
    if (!nalloc)
2394
110k
         nalloc=7;
2395
2396
    // Ensure allocated memory is big enough (for current nalloc estimate)
2397
158k
    bytes = (size_t) nalloc * (size_t) size;
2398
158k
    if (bytes / size != nalloc
2399
158k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2400
0
        hts_log_error("Out of memory");
2401
0
        return -1;
2402
0
    }
2403
2404
158k
    uint32_t nused = 0;
2405
2406
158k
    b->data[b->l_data++] = 'B';
2407
158k
    b->data[b->l_data++] = type;
2408
    // 32-bit B-array length is inserted later once we know it.
2409
158k
    int b_len_idx = b->l_data;
2410
158k
    b->l_data += sizeof(uint32_t);
2411
2412
158k
    if (type == 'c') {
2413
22.0k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2414
0
            return -1;
2415
135k
    } else if (type == 'C') {
2416
19.0k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2417
0
            return -1;
2418
116k
    } else if (type == 's') {
2419
8.18k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2420
0
            return -1;
2421
108k
    } else if (type == 'S') {
2422
4.84k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2423
0
            return -1;
2424
103k
    } else if (type == 'i') {
2425
22.5k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2426
0
            return -1;
2427
81.3k
    } else if (type == 'I') {
2428
67.8k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2429
0
            return -1;
2430
67.8k
    } else if (type == 'f') {
2431
13.2k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2432
0
            return -1;
2433
13.2k
    }
2434
158k
    if (*q != '\t' && *q != '\0') {
2435
        // Unknown B array type or junk in the numbers
2436
58
        hts_log_error("Malformed B:%c", type);
2437
58
        return -1;
2438
58
    }
2439
157k
    i32_to_le(nused, b->data + b_len_idx);
2440
2441
157k
    if (!overflow) {
2442
110k
        *end = q;
2443
110k
        return 0;
2444
110k
    } else {
2445
47.3k
        int64_t max = 0, min = 0, val;
2446
        // Given type was incorrect.  Try to rescue the situation.
2447
47.3k
        char *r = q;
2448
47.3k
        q = in;
2449
47.3k
        overflow = 0;
2450
47.3k
        b->l_data = orig_l;
2451
        // Find out what range of values is present
2452
6.90M
        while (q < r) {
2453
6.85M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2454
6.85M
            if (max < val) max = val;
2455
6.85M
            if (min > val) min = val;
2456
6.85M
            skip_to_comma_(q);
2457
6.85M
        }
2458
        // Retry with appropriate type
2459
47.3k
        if (!overflow) {
2460
47.3k
            if (min < 0) {
2461
46.5k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2462
21.5k
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2463
24.9k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2464
2.31k
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2465
22.5k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2466
22.5k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2467
22.5k
                }
2468
46.5k
            } else {
2469
873
                if (max < UINT8_MAX) {
2470
6
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2471
867
                } else if (max <= UINT16_MAX) {
2472
223
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2473
644
                } else if (max <= UINT32_MAX) {
2474
643
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2475
643
                }
2476
873
            }
2477
47.3k
        }
2478
        // If here then at least one of the values is too big to store
2479
21
        hts_log_error("Numeric value in B array out of allowed range");
2480
21
        return -1;
2481
47.3k
    }
2482
157k
#undef skip_to_comma_
2483
157k
}
2484
2485
HTS_ALIGN32
2486
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2487
110k
{
2488
110k
    int ctr = 0;
2489
110k
    uint32_t nalloc = 0;
2490
110k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2491
110k
}
2492
2493
117k
/* Parse the FLAG field starting at v.
 *
 * A leading '1'..'9' is read as a 16-bit unsigned decimal.  A lone "0"
 * followed by a tab is zero; any other leading '0' is handed to strtoul()
 * with base 0 (so hex and octal are accepted) and clamped to 65535 with
 * *overflow set.  Anything else yields 0 with *rv left at v.
 *
 * *rv is set to the first character not consumed. */
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char first = *v;

    if (first == '0') {
        // handle single-digit "0" directly; otherwise it's hex or octal
        if (v[1] == '\t') {
            *rv = v+1;
            return 0;
        }

        unsigned long parsed = strtoul(v, rv, 0);
        if (parsed > 65535) {
            *overflow = 1;
            return 65535;
        }
        return parsed;
    }

    if (first >= '1' && first <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    // TODO implement symbolic flag letters
    *rv = v;
    return 0;
}
2512
2513
// Parse tag line and append to bam object b.
// Shared by both SAM and FASTQ parsers.
//
// The difference between the two is how lenient we are to recognising
// non-compliant strings.  The FASTQ parser glosses over arbitrary
// non-SAM looking strings.
//
//   start, end     the text to parse is [start, end)
//   b              record whose data buffer receives the encoded tags
//   lenient        non-zero: skip over anything that fails to parse
//                  instead of reporting an error
//   tag_whitelist  if non-NULL, tags whose two-character id is not in
//                  this hash are skipped
//
// Returns 0 on success, -2 on failure.
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
                            khash_t(tag) *tag_whitelist) {
    int overflow = 0;
    int checkpoint;
    char logbuf[40];
    char *q = start, *p = end;

// When cond is true: in lenient mode skip to the start of the next
// whitespace separated item, roll b->l_data back to the value saved at
// the top of the current iteration, and restart at the "loop" label.
// Otherwise log the message and fail.
#define _parse_err(cond, ...)                   \
    do {                                        \
        if (cond) {                             \
            if (lenient) {                      \
                while (q < p && !isspace_c(*q))   \
                    q++;                        \
                while (q < p && isspace_c(*q))    \
                    q++;                        \
                b->l_data = checkpoint;         \
                goto loop;                      \
            } else {                            \
                hts_log_error(__VA_ARGS__);     \
                goto err_ret;                   \
            }                                   \
        }                                       \
    } while (0)

    // Note the label sits on the loop body, so "goto loop" re-enters the
    // body without re-testing q < p; the length check below covers that.
    while (q < p) loop: {
        char type;
        // Remember where this tag starts so a lenient failure can undo it.
        checkpoint = b->l_data;
        // Fewer than 5 characters left cannot hold "XX:T:".
        if (p - q < 5) {
            if (lenient) {
                break;
            } else {
                hts_log_error("Incomplete aux field");
                goto err_ret;
            }
        }
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");

        // Lenient mode: skip items that do not have the "XX:T:" shape.
        if (lenient && (q[2] | q[4]) != ':') {
            while (q < p && !isspace_c(*q))
                q++;
            while (q < p && isspace_c(*q))
                q++;
            continue;
        }

        if (tag_whitelist) {
            // Hash key is the two id characters packed into one int.
            int tt = q[0]*256 + q[1];
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
                while (q < p && *q != '\t')
                    q++;
                continue;
            }
        }

        // Copy over id
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
        q += 3; type = *q++; ++q; // q points to value
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
            _parse_err(*q <= '\t', "incomplete aux field");

        // Ensure enough space for a double + type allocated.
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;

        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
            // All four are stored as an 'A' followed by one character.
            b->data[b->l_data++] = 'A';
            b->data[b->l_data++] = *q++;
        } else if (type == 'i' || type == 'I') {
            // Integers are stored in the smallest type that holds the value:
            // c / s / i when the text starts with '-', C / S / I otherwise.
            if (*q == '-') {
                int32_t x = hts_str2int(q, &q, 32, &overflow);
                if (x >= INT8_MIN) {
                    b->data[b->l_data++] = 'c';
                    b->data[b->l_data++] = x;
                } else if (x >= INT16_MIN) {
                    b->data[b->l_data++] = 's';
                    i16_to_le(x, b->data + b->l_data);
                    b->l_data += 2;
                } else {
                    b->data[b->l_data++] = 'i';
                    i32_to_le(x, b->data + b->l_data);
                    b->l_data += 4;
                }
            } else {
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
                if (x <= UINT8_MAX) {
                    b->data[b->l_data++] = 'C';
                    b->data[b->l_data++] = x;
                } else if (x <= UINT16_MAX) {
                    b->data[b->l_data++] = 'S';
                    u16_to_le(x, b->data + b->l_data);
                    b->l_data += 2;
                } else {
                    b->data[b->l_data++] = 'I';
                    u32_to_le(x, b->data + b->l_data);
                    b->l_data += 4;
                }
            }
        } else if (type == 'f') {
            b->data[b->l_data++] = 'f';
            float_to_le(strtod(q, &q), b->data + b->l_data);
            b->l_data += sizeof(float);
        } else if (type == 'd') {
            b->data[b->l_data++] = 'd';
            double_to_le(strtod(q, &q), b->data + b->l_data);
            b->l_data += sizeof(double);
        } else if (type == 'Z' || type == 'H') {
            // The value runs to the next tab, or to the terminating NUL.
            char *end = strchr(q, '\t');
            if (!end) end = q + strlen(q);
            _parse_err(type == 'H' && ((end-q)&1) != 0,
                       "hex field does not have an even number of digits");
            b->data[b->l_data++] = type;
            // +1 for the NUL terminator appended below.
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
            memcpy(b->data + b->l_data, q, end - q);
            b->l_data += end - q;
            b->data[b->l_data++] = '\0';
            q = end;
        } else if (type == 'B') {
            type = *q++; // q points to the first ',' following the typing byte
            _parse_err(*q && *q != ',' && *q != '\t',
                       "B aux field type not followed by ','");

            if (sam_parse_B_vals(type, q, &q, b) < 0)
                goto err_ret;
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));

        while (*q > '\t') { q++; } // Skip any junk to next tab
        q++;
    }

    // Numeric overflows are accumulated across all tags and reported once
    // here; the condition is always false in lenient mode.
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
#undef _parse_err

    return 0;

err_ret:
    return -2;
}
2656
2657
// Parse one line of SAM text held in s into the BAM record b.
//
//   s  the line; it is modified in place (tabs ending the tokens read
//      with _read_token are overwritten with NUL)
//   h  header, used to turn reference names into ids
//   b  destination record; b->l_data is reset to 0 and the first 32 bytes
//      of b->core are cleared before parsing starts
//
// Returns 0 on success, -2 on failure.
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
{
// Evaluates to the start of the current token.  The statement that follows
// replaces the next tab with NUL and moves _p just past it, or fails the
// parse if there is no tab.
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)

// COPY_MINUS_N copies l bytes from "from" to "to", subtracting n from each
// one.  "failed" is set if any resulting byte has its top bit set.
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff

// Macro that operates on 64-bits at a time.
#define COPY_MINUS_N(to,from,n,l,failed)                        \
    do {                                                        \
        uint64_u *from8 = (uint64_u *)(from);                   \
        uint64_u *to8 = (uint64_u *)(to);                       \
        uint64_t uflow = 0;                                     \
        size_t l8 = (l)>>3, i;                                  \
        for (i = 0; i < l8; i++) {                              \
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
            uflow |= to8[i];                                    \
        }                                                       \
        for (i<<=3; i < (l); ++i) {                             \
            to[i] = from[i] - (n);                              \
            uflow |= to[i];                                     \
        }                                                       \
        failed = (uflow & 0x8080808080808080UL) > 0;            \
    } while (0)

#else

// Basic version which operates a byte at a time
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
        uint8_t uflow = 0;                                   \
        for (i = 0; i < (l); ++i) {                          \
            (to)[i] = (from)[i] - (n);                       \
            uflow |= (uint8_t) (to)[i];                      \
        }                                                    \
        failed = (uflow & 0x80) > 0;                         \
    } while (0)

#endif

// Reserve l bytes at the end of b's data (growing it if needed), store the
// address of the reserved space in *x and advance b->l_data past it.
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
// Log an error and fail the parse when cond is true.
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
// Log a warning when cond is true; parsing carries on.
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)

    uint8_t *t;

    char *p = s->s, *q;
    int i, overflow = 0;
    char logbuf[40];
    hts_pos_t cigreflen;
    bam1_core_t *c = &b->core;

    b->l_data = 0;
    memset(c, 0, 32);

    // qname
    q = _read_token(p);

    // p - q counts the name plus the NUL that replaced the tab.
    _parse_warn(p - q <= 1, "empty query name");
    _parse_err(p - q > 255, "query name too long");
    // resize large enough for name + extranul
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;

    // Pad with extra NULs so that l_data becomes a multiple of 4.
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
    b->l_data += c->l_extranul;

    c->l_qname = p - q + c->l_extranul;

    // flag
    c->flag = parse_sam_flag(p, &p, &overflow);
    if (*p++ != '\t') goto err_ret; // malformed flag

    // chr
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
        c->tid = bam_name2id(h, q);
        _parse_err(c->tid < -1, "failed to parse header");
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    } else c->tid = -1;

    // pos
    // The text is 1-based; the stored position is one less.
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->pos < 0 && c->tid >= 0) {
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
        c->tid = -1;
    }
    if (c->tid < 0) c->flag |= BAM_FUNMAP;

    // mapq
    c->qual = hts_str2uint(p, &p, 8, &overflow);
    if (*p++ != '\t') goto err_ret;
    // cigar
    if (*p != '*') {
        uint32_t *cigar = NULL;
        int old_l_data = b->l_data;
        int n_cigar = bam_parse_cigar(p, &p, b);
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
        // Take the address only now, after the call that may have grown
        // b->data.
        cigar = (uint32_t *)(b->data + old_l_data);

        // can't use bam_endpos() directly as some fields not yet set up
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
        if (cigreflen == 0) cigreflen = 1;
    } else {
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
        c->flag |= BAM_FUNMAP;
        q = _read_token(p);
        cigreflen = 1;
    }
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
               "read ends beyond highest supported position");
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
    // mate chr
    q = _read_token(p);
    if (strcmp(q, "=") == 0) {
        c->mtid = c->tid;
    } else if (strcmp(q, "*") == 0) {
        c->mtid = -1;
    } else {
        c->mtid = bam_name2id(h, q);
        _parse_err(c->mtid < -1, "failed to parse header");
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    }
    // mpos
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->mpos < 0 && c->mtid >= 0) {
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
        c->mtid = -1;
    }
    // tlen
    c->isize = hts_str2int(p, &p, 63, &overflow);
    if (*p++ != '\t') goto err_ret;
    // Covers every numeric field parsed so far.
    _parse_err(overflow, "number outside allowed range");
    // seq
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
        c->l_qseq = p - q - 1;
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
        // Two bases are packed into each byte.
        i = (c->l_qseq + 1) >> 1;
        _get_mem(uint8_t, &t, b, i);

        // Note: this i shadows the int i declared above.
        unsigned int lqs2 = c->l_qseq&~1, i;
        for (i = 0; i < lqs2; i+=2)
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
        // A trailing odd base goes into the high nibble of the last byte.
        for (; i < c->l_qseq; ++i)
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
    } else c->l_qseq = 0;
    // qual
    _get_mem(uint8_t, &t, b, c->l_qseq);
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
        // No qualities given: fill with 0xff.
        memset(t, 0xff, c->l_qseq);
        p += 2;
    } else {
        int failed = 0;
        _parse_err(s->l - (p - s->s) < c->l_qseq
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
                   "SEQ and QUAL are of different length");
        // Stored qualities are the text characters minus 33.
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
        _parse_err(failed, "invalid QUAL character");
        p += c->l_qseq + 1;
    }

    // aux
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
        goto err_ret;

    if (bam_tag2cigar(b, 1, 1) < 0)
        return -2;
    return 0;

#undef _parse_warn
#undef _parse_err
#undef _get_mem
#undef _read_token
err_ret:
    return -2;
}
2838
2839
101k
// Count the operators in the CIGAR text at q, which ends at the first tab
// or NUL.  Every character that is not a digit counts as one operator.
//
// Returns the count.  Returns 0, after logging an error, when there are no
// operators at all or when there are 2147483647 or more of them.
static uint32_t read_ncigar(const char *q) {
    uint32_t n_ops = 0;
    const char *cp = q;

    while (*cp != '\0' && *cp != '\t') {
        if (!isdigit_c(*cp))
            n_ops++;
        cp++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }

    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
2854
2855
/*! @function
2856
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2857
 @param  in      [in]  pointer to the source string
2858
 @param  a_cigar [out]  address of the destination uint32_t buffer
2859
 @return         number of processed input characters; 0 on error
2860
 */
2861
101k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2862
101k
    int i, overflow = 0;
2863
101k
    const char *p = in;
2864
347k
    for (i = 0; i < n_cigar; i++) {
2865
246k
        uint32_t len;
2866
246k
        int op;
2867
246k
        char *q;
2868
246k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2869
246k
        if (q == p) {
2870
48
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2871
48
            return 0;
2872
48
        }
2873
246k
        if (overflow) {
2874
12
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2875
12
            return 0;
2876
12
        }
2877
246k
        p = q;
2878
246k
        op = bam_cigar_table[(unsigned char)*p++];
2879
246k
        if (op < 0) {
2880
97
            hts_log_error("Unrecognized CIGAR operator");
2881
97
            return 0;
2882
97
        }
2883
246k
        a_cigar[i] = len;
2884
246k
        a_cigar[i] |= op;
2885
246k
    }
2886
2887
100k
    return p-in;
2888
101k
}
2889
2890
0
// Parse the CIGAR text at "in" into the array *a_cigar, enlarging the
// array with realloc() and updating *a_mem when it has fewer than the
// required number of elements.
//
//   in       source text; a leading '*' means there is no CIGAR
//   end      if non-NULL, set to the first character after the parsed text
//   a_cigar  address of the destination array pointer
//   a_mem    address of the element count allocated for *a_cigar
//
// Returns the number of operations parsed.  Returns 0 for '*' and when
// read_ncigar() reports no operations.  Returns -1 for NULL arguments, a
// failed allocation, or text that parse_cigar() rejects.
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (in == NULL || a_cigar == NULL || a_mem == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }

    if (end) *end = (char *)in;

    if (*in == '*') {
        if (end) *end = (char *)in + 1;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0)
        return 0;

    if (*a_mem < n_ops) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (grown == NULL) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int consumed = parse_cigar(in, *a_cigar, n_ops);
    if (consumed == 0)
        return -1;

    if (end) *end = (char *)in+consumed;

    return n_ops;
}
2922
2923
101k
// Parse the CIGAR text at "in" and store it as the CIGAR of record b.
// If b already has data after its CIGAR, that data is moved so that it
// follows the new CIGAR.
//
//   in   source text; a leading '*' means there is no CIGAR
//   end  if non-NULL, set to the first character after the parsed text
//   b    record to update
//
// Returns the number of operations now stored in b; -1 for NULL
// arguments, a failed allocation, or text that parse_cigar() rejects.
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
    if (in == NULL || b == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }
    if (end) *end = (char *)in;

    size_t n_ops = 0;
    if (*in != '*')
        n_ops = read_ncigar(in);

    // Nothing to store and nothing to remove.
    if (n_ops == 0 && b->core.n_cigar == 0) {
        if (end) *end = (char *)in+1;
        return 0;
    }

    // Change in the number of operations; negative when the CIGAR shrinks.
    ssize_t delta = n_ops - b->core.n_cigar;
    if (delta > 0 &&
        possibly_expand_bam_data(b, delta * sizeof(uint32_t)) < 0) {
        hts_log_error("Memory allocation error");
        return -1;
    }

    uint32_t *cig = bam_get_cigar(b);
    uint8_t *data_end = b->data + b->l_data;
    if ((uint8_t *)cig != data_end) {
        // Modifying an existing BAM record
        uint8_t *seq = bam_get_seq(b);
        memmove(cig + n_ops, seq, data_end - seq);
    }

    int consumed = 1; // handle "*"
    if (n_ops) {
        consumed = parse_cigar(in, cig, n_ops);
        if (!consumed)
            return -1;
    }

    b->l_data += delta * sizeof(uint32_t);
    b->core.n_cigar = n_ops;
    if (end) *end = (char *)in + consumed;

    return n_ops;
}
2966
2967
/*
2968
 * -----------------------------------------------------------------------------
2969
 * SAM threading
2970
 */
2971
// Size of SAM text block (reading)
2972
0
#define SAM_NBYTES 240000
2973
2974
// Number of BAM records (writing, up to NB_mem in size)
2975
0
#define SAM_NBAM 1000
2976
2977
struct SAM_state;
2978
2979
// Output job - a block of BAM records
typedef struct sp_bams {
    struct sp_bams *next; // link used while the block sits on a list
    int serial;           // sam_parse_worker copies this from the sp_lines job

    bam1_t *bams;
    int nbams, abams; // used and alloc for bams[] array
    size_t bam_mem;   // very approximate total size

    struct SAM_state *fd; // owning state
} sp_bams;
2990
2991
// Input job - a block of SAM text
typedef struct sp_lines {
    struct sp_lines *next; // link used while the block sits on a list;
                           // must be NULL when handed to a worker
    int serial;

    char *data;     // the text
    int data_size;  // number of bytes of text in data
    int alloc;      // size data was allocated with (sam_dispatcher_read
                    // allocates 8 bytes more than this)

    struct SAM_state *fd; // owning state
    sp_bams *bams;        // freed together with this struct by cleanup_sp_lines
} sp_lines;
3003
3004
// Commands held in SAM_state.command.
enum sam_cmd {
    SAM_NONE = 0,
    SAM_CLOSE,      // set by sam_state_destroy; makes sam_dispatcher_read
                    // shut down its queue and tidy up
    SAM_CLOSE_DONE, // sam_state_destroy polls for this when closing a reader
                    // NOTE(review): presumably set by the dispatcher once it
                    // has finished - confirm where it is assigned
    SAM_AT_EOF,     // NOTE(review): usage not visible here - confirm
};
3010
3011
// Per-file state for threaded SAM reading / writing, stored in fp->state.
typedef struct SAM_state {
    sam_hdr_t *h;            // header passed to sam_parse1 by the workers

    hts_tpool *p;            // thread pool
    int own_pool;            // non-zero if sam_state_destroy may destroy p
    pthread_mutex_t lines_m; // held while touching the lines and bams lists
    hts_tpool_process *q;    // process queue jobs are dispatched to
    pthread_t dispatcher;    // dispatcher thread, valid if dispatcher_set
    int dispatcher_set;

    sp_lines *lines;         // text buffers available for reuse
    sp_bams *bams;           // BAM record blocks available for reuse

    sp_bams *curr_bam;       // block currently in use; when writing, the
                             // last partial block is dispatched on close
    int curr_idx;
    int serial;

    // Be warned: moving these mutexes around in this struct can reduce
    // threading performance by up to 70%!
    pthread_mutex_t command_m; // guards command and errcode
    pthread_cond_t command_c;
    enum sam_cmd command;

    // One of the E* errno codes
    int errcode;

    htsFile *fp;             // file this state belongs to
} SAM_state;
3039
3040
// Returns a SAM_state struct from a generic hFILE.
3041
//
3042
// Returns NULL on failure.
3043
0
static SAM_state *sam_state_create(htsFile *fp) {
3044
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3045
    // be a redirect call with an additional 'S' mode.  This in turn would
3046
    // correctly set the designed format to sam instead of a generic
3047
    // text_format.
3048
0
    if (fp->format.format != sam && fp->format.format != text_format)
3049
0
        return NULL;
3050
3051
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3052
0
    if (!fd)
3053
0
        return NULL;
3054
3055
0
    fp->state = fd;
3056
0
    fd->fp = fp;
3057
3058
0
    return fd;
3059
0
}
3060
3061
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3062
static void *sam_format_worker(void *arg);
3063
3064
0
// Store errcode in the state unless an error has been stored already, so
// the first error reported is the one that is kept.  Takes command_m while
// doing so.
static void sam_state_err(SAM_state *fd, int errcode) {
    pthread_mutex_lock(&fd->command_m);
    if (fd->errcode == 0) {
        fd->errcode = errcode;
    }
    pthread_mutex_unlock(&fd->command_m);
}
3070
3071
0
// Free a block of BAM records: the data buffer of each of the abams slots,
// then the bams[] array, then the sp_bams struct.  NULL is accepted and
// ignored.
static void sam_free_sp_bams(sp_bams *b) {
    if (b == NULL)
        return;

    bam1_t *recs = b->bams;
    if (recs != NULL) {
        int k;
        for (k = 0; k < b->abams; k++)
            free(recs[k].data); // free(NULL) does nothing
        free(recs);
    }

    free(b);
}
3085
3086
// Destroys the state produce by sam_state_create.
3087
4.87k
int sam_state_destroy(htsFile *fp) {
3088
4.87k
    int ret = 0;
3089
3090
4.87k
    if (!fp->state)
3091
4.87k
        return 0;
3092
3093
0
    SAM_state *fd = fp->state;
3094
0
    if (fd->p) {
3095
0
        if (fd->h) {
3096
            // Notify sam_dispatcher we're closing
3097
0
            pthread_mutex_lock(&fd->command_m);
3098
0
            if (fd->command != SAM_CLOSE_DONE)
3099
0
                fd->command = SAM_CLOSE;
3100
0
            pthread_cond_signal(&fd->command_c);
3101
0
            ret = -fd->errcode;
3102
0
            if (fd->q)
3103
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3104
3105
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3106
0
                for (;;) {
3107
                    // Avoid deadlocks with dispatcher
3108
0
                    if (fd->command == SAM_CLOSE_DONE)
3109
0
                        break;
3110
0
                    hts_tpool_wake_dispatch(fd->q);
3111
0
                    pthread_mutex_unlock(&fd->command_m);
3112
0
                    hts_usleep(10000);
3113
0
                    pthread_mutex_lock(&fd->command_m);
3114
0
                }
3115
0
            }
3116
0
            pthread_mutex_unlock(&fd->command_m);
3117
3118
0
            if (fp->is_write) {
3119
                // Dispatch the last partial block.
3120
0
                sp_bams *gb = fd->curr_bam;
3121
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3122
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3123
3124
                // Flush and drain output
3125
0
                if (fd->q)
3126
0
                    hts_tpool_process_flush(fd->q);
3127
0
                pthread_mutex_lock(&fd->command_m);
3128
0
                if (!ret) ret = -fd->errcode;
3129
0
                pthread_mutex_unlock(&fd->command_m);
3130
3131
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3132
0
                    hts_usleep(10000);
3133
0
                    pthread_mutex_lock(&fd->command_m);
3134
0
                    ret = -fd->errcode;
3135
                    // not empty but shutdown implies error
3136
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3137
0
                        ret = EIO;
3138
0
                    pthread_mutex_unlock(&fd->command_m);
3139
0
                }
3140
0
                if (fd->q)
3141
0
                    hts_tpool_process_shutdown(fd->q);
3142
0
            }
3143
3144
            // Wait for it to acknowledge
3145
0
            if (fd->dispatcher_set)
3146
0
                pthread_join(fd->dispatcher, NULL);
3147
0
            if (!ret) ret = -fd->errcode;
3148
0
        }
3149
3150
        // Tidy up memory
3151
0
        if (fd->q)
3152
0
            hts_tpool_process_destroy(fd->q);
3153
3154
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3155
0
            hts_tpool_destroy(fd->p);
3156
0
            fd->p = NULL;
3157
0
        }
3158
0
        pthread_mutex_destroy(&fd->lines_m);
3159
0
        pthread_mutex_destroy(&fd->command_m);
3160
0
        pthread_cond_destroy(&fd->command_c);
3161
3162
0
        sp_lines *l = fd->lines;
3163
0
        while (l) {
3164
0
            sp_lines *n = l->next;
3165
0
            free(l->data);
3166
0
            free(l);
3167
0
            l = n;
3168
0
        }
3169
3170
0
        sp_bams *b = fd->bams;
3171
0
        while (b) {
3172
0
            if (fd->curr_bam == b)
3173
0
                fd->curr_bam = NULL;
3174
0
            sp_bams *n = b->next;
3175
0
            sam_free_sp_bams(b);
3176
0
            b = n;
3177
0
        }
3178
3179
0
        if (fd->curr_bam)
3180
0
            sam_free_sp_bams(fd->curr_bam);
3181
3182
        // Decrement counter by one, maybe destroying too.
3183
        // This is to permit the caller using bam_hdr_destroy
3184
        // before sam_close without triggering decode errors
3185
        // in the background threads.
3186
0
        bam_hdr_destroy(fd->h);
3187
0
    }
3188
3189
0
    free(fp->state);
3190
0
    fp->state = NULL;
3191
0
    return ret;
3192
4.87k
}
3193
3194
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3195
0
static void cleanup_sp_lines(void *arg) {
3196
0
    sp_lines *gl = (sp_lines *)arg;
3197
0
    if (!gl) return;
3198
3199
    // Should always be true for lines passed to / from thread workers.
3200
0
    assert(gl->next == NULL);
3201
3202
0
    free(gl->data);
3203
0
    sam_free_sp_bams(gl->bams);
3204
0
    free(gl);
3205
0
}
3206
3207
// Run from one of the worker threads.
3208
// Convert a passed in array of lines to array of BAMs, returning
3209
// the result back to the thread queue.
3210
0
static void *sam_parse_worker(void *arg) {
3211
0
    sp_lines *gl = (sp_lines *)arg;
3212
0
    sp_bams *gb = NULL;
3213
0
    char *lines = gl->data;
3214
0
    int i;
3215
0
    bam1_t *b;
3216
0
    SAM_state *fd = gl->fd;
3217
3218
    // Use a block of BAM structs we had earlier if available.
3219
0
    pthread_mutex_lock(&fd->lines_m);
3220
0
    if (fd->bams) {
3221
0
        gb = fd->bams;
3222
0
        fd->bams = gb->next;
3223
0
    }
3224
0
    pthread_mutex_unlock(&fd->lines_m);
3225
3226
0
    if (gb == NULL) {
3227
0
        gb = calloc(1, sizeof(*gb));
3228
0
        if (!gb) {
3229
0
            return NULL;
3230
0
        }
3231
0
        gb->abams = 100;
3232
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3233
0
        if (!gb->bams) {
3234
0
            sam_state_err(fd, ENOMEM);
3235
0
            goto err;
3236
0
        }
3237
0
        gb->nbams = 0;
3238
0
        gb->bam_mem = 0;
3239
0
    }
3240
0
    gb->serial = gl->serial;
3241
0
    gb->next = NULL;
3242
3243
0
    b = (bam1_t *)gb->bams;
3244
0
    if (!b) {
3245
0
        sam_state_err(fd, ENOMEM);
3246
0
        goto err;
3247
0
    }
3248
3249
0
    i = 0;
3250
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3251
0
    while (cp < cp_end) {
3252
0
        if (i >= gb->abams) {
3253
0
            int old_abams = gb->abams;
3254
0
            gb->abams *= 2;
3255
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3256
0
            if (!b) {
3257
0
                gb->abams /= 2;
3258
0
                sam_state_err(fd, ENOMEM);
3259
0
                goto err;
3260
0
            }
3261
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3262
0
            gb->bams = b;
3263
0
        }
3264
3265
        // Ideally we'd get sam_parse1 to return the number of
3266
        // bytes decoded and to be able to stop on newline as
3267
        // well as \0.
3268
        //
3269
        // We can then avoid the additional strchr loop.
3270
        // It's around 6% of our CPU cost, albeit threadable.
3271
        //
3272
        // However this is an API change so for now we copy.
3273
3274
0
        char *nl = strchr(cp, '\n');
3275
0
        char *line_end;
3276
0
        if (nl) {
3277
0
            line_end = nl;
3278
0
            if (line_end > cp && *(line_end - 1) == '\r')
3279
0
                line_end--;
3280
0
            nl++;
3281
0
        } else {
3282
0
            nl = line_end = cp_end;
3283
0
        }
3284
0
        *line_end = '\0';
3285
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3286
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3287
0
            sam_state_err(fd, errno ? errno : EIO);
3288
0
            cleanup_sp_lines(gl);
3289
0
            goto err;
3290
0
        }
3291
3292
0
        cp = nl;
3293
0
        i++;
3294
0
    }
3295
0
    gb->nbams = i;
3296
3297
0
    pthread_mutex_lock(&fd->lines_m);
3298
0
    gl->next = fd->lines;
3299
0
    fd->lines = gl;
3300
0
    pthread_mutex_unlock(&fd->lines_m);
3301
0
    return gb;
3302
3303
0
 err:
3304
0
    sam_free_sp_bams(gb);
3305
0
    return NULL;
3306
0
}
3307
3308
0
// Ignores its argument and always returns NULL.
// NOTE(review): presumably dispatched as a job to mark end of input -
// confirm at the call site.
static void *sam_parse_eof(void *arg) {
    (void) arg;
    return NULL;
}
3311
3312
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3313
0
static void cleanup_sp_bams(void *arg) {
3314
0
    sam_free_sp_bams((sp_bams *) arg);
3315
0
}
3316
3317
// Runs in its own thread.
3318
// Reads a block of text (SAM) and sends a new job to the thread queue to
3319
// translate this to BAM.
3320
0
static void *sam_dispatcher_read(void *vp) {
3321
0
    htsFile *fp = vp;
3322
0
    kstring_t line = {0};
3323
0
    int line_frag = 0;
3324
0
    SAM_state *fd = fp->state;
3325
0
    sp_lines *l = NULL;
3326
3327
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3328
    // matter as it will grow if necessary).
3329
0
    if (ks_resize(&line, 1000) < 0)
3330
0
        goto err;
3331
3332
0
    for (;;) {
3333
        // Check for command
3334
0
        pthread_mutex_lock(&fd->command_m);
3335
0
        switch (fd->command) {
3336
3337
0
        case SAM_CLOSE:
3338
0
            pthread_cond_signal(&fd->command_c);
3339
0
            pthread_mutex_unlock(&fd->command_m);
3340
0
            hts_tpool_process_shutdown(fd->q);
3341
0
            goto tidyup;
3342
3343
0
        default:
3344
0
            break;
3345
0
        }
3346
0
        pthread_mutex_unlock(&fd->command_m);
3347
3348
0
        pthread_mutex_lock(&fd->lines_m);
3349
0
        if (fd->lines) {
3350
            // reuse existing line buffer
3351
0
            l = fd->lines;
3352
0
            fd->lines = l->next;
3353
0
        }
3354
0
        pthread_mutex_unlock(&fd->lines_m);
3355
3356
0
        if (l == NULL) {
3357
            // none to reuse, to create a new one
3358
0
            l = calloc(1, sizeof(*l));
3359
0
            if (!l)
3360
0
                goto err;
3361
0
            l->alloc = SAM_NBYTES;
3362
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3363
0
            if (!l->data) {
3364
0
                free(l);
3365
0
                l = NULL;
3366
0
                goto err;
3367
0
            }
3368
0
            l->fd = fd;
3369
0
        }
3370
0
        l->next = NULL;
3371
3372
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3373
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3374
0
            if (!rp)
3375
0
                goto err;
3376
0
            l->alloc = line_frag+SAM_NBYTES/2;
3377
0
            l->data = rp;
3378
0
        }
3379
0
        memcpy(l->data, line.s, line_frag);
3380
3381
0
        l->data_size = line_frag;
3382
0
        ssize_t nbytes;
3383
0
    longer_line:
3384
0
        if (fp->is_bgzf)
3385
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3386
0
        else
3387
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3388
0
        if (nbytes < 0) {
3389
0
            sam_state_err(fd, errno ? errno : EIO);
3390
0
            goto err;
3391
0
        } else if (nbytes == 0)
3392
0
            break; // EOF
3393
0
        l->data_size += nbytes;
3394
3395
        // trim to last \n. Maybe \r\n, but that's still fine
3396
0
        if (nbytes == l->alloc - line_frag) {
3397
0
            char *cp_end = l->data + l->data_size;
3398
0
            char *cp = cp_end-1;
3399
3400
0
            while (cp > (char *)l->data && *cp != '\n')
3401
0
                cp--;
3402
3403
            // entire buffer is part of a single line
3404
0
            if (cp == l->data) {
3405
0
                line_frag = l->data_size;
3406
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3407
0
                if (!rp)
3408
0
                    goto err;
3409
0
                l->alloc *= 2;
3410
0
                l->data = rp;
3411
0
                assert(l->alloc >= l->data_size);
3412
0
                assert(l->alloc >= line_frag);
3413
0
                assert(l->alloc >= l->alloc - line_frag);
3414
0
                goto longer_line;
3415
0
            }
3416
0
            cp++;
3417
3418
            // line holds the remainder of our line.
3419
0
            if (ks_resize(&line, cp_end - cp) < 0)
3420
0
                goto err;
3421
0
            memcpy(line.s, cp, cp_end - cp);
3422
0
            line_frag = cp_end - cp;
3423
0
            l->data_size = l->alloc - line_frag;
3424
0
        } else {
3425
            // out of buffer
3426
0
            line_frag = 0;
3427
0
        }
3428
3429
0
        l->serial = fd->serial++;
3430
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3431
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3432
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3433
0
            goto err;
3434
0
        pthread_mutex_lock(&fd->command_m);
3435
0
        if (fd->command == SAM_CLOSE) {
3436
0
            pthread_mutex_unlock(&fd->command_m);
3437
0
            l = NULL;
3438
0
            goto tidyup;
3439
0
        }
3440
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3441
0
        pthread_mutex_unlock(&fd->command_m);
3442
0
    }
3443
3444
    // Submit a NULL sp_bams entry to act as an EOF marker
3445
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3446
0
        goto err;
3447
3448
    // At EOF, wait for close request.
3449
    // (In future if we add support for seek, this is where we need to catch it.)
3450
0
    for (;;) {
3451
0
        pthread_mutex_lock(&fd->command_m);
3452
0
        if (fd->command == SAM_NONE)
3453
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3454
0
        switch (fd->command) {
3455
0
        case SAM_CLOSE:
3456
0
            pthread_cond_signal(&fd->command_c);
3457
0
            pthread_mutex_unlock(&fd->command_m);
3458
0
            hts_tpool_process_shutdown(fd->q);
3459
0
            goto tidyup;
3460
3461
0
        default:
3462
0
            pthread_mutex_unlock(&fd->command_m);
3463
0
            break;
3464
0
        }
3465
0
    }
3466
3467
0
 tidyup:
3468
0
    pthread_mutex_lock(&fd->command_m);
3469
0
    fd->command = SAM_CLOSE_DONE;
3470
0
    pthread_cond_signal(&fd->command_c);
3471
0
    pthread_mutex_unlock(&fd->command_m);
3472
3473
0
    if (l) {
3474
0
        pthread_mutex_lock(&fd->lines_m);
3475
0
        l->next = fd->lines;
3476
0
        fd->lines = l;
3477
0
        pthread_mutex_unlock(&fd->lines_m);
3478
0
    }
3479
0
    free(line.s);
3480
3481
0
    return NULL;
3482
3483
0
 err:
3484
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3485
0
    hts_tpool_process_shutdown(fd->q);
3486
0
    goto tidyup;
3487
0
}
3488
3489
// Runs in its own thread.
3490
// Takes encoded blocks of SAM off the thread results queue and writes them
3491
// to our output stream.
3492
0
static void *sam_dispatcher_write(void *vp) {
3493
0
    htsFile *fp = vp;
3494
0
    SAM_state *fd = fp->state;
3495
0
    hts_tpool_result *r;
3496
3497
    // Iterates until result queue is shutdown, where it returns NULL.
3498
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3499
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3500
0
        if (!gl) {
3501
0
            sam_state_err(fd, ENOMEM);
3502
0
            goto err;
3503
0
        }
3504
3505
0
        if (fp->idx) {
3506
0
            sp_bams *gb = gl->bams;
3507
0
            int i = 0, count = 0;
3508
0
            while (i < gl->data_size) {
3509
0
                int j = i;
3510
0
                while (i < gl->data_size && gl->data[i] != '\n')
3511
0
                    i++;
3512
0
                if (i < gl->data_size)
3513
0
                    i++;
3514
3515
0
                if (fp->is_bgzf) {
3516
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3517
0
                        goto err;
3518
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3519
0
                        goto err;
3520
0
                } else {
3521
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3522
0
                        goto err;
3523
0
                }
3524
3525
0
                bam1_t *b = &gb->bams[count++];
3526
0
                if (fp->format.compression == bgzf) {
3527
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3528
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3529
0
                                      bgzf_tell(fp->fp.bgzf),
3530
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3531
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3532
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3533
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3534
0
                        goto err;
3535
0
                    }
3536
0
                } else {
3537
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3538
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3539
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3540
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3541
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3542
0
                        goto err;
3543
0
                    }
3544
0
                }
3545
0
            }
3546
3547
0
            assert(count == gb->nbams);
3548
3549
            // Add bam array to free-list
3550
0
            pthread_mutex_lock(&fd->lines_m);
3551
0
            gb->next = fd->bams;
3552
0
            fd->bams = gl->bams;
3553
0
            gl->bams = NULL;
3554
0
            pthread_mutex_unlock(&fd->lines_m);
3555
0
        } else {
3556
0
            if (fp->is_bgzf) {
3557
                // We keep track of how much in the current block we have
3558
                // remaining => R.  We look for the last newline in input
3559
                // [i] to [i+R], backwards => position N.
3560
                //
3561
                // If we find a newline, we write out bytes i to N.
3562
                // We know we cannot fit the next record in this bgzf block,
3563
                // so we flush what we have and copy input N to i+R into
3564
                // the start of a new block, and recompute a new R for that.
3565
                //
3566
                // If we don't find a newline (i==N) then we cannot extend
3567
                // the current block at all, so flush whatever is in it now
3568
                // if it ends on a newline.
3569
                // We still copy i(==N) to i+R to the next block and
3570
                // continue as before with a new R.
3571
                //
3572
                // The only exception on the flush is when we run out of
3573
                // data in the input.  In that case we skip it as we don't
3574
                // yet know if the next record will fit.
3575
                //
3576
                // Both conditions share the same code here:
3577
                // - Look for newline (pos N)
3578
                // - Write i to N (which maybe 0)
3579
                // - Flush if block ends on newline and not end of input
3580
                // - write N to i+R
3581
3582
0
                int i = 0;
3583
0
                BGZF *fb = fp->fp.bgzf;
3584
0
                while (i < gl->data_size) {
3585
                    // remaining space in block
3586
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3587
0
                    int eod = 0;
3588
0
                    if (R > gl->data_size-i)
3589
0
                        R = gl->data_size-i, eod = 1;
3590
3591
                    // Find last newline in input data
3592
0
                    int N = i + R;
3593
0
                    while (--N > i) {
3594
0
                        if (gl->data[N] == '\n')
3595
0
                            break;
3596
0
                    }
3597
3598
0
                    if (N != i) {
3599
                        // Found a newline
3600
0
                        N++;
3601
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3602
0
                            goto err;
3603
0
                    }
3604
3605
                    // Flush bgzf block
3606
0
                    int b_off = fb->block_offset;
3607
0
                    if (!eod && b_off &&
3608
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3609
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3610
0
                            goto err;
3611
3612
                    // Copy from N onwards into next block
3613
0
                    if (i+R > N)
3614
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3615
0
                            != i+R - N)
3616
0
                            goto err;
3617
3618
0
                    i = i+R;
3619
0
                }
3620
0
            } else {
3621
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3622
0
                    goto err;
3623
0
            }
3624
0
        }
3625
3626
0
        hts_tpool_delete_result(r, 0);
3627
3628
        // Also updated by main thread
3629
0
        pthread_mutex_lock(&fd->lines_m);
3630
0
        gl->next = fd->lines;
3631
0
        fd->lines = gl;
3632
0
        pthread_mutex_unlock(&fd->lines_m);
3633
0
    }
3634
3635
0
    sam_state_err(fd, 0); // success
3636
0
    hts_tpool_process_shutdown(fd->q);
3637
0
    return NULL;
3638
3639
0
 err:
3640
0
    sam_state_err(fd, errno ? errno : EIO);
3641
0
    return (void *)-1;
3642
0
}
3643
3644
// Run from one of the worker threads.
3645
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3646
// of text SAM records (sp_lines).
3647
0
static void *sam_format_worker(void *arg) {
3648
0
    sp_bams *gb = (sp_bams *)arg;
3649
0
    sp_lines *gl = NULL;
3650
0
    int i;
3651
0
    SAM_state *fd = gb->fd;
3652
0
    htsFile *fp = fd->fp;
3653
3654
    // Use a block of SAM strings we had earlier if available.
3655
0
    pthread_mutex_lock(&fd->lines_m);
3656
0
    if (fd->lines) {
3657
0
        gl = fd->lines;
3658
0
        fd->lines = gl->next;
3659
0
    }
3660
0
    pthread_mutex_unlock(&fd->lines_m);
3661
3662
0
    if (gl == NULL) {
3663
0
        gl = calloc(1, sizeof(*gl));
3664
0
        if (!gl) {
3665
0
            sam_state_err(fd, ENOMEM);
3666
0
            return NULL;
3667
0
        }
3668
0
        gl->alloc = gl->data_size = 0;
3669
0
        gl->data = NULL;
3670
0
    }
3671
0
    gl->serial = gb->serial;
3672
0
    gl->next = NULL;
3673
3674
0
    kstring_t ks = {0, gl->alloc, gl->data};
3675
3676
0
    for (i = 0; i < gb->nbams; i++) {
3677
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3678
0
            sam_state_err(fd, errno ? errno : EIO);
3679
0
            goto err;
3680
0
        }
3681
0
        kputc('\n', &ks);
3682
0
    }
3683
3684
0
    pthread_mutex_lock(&fd->lines_m);
3685
0
    gl->data_size = ks.l;
3686
0
    gl->alloc = ks.m;
3687
0
    gl->data = ks.s;
3688
3689
0
    if (fp->idx) {
3690
        // Keep hold of the bam array a little longer as
3691
        // sam_dispatcher_write needs to use them for building the index.
3692
0
        gl->bams = gb;
3693
0
    } else {
3694
        // Add bam array to free-list
3695
0
        gb->next = fd->bams;
3696
0
        fd->bams = gb;
3697
0
    }
3698
0
    pthread_mutex_unlock(&fd->lines_m);
3699
3700
0
    return gl;
3701
3702
0
 err:
3703
    // Possible race between this and fd->curr_bam.
3704
    // Easier to not free and leave it on the input list so it
3705
    // gets freed there instead?
3706
    // sam_free_sp_bams(gb);
3707
0
    if (gl) {
3708
0
        free(gl->data);
3709
0
        free(gl);
3710
0
    }
3711
0
    return NULL;
3712
0
}
3713
3714
0
// Attaches an existing thread pool to a SAM file handle.
//
// Creates the per-file SAM threading state, initialises its mutexes and
// condition variable, and creates a process queue on the pool.  A queue size
// of 0 in "p" selects twice the pool's thread count.  For BGZF compressed
// files the pool is also passed on to the BGZF layer.
//
// Returns 0 on success (or the bgzf_thread_pool() result for BGZF files),
//        -1 on failure,
//        -2 if the file already has a state attached.
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
    if (fp->state)
        return -2;   //already exists!

    fp->state = sam_state_create(fp);
    if (!fp->state)
        return -1;

    SAM_state *fd = (SAM_state *)fp->state;

    pthread_mutex_init(&fd->lines_m, NULL);
    pthread_mutex_init(&fd->command_m, NULL);
    pthread_cond_init(&fd->command_c, NULL);
    fd->p = p->pool;

    // Default the queue size to two jobs per pool thread.
    int qsize = p->qsize ? p->qsize : 2*hts_tpool_size(fd->p);

    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
    if (!fd->q) {
        sam_state_destroy(fp);
        return -1;
    }

    if (fp->format.compression != bgzf)
        return 0;

    return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
}
3740
3741
0
// Creates a private thread pool of "nthreads" threads and attaches it to a
// SAM file handle.  The pool is marked as owned by the file's state.
//
// Returns 0 on success, or when nthreads <= 0 (nothing to do);
//        -1 if the pool could not be created;
//        otherwise the negative value returned by sam_set_thread_pool().
int sam_set_threads(htsFile *fp, int nthreads) {
    if (nthreads <= 0)
        return 0;

    htsThreadPool p;
    p.pool = hts_tpool_init(nthreads);
    if (!p.pool)
        return -1;   // don't hand a NULL pool to sam_set_thread_pool()
    p.qsize = nthreads*2;

    int ret = sam_set_thread_pool(fp, &p);
    if (ret < 0) {
        hts_tpool_destroy(p.pool);
        return ret;
    }

    SAM_state *fd = (SAM_state *)fp->state;
    fd->own_pool = 1;

    return 0;
}
3761
3762
0
#define UMI_TAGS 5
3763
typedef struct {
3764
    kstring_t name;
3765
    kstring_t comment; // NB: pointer into name, do not free
3766
    kstring_t seq;
3767
    kstring_t qual;
3768
    int casava;
3769
    int aux;
3770
    int rnum;
3771
    char BC[3];         // aux tag ID for barcode
3772
    char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
3773
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3774
    char nprefix;
3775
    int sra_names;
3776
    regex_t regex;
3777
} fastq_state;
3778
3779
// Initialise fastq state.
3780
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3781
1.00k
static fastq_state *fastq_state_init(int name_char) {
3782
1.00k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3783
1.00k
    if (!x)
3784
0
        return NULL;
3785
1.00k
    strcpy(x->BC, "BC");
3786
1.00k
    x->nprefix = name_char;
3787
    // Default Illumina naming convention
3788
1.00k
    char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3789
1.00k
    if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3790
0
        free(x);
3791
0
        return NULL;
3792
0
    }
3793
3794
1.00k
    return x;
3795
1.00k
}
3796
3797
1.34k
// Releases the fastq state attached to a file handle, if there is one.
// Afterwards fp->state is NULL, so calling this again is harmless.
void fastq_state_destroy(htsFile *fp) {
    if (fp->state) {
        fastq_state *x = (fastq_state *)fp->state;
        if (x->tags)
            kh_destroy(tag, x->tags);
        ks_free(&x->name);
        // x->comment points into x->name, so it must not be freed.
        ks_free(&x->seq);
        ks_free(&x->qual);
        regfree(&x->regex);
        free(fp->state);
        fp->state = NULL;  // don't leave a dangling pointer behind
    }
}
3809
3810
0
// Sets a FASTA / FASTQ parsing option on a file handle, creating the fastq
// state first if the file does not have one yet.
//
// Options taking no extra argument:
//   FASTQ_OPT_CASAVA     enable CASAVA comment parsing
//   FASTQ_OPT_NAME2      enable SRA-style names (name is the second word)
//   FASTQ_OPT_RNUM       set the rnum flag
// Options taking one "char *" argument:
//   FASTQ_OPT_AUX        comma separated list of two-character aux tags to
//                        accept; NULL or "1" accepts all tags
//   FASTQ_OPT_BARCODE    two-character aux tag ID to use for barcodes
//   FASTQ_OPT_UMI        comma separated list of up to UMI_TAGS tag IDs;
//                        NULL or "1" selects "RX", "" disables UMIs
//   FASTQ_OPT_UMI_REGEX  extended regular expression whose first bracketed
//                        sub-expression matches the UMI in the read name
// Any other option is ignored.
//
// Returns 0 on success, -1 on failure.  A malformed tag list only produces
// a warning and still returns 0.
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
    va_list args;

    if (!fp)
        return -1;
    if (!fp->state)
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
                                           ? '@' : '>')))
            return -1;

    fastq_state *x = (fastq_state *)fp->state;

    switch (opt) {
    case FASTQ_OPT_CASAVA:
        x->casava = 1;
        break;

    case FASTQ_OPT_NAME2:
        x->sra_names = 1;
        break;

    case FASTQ_OPT_AUX: {
        va_start(args, opt);
        x->aux = 1;
        char *tag = va_arg(args, char *);
        va_end(args);
        if (tag && strcmp(tag, "1") != 0) {
            if (!x->tags)
                if (!(x->tags = kh_init(tag)))
                    return -1;

            // Tags are two characters each, separated by single commas.
            size_t i, tlen = strlen(tag);
            for (i = 0; i+3 <= tlen+1; i += 3) {
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
                    break;
                }
                int ret, tcode = tag[i+0]*256 + tag[i+1];
                kh_put(tag, x->tags, tcode, &ret);
                if (ret < 0)
                    return -1;
            }
        }
        break;
    }

    case FASTQ_OPT_BARCODE: {
        va_start(args, opt);
        char *bc = va_arg(args, char *);
        va_end(args);
        if (!bc)
            return -1;  // no tag ID supplied
        strncpy(x->BC, bc, 2);
        x->BC[2] = 0;
        break;
    }

    case FASTQ_OPT_UMI: {
        // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
        va_start(args, opt);
        char *bc = va_arg(args, char *), *bc_orig = bc;
        va_end(args);
        if (!bc || strcmp(bc, "1") == 0)
            bc = "RX";
        int ntags = 0, err = 0;
        for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
            // Tag IDs are a letter followed by a letter or digit.
            if (!isalpha_c(bc[0]) || !isalnum_c(bc[1])) {
                err = 1;
                break;
            }

            strncpy(x->UMI[ntags], bc, 3);
            bc += 2;
            if (*bc && *bc != ',') {
                err = 1;
                break;
            }
            bc+=(*bc==',');
            x->UMI[ntags][2] = 0;
        }
        // Clear the unused slots, including a partly written bad entry.
        for (; ntags < UMI_TAGS; ntags++)
            x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;

        if (err)
            hts_log_warning("Bad UMI tag list '%s'", bc_orig);

        break;
    }

    case FASTQ_OPT_UMI_REGEX: {
        va_start(args, opt);
        char *re = va_arg(args, char *);
        va_end(args);

        // Compile into a temporary first, so that a rejected pattern leaves
        // the currently installed regex valid for later regexec() and
        // regfree() calls.
        regex_t new_regex;
        if (!re || regcomp(&new_regex, re, REG_EXTENDED) != 0) {
            hts_log_error("Regular expression '%s' is not supported",
                          re ? re : "(null)");
            return -1;
        }
        regfree(&x->regex);
        x->regex = new_regex;
        break;
    }

    case FASTQ_OPT_RNUM:
        x->rnum = 1;
        break;

    default:
        break;
    }
    return 0;
}
3921
3922
7.94M
// Reads one FASTA or FASTQ record from fp and stores it in b as an unmapped
// BAM record.
//
// Processing steps:
// - read the name line and split it into read name and comment;
// - accumulate sequence lines up to the '+' line (fastq) or the next '>'
//   line / end of file (fasta);
// - for fastq, read quality lines until they match the sequence length;
// - turn a trailing "/<digit>" on the name into READ1 / READ2 flags;
// - if UMI tags are configured, move the UMI matched by the state's regex
//   from the read name into the first UMI aux tag;
// - if enabled, parse a CASAVA style comment for read number, QC fail flag
//   and barcode;
// - if enabled, parse SAM style aux tags from the comment.
//
// Returns the bam_set1() result (non-negative) on success,
//         -1 at end of file,
//         a value less than -1 on error.
static int fastq_parse1(htsFile *fp, bam1_t *b) {
    fastq_state *x = (fastq_state *)fp->state;
    size_t i, l;
    int ret = 0;

    if (fp->format.format == fasta_format && fp->line.s) {
        // For FASTA we've already read the >name line; steal it
        // Not the most efficient, but we don't optimise for fasta reading.
        if (fp->line.l == 0)
            return -1; // EOF

        free(x->name.s);
        x->name = fp->line;
        fp->line.l = fp->line.m = 0;
        fp->line.s = NULL;
    } else {
        // Read a FASTQ format entry.
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
        if (ret == -1)
            return -1;  // EOF
        else if (ret < -1)
            return ret; // ERR
    }

    // Name
    if (*x->name.s != x->nprefix)
        return -2;

    // Reverse the SRA strangeness of putting the run_name.number before
    // the read name.
    i = 0;
    char *name = x->name.s+1;
    if (x->sra_names) {
        char *cp = strpbrk(x->name.s, " \t");
        if (cp) {
            while (*cp == ' ' || *cp == '\t')
                cp++;
            // Overwrite the last whitespace character so that the second
            // word looks like the start of a name line.
            *--cp = '@';
            i = cp - x->name.s;
            name = cp+1;
        }
    }

    // Terminate the name at the first whitespace
    l = x->name.l;
    char *s = x->name.s;
    while (i < l && !isspace_c(s[i]))
        i++;
    if (i < l) {
        s[i] = 0;
        x->name.l = i++;
    }

    // Comment; a kstring struct, but pointer into name line.  (Do not free)
    while (i < l && isspace_c(s[i]))
        i++;
    x->comment.s = s+i;
    x->comment.l = l - i;

    // Seq
    x->seq.l = 0;
    for (;;) {
        // EOF (-1) ends a fasta record, but is an error within fastq
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
            if (fp->format.format == fastq_format || ret < -1)
                return -2;
        if (ret == -1 ||
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
            break;
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
            return -2;
    }

    // Qual
    if (fp->format.format == fastq_format) {
        // Keep reading lines until we have as many values as bases
        size_t remainder = x->seq.l;
        x->qual.l = 0;
        do {
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
                return -2;
            if (fp->line.l > remainder)
                return -2;
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
                return -2;
            remainder -= fp->line.l;
        } while (remainder > 0);

        // Decr qual
        for (i = 0; i < x->qual.l; i++)
            x->qual.s[i] -= '!';
    }

    // A trailing /1 or /2 (or other digit) on the name gives the read number
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
    if (x->name.l > 2 &&
        x->name.s[x->name.l-2] == '/' &&
        isdigit_c(x->name.s[x->name.l-1])) {
        switch(x->name.s[x->name.l-1]) {
        case '1': flag |= BAM_FREAD1 | pflag; break;
        case '2': flag |= BAM_FREAD2 | pflag; break;
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }
        x->name.s[x->name.l-=2] = 0;
    }

    // Strip Illumina formatted UMI off read-name
    char UMI_seq[256]; // maximum length in spec
    size_t UMI_len = 0;
    if (x->UMI[0][0]) {
        regmatch_t match[3];
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
            && match[0].rm_so >= 0     // whole regex
            && match[1].rm_so >= 0) {  // bracketed UMI component
            UMI_len = match[1].rm_eo - match[1].rm_so;
            if (UMI_len > 255) {
                hts_log_error("SAM read name is too long");
                return -2;
            }

            // The SAMTags spec recommends (but not requires) separating
            // barcodes with hyphen ('-').
            size_t i;
            for (i = 0; i < UMI_len; i++)
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
                    ? x->name.s[i+match[1].rm_so]
                    : '-';

            // Move any trailing #num earlier in the name
            if (UMI_len) {
                UMI_seq[UMI_len++] = 0; // length now includes the NUL

                x->name.l = match[1].rm_so;
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
                    x->name.l--; // remove colon too
                char *cp = x->name.s + match[1].rm_eo;
                while (*cp)
                    x->name.s[x->name.l++] = *cp++;
                x->name.s[x->name.l] = 0;
            }
        }
    }

    // Convert to BAM
    ret = bam_set1(b,
                   x->name.s + x->name.l - name, name,
                   flag,
                   -1, -1, 0, // ref '*', pos, mapq,
                   0, NULL,     // no cigar,
                   -1, -1, 0,    // mate
                   x->seq.l, x->seq.s, x->qual.s,
                   0);
    if (ret < 0) return -2;

    // Add UMI tag if removed from read-name above
    if (UMI_len) {
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
            ret = -2;
    }

    // Identify Illumina CASAVA strings.
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
    char *barcode = NULL;
    int barcode_len = 0;
    kstring_t *kc = &x->comment;
    char *endptr;
    if (x->casava &&
        // \d:[YN]:\d+:[ACGTN]+
        // Both separators must be colons.  OR-ing the two characters
        // together would also accept other pairs that combine to ':'.
        kc->l > 6 && kc->s[1] == ':' && kc->s[3] == ':' &&
        isdigit_c(kc->s[0]) &&
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
        && *endptr == ':') {

        // read num
        switch(kc->s[0]) {
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }

        if (kc->s[2] == 'Y')
            b->core.flag |= BAM_FQCFAIL;

        // Barcode, maybe numeric in which case we skip it
        if (!isdigit_c(endptr[1])) {
            barcode = endptr+1;
            for (i = barcode - kc->s; i < kc->l; i++)
                if (isspace_c(kc->s[i]))
                    break;

            kc->s[i] = 0;
            barcode_len = i+1-(barcode - kc->s); // includes the NUL
        }
    }

    if (ret >= 0 && barcode_len)
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
            ret = -2;

    if (!x->aux)
        return ret;

    // Identify any SAM style aux tags in comments too.
    // NOTE(review): the start offset is barcode_len from the start of the
    // comment rather than the end of the barcode; confirm against
    // aux_parse() that this is intended.
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
        ret = -2;

    return ret;
}
4125
4126
// Internal component of sam_read1 below
4127
193
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4128
193
    int ret = bam_read1(fp->fp.bgzf, b);
4129
193
    if (h && ret >= 0) {
4130
167
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4131
161
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4132
6
            errno = ERANGE;
4133
6
            return -3;
4134
6
        }
4135
167
    }
4136
187
    return ret;
4137
193
}
4138
4139
// Internal component of sam_read1 below
4140
701
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4141
701
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4142
701
    if (ret < 0)
4143
701
        return cram_eof(fp->fp.cram) ? -1 : -2;
4144
4145
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4146
0
        return -2;
4147
4148
0
    return ret;
4149
0
}
4150
4151
// Internal component of sam_read1 below
4152
119k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4153
119k
    int ret;
4154
4155
    // Consume 1st line after header parsing as it wasn't using peek
4156
119k
    if (fp->line.l != 0) {
4157
0
        ret = sam_parse1(&fp->line, h, b);
4158
0
        fp->line.l = 0;
4159
0
        return ret;
4160
0
    }
4161
4162
119k
    if (fp->state) {
4163
0
        SAM_state *fd = (SAM_state *)fp->state;
4164
4165
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4166
            // We don't support multi-threaded SAM parsing with seeks yet.
4167
0
            int ret;
4168
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4169
0
                errno = -ret;
4170
0
                return -2;
4171
0
            }
4172
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4173
0
                return -2;
4174
0
            fp->fp.bgzf->seeked = 0;
4175
0
            goto err_recover;
4176
0
        }
4177
4178
0
        if (!fd->h) {
4179
0
            fd->h = h;
4180
0
            fd->h->ref_count++;
4181
            // Ensure hrecs is initialised now as we don't want multiple
4182
            // threads trying to do this simultaneously.
4183
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4184
0
                return -2;
4185
4186
            // We can only do this once we've got a header
4187
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4188
0
                               fp) != 0)
4189
0
                return -2;
4190
0
            fd->dispatcher_set = 1;
4191
0
        }
4192
4193
0
        if (fd->h != h) {
4194
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4195
0
            return -2;
4196
0
        }
4197
4198
0
        sp_bams *gb = fd->curr_bam;
4199
0
        if (!gb) {
4200
0
            if (fd->errcode) {
4201
                // In case reader failed
4202
0
                errno = fd->errcode;
4203
0
                return -2;
4204
0
            }
4205
4206
0
            pthread_mutex_lock(&fd->command_m);
4207
0
            int cmd = fd->command;
4208
0
            pthread_mutex_unlock(&fd->command_m);
4209
0
            if (cmd == SAM_AT_EOF)
4210
0
                return -1;
4211
4212
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4213
0
            if (!r)
4214
0
                return -2;
4215
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4216
0
            hts_tpool_delete_result(r, 0);
4217
0
        }
4218
0
        if (!gb) {
4219
0
            pthread_mutex_lock(&fd->command_m);
4220
0
            fd->command = SAM_AT_EOF;
4221
0
            pthread_mutex_unlock(&fd->command_m);
4222
0
            return fd->errcode ? -2 : -1;
4223
0
        }
4224
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4225
0
        if (fd->curr_idx < gb->nbams)
4226
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4227
0
                return -2;
4228
0
        if (fd->curr_idx == gb->nbams) {
4229
0
            pthread_mutex_lock(&fd->lines_m);
4230
0
            gb->next = fd->bams;
4231
0
            fd->bams = gb;
4232
0
            pthread_mutex_unlock(&fd->lines_m);
4233
4234
0
            fd->curr_bam = NULL;
4235
0
            fd->curr_idx = 0;
4236
        // Consider prefetching next record?  I.e.
4237
        // } else {
4238
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4239
0
        }
4240
4241
0
        ret = 0;
4242
4243
119k
    } else  {
4244
119k
    err_recover:
4245
119k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4246
119k
        if (ret < 0) return ret;
4247
4248
117k
        ret = sam_parse1(&fp->line, h, b);
4249
117k
        fp->line.l = 0;
4250
117k
        if (ret < 0) {
4251
1.14k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4252
1.14k
            if (h && h->ignore_sam_err) goto err_recover;
4253
1.14k
        }
4254
117k
    }
4255
4256
117k
    return ret;
4257
119k
}
4258
4259
// Returns 0 on success,
4260
//        -1 on EOF,
4261
//       <-1 on error
4262
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4263
8.06M
{
4264
8.06M
    int ret, pass_filter;
4265
4266
8.06M
    do {
4267
8.06M
        switch (fp->format.format) {
4268
193
        case bam:
4269
193
            ret = sam_read1_bam(fp, h, b);
4270
193
            break;
4271
4272
701
        case cram:
4273
701
            ret = sam_read1_cram(fp, h, &b);
4274
701
            break;
4275
4276
119k
        case sam:
4277
119k
            ret = sam_read1_sam(fp, h, b);
4278
119k
            break;
4279
4280
7.94M
        case fasta_format:
4281
7.94M
        case fastq_format: {
4282
7.94M
            fastq_state *x = (fastq_state *)fp->state;
4283
7.94M
            if (!x) {
4284
1.00k
                if (!(fp->state = fastq_state_init(fp->format.format
4285
1.00k
                                                   == fastq_format ? '@' : '>')))
4286
0
                    return -2;
4287
1.00k
            }
4288
4289
7.94M
            return fastq_parse1(fp, b);
4290
7.94M
        }
4291
4292
0
        case empty_format:
4293
0
            errno = EPIPE;
4294
0
            return -3;
4295
4296
0
        default:
4297
0
            errno = EFTYPE;
4298
0
            return -3;
4299
8.06M
        }
4300
4301
120k
        pass_filter = (ret >= 0 && fp->filter)
4302
120k
            ? sam_passes_filter(h, b, fp->filter)
4303
120k
            : 1;
4304
120k
    } while (pass_filter == 0);
4305
4306
120k
    return pass_filter < 0 ? -2 : ret;
4307
8.06M
}
4308
4309
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4310
// this code isn't vectorised and runs far slower than is necessary (even
4311
// with the restrict keyword being used).
4312
static inline void HTS_OPT3
4313
208
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4314
208
    uint32_t i;
4315
226k
    for (i = 0; i < len; i++)
4316
226k
        a[i] = b[i]+33;
4317
208
}
4318
4319
// Appends the SAM text form of record b to str (without a trailing newline).
//
// Fields are written in SAM column order: QNAME, FLAG, RNAME, POS, MAPQ,
// CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, followed by any aux tags.
// Returns the resulting str->l on success.  Returns -1 if the record has a
// zero-length name, with errno = EINVAL on corrupt aux data, or with
// errno = ENOMEM if an append failed.
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    const bam1_core_t *core = &b->core;
    uint8_t *aux, *aux_end;
    int status = 0;  // OR of the kput* results; negative once any has failed

    if (core->l_qname == 0)
        return -1;

    // Query name, excluding its NUL padding
    status |= kputsn_(bam_get_qname(b), core->l_qname-1-core->l_extranul, str);
    status |= kputc_('\t', str);

    // Flag
    status |= kputw(core->flag, str);
    status |= kputc_('\t', str);

    // Reference name
    if (core->tid < 0) {
        status |= kputsn_("*\t", 2, str);
    } else {
        status |= kputs(h->target_name[core->tid] , str);
        status |= kputc_('\t', str);
    }

    // 1-based position
    status |= kputll(core->pos + 1, str);
    status |= kputc_('\t', str);

    // Mapping quality
    status |= kputw(core->qual, str);
    status |= kputc_('\t', str);

    // CIGAR
    if (core->n_cigar == 0) {
        status |= kputc_('*', str);
    } else {
        uint32_t *ops = bam_get_cigar(b);
        int k;
        for (k = 0; k < core->n_cigar; ++k) {
            status |= kputw(bam_cigar_oplen(ops[k]), str);
            status |= kputc_(bam_cigar_opchr(ops[k]), str);
        }
    }
    status |= kputc_('\t', str);

    // Mate reference name ("=" when it matches the read's own)
    if (core->mtid < 0) {
        status |= kputsn_("*\t", 2, str);
    } else if (core->mtid == core->tid) {
        status |= kputsn_("=\t", 2, str);
    } else {
        status |= kputs(h->target_name[core->mtid], str);
        status |= kputc_('\t', str);
    }

    // Mate position and template length
    status |= kputll(core->mpos + 1, str);
    status |= kputc_('\t', str);
    status |= kputll(core->isize, str);
    status |= kputc_('\t', str);

    // Sequence and quality
    if (core->l_qseq == 0) {
        status |= kputsn_("*\t*", 3, str);
    } else {
        // Reserve room for both columns up front so they can be written
        // directly into the buffer.
        if (ks_resize(str, str->l+2+2*core->l_qseq) < 0) goto mem_err;
        char *out = str->s + str->l;

        // Sequence, 2 bases at a time
        nibble2base(bam_get_seq(b), out, core->l_qseq);
        out[core->l_qseq] = '\t';
        out += core->l_qseq+1;

        // Quality: a leading 0xff means "no qualities", shown as '*'
        uint8_t *quals = bam_get_qual(b);
        int written;
        if (quals[0] == 0xff) {
            out[0] = '*';
            written = 1;
        } else {
            add33((uint8_t *)out, quals, core->l_qseq);
            written = core->l_qseq;
        }
        out[written] = 0;
        str->l = (out + written) - str->s;
    }

    // Aux tags; each needs at least 2 tag bytes, a type byte and 1 data byte
    aux = bam_get_aux(b);
    aux_end = b->data + b->l_data;
    while (aux_end - aux >= 4) {
        status |= kputc_('\t', str);
        aux = (uint8_t *)sam_format_aux1(aux, aux[2], aux+3, aux_end, str);
        if (aux == NULL)
            goto bad_aux;
    }

    status |= kputsn("", 0, str); // nul terminate
    if (status < 0) goto mem_err;

    return str->l;

 bad_aux:
    hts_log_error("Corrupted aux data for read %.*s flag %d",
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
    errno = EINVAL;
    return -1;

 mem_err:
    hts_log_error("Out of memory");
    errno = ENOMEM;
    return -1;
}
4400
4401
// Formats record b as a SAM text line into str, replacing whatever str held.
// Returns the result of sam_format1_append(): the new string length on
// success, -1 on failure.
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    // Discard the previous contents; the allocated buffer is reused.
    str->l = 0;

    return sam_format1_append(h, b, str);
}
4406
4407
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4408
// Formats record b as a FASTA / FASTQ entry into str, replacing its contents.
//
// x must be non-NULL: it supplies the record prefix character and selects the
// optional name decorations (UMI, /1 /2 suffix, CASAVA tag, aux tags).
// Reverse-strand records are written reverse-complemented, with qualities
// reversed.  Returns the length of str on success, -1 on failure.
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
{
    unsigned flag = b->core.flag;
    int i, e = 0, len = b->core.l_qseq;
    uint8_t *seq, *qual;

    str->l = 0;

    // Name
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
        return -1;

    // UMI tag
    if (*x->UMI[0]) {
        // Temporary copy of '#num' if present
        char plex[256];
        size_t cut = str->l;
        while (cut && str->s[cut] != ':' && str->s[cut] != '#')
            cut--;

        if (str->s[cut] == '#' && str->l - cut < 255) {
            // Detach the '#num' suffix so the UMI can be inserted before it
            memcpy(plex, &str->s[cut], str->l - cut);
            plex[str->l - cut] = 0;
            str->l = cut;
        } else {
            *plex = 0;
        }

        // Use the first of the configured UMI tags present on the record
        uint8_t *bc = NULL;
        int n;
        for (n = 0; !bc && n < UMI_TAGS; n++)
            bc = bam_aux_get(b, x->UMI[n]);
        if (bc && *bc == 'Z') {
            int err = kputc(':', str) < 0;
            // Replace any non-alpha with '+'
            while (*++bc)
                err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
            if (err)
                return -1;
        }

        if (*plex && kputs(plex, str) < 0)
            return -1;
    }

    // /1 or /2 suffix
    if (x->rnum && (flag & BAM_FPAIRED)) {
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
        if (r12 == BAM_FREAD1) {
            if (kputs("/1", str) == EOF)
                return -1;
        } else if (r12 == BAM_FREAD2) {
            if (kputs("/2", str) == EOF)
                return -1;
        }
    }

    // Illumina CASAVA tag.
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
    if (x->casava) {
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
        uint8_t *bc = bam_aux_get(b, x->BC);

        // Validate the barcode BEFORE printing it.  Only a 'Z' value is
        // known to be a NUL-terminated string, so formatting any other type
        // with %s could read past the tag; an empty value would also have
        // to be patched up afterwards.  Unusable barcodes become "0".
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
            bc = NULL;
        }

        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
                     bc ? (char *)bc+1 : "0") < 0)
            return -1;

        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
        if (bc) {
            int l = strlen((char *)bc+1);
            char *c = (char *)str->s + str->l - l;
            for (i = 0; i < l; i++) {
                if (!isalpha_c(c[i]))
                    c[i] = '+';
                else if (islower_c(c[i]))
                    c[i] = toupper_c(c[i]);
            }
        }
    }

    // Aux tags
    if (x->aux) {
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
        while (s && end - s >= 4) {
            int tt = s[0]*256 + s[1];
            // With no tag table every tag is written; otherwise only
            // those present in the table.
            if (x->tags == NULL ||
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
                e |= kputc_('\t', str) < 0;
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
                    return -1;
            } else {
                s = skip_aux(s+2, end);
            }
        }
        e |= kputsn("", 0, str) < 0; // nul terminate
    }

    // Reserve space for the rest of the record in one go
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
    e |= kputc_('\n', str) < 0;

    // Seq line
    seq = bam_get_seq(b);
    if (flag & BAM_FREVERSE)
        for (i = len-1; i >= 0; i--)
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
    else
        for (i = 0; i < len; i++)
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;


    // Qual line
    if (x->nprefix == '@') {
        e |= kputsn("\n+\n", 3, str) < 0;
        qual = bam_get_qual(b);
        if (qual[0] == 0xff)
            // No qualities stored: emit a constant placeholder
            for (i = 0; i < len; i++)
                e |= kputc_('B', str) < 0;
        else if (flag & BAM_FREVERSE)
            for (i = len-1; i >= 0; i--)
                e |= kputc_(33 + qual[i], str) < 0;
        else
            for (i = 0; i < len; i++)
                e |= kputc_(33 + qual[i], str) < 0;

    }
    e |= kputc('\n', str) < 0;

    return e ? -1 : str->l;
}
4545
4546
// Sadly we need to be able to modify the bam_hdr here so we can
4547
// reference count the structure.
4548
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4549
8.06M
{
4550
8.06M
    switch (fp->format.format) {
4551
0
    case binary_format:
4552
0
        fp->format.category = sequence_data;
4553
0
        fp->format.format = bam;
4554
        /* fall-through */
4555
2.68M
    case bam:
4556
2.68M
        return bam_write_idx1(fp, h, b);
4557
4558
2.68M
    case cram:
4559
2.68M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4560
4561
0
    case text_format:
4562
0
        fp->format.category = sequence_data;
4563
0
        fp->format.format = sam;
4564
        /* fall-through */
4565
2.68M
    case sam:
4566
2.68M
        if (fp->state) {
4567
0
            SAM_state *fd = (SAM_state *)fp->state;
4568
4569
            // Threaded output
4570
0
            if (!fd->h) {
4571
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4572
                // just data pointed to by it (which is a bit weasely still),
4573
                // but out cached pointer must be non-const as we want to
4574
                // destroy it later on and sam_hdr_destroy takes non-const.
4575
                //
4576
                // We do this because some tools do sam_hdr_destroy; sam_close
4577
                // while others do sam_close; sam_hdr_destroy.  The former is
4578
                // an issue as we need the header still when flushing.
4579
0
                fd->h = (sam_hdr_t *)h;
4580
0
                fd->h->ref_count++;
4581
4582
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4583
0
                                   fp) != 0)
4584
0
                    return -2;
4585
0
                fd->dispatcher_set = 1;
4586
0
            }
4587
4588
0
            if (fd->h != h) {
4589
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4590
0
                return -2;
4591
0
            }
4592
4593
            // Find a suitable BAM array to copy to
4594
0
            sp_bams *gb = fd->curr_bam;
4595
0
            if (!gb) {
4596
0
                pthread_mutex_lock(&fd->lines_m);
4597
0
                if (fd->bams) {
4598
0
                    fd->curr_bam = gb = fd->bams;
4599
0
                    fd->bams = gb->next;
4600
0
                    gb->next = NULL;
4601
0
                    gb->nbams = 0;
4602
0
                    gb->bam_mem = 0;
4603
0
                    pthread_mutex_unlock(&fd->lines_m);
4604
0
                } else {
4605
0
                    pthread_mutex_unlock(&fd->lines_m);
4606
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4607
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4608
0
                        free(gb);
4609
0
                        return -1;
4610
0
                    }
4611
0
                    gb->nbams = 0;
4612
0
                    gb->abams = SAM_NBAM;
4613
0
                    gb->bam_mem = 0;
4614
0
                    gb->fd = fd;
4615
0
                    fd->curr_idx = 0;
4616
0
                    fd->curr_bam = gb;
4617
0
                }
4618
0
            }
4619
4620
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4621
0
                return -2;
4622
0
            gb->bam_mem += b->l_data + sizeof(*b);
4623
4624
            // Dispatch if full
4625
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4626
0
                gb->serial = fd->serial++;
4627
0
                pthread_mutex_lock(&fd->command_m);
4628
0
                if (fd->errcode != 0) {
4629
0
                    pthread_mutex_unlock(&fd->command_m);
4630
0
                    return -fd->errcode;
4631
0
                }
4632
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4633
0
                                        cleanup_sp_bams,
4634
0
                                        cleanup_sp_lines, 0) < 0) {
4635
0
                    pthread_mutex_unlock(&fd->command_m);
4636
0
                    return -1;
4637
0
                }
4638
0
                pthread_mutex_unlock(&fd->command_m);
4639
0
                fd->curr_bam = NULL;
4640
0
            }
4641
4642
            // Dummy value as we don't know how long it really is.
4643
            // We could track file sizes via a SAM_state field, but I don't think
4644
            // it is necessary.
4645
0
            return 1;
4646
2.68M
        } else {
4647
2.68M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4648
2.68M
            kputc('\n', &fp->line);
4649
2.68M
            if (fp->is_bgzf) {
4650
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4651
0
                    return -1;
4652
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4653
2.68M
            } else {
4654
2.68M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4655
2.68M
            }
4656
4657
2.68M
            if (fp->idx) {
4658
0
                if (fp->format.compression == bgzf) {
4659
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4660
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4661
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4662
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4663
0
                        return -1;
4664
0
                    }
4665
0
                } else {
4666
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4667
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4668
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4669
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4670
0
                        return -1;
4671
0
                    }
4672
0
                }
4673
0
            }
4674
4675
2.68M
            return fp->line.l;
4676
2.68M
        }
4677
4678
4679
0
    case fasta_format:
4680
0
    case fastq_format: {
4681
0
        fastq_state *x = (fastq_state *)fp->state;
4682
0
        if (!x) {
4683
0
            if (!(fp->state = fastq_state_init(fp->format.format
4684
0
                                               == fastq_format ? '@' : '>')))
4685
0
                return -2;
4686
0
        }
4687
4688
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4689
0
            return -1;
4690
0
        if (fp->is_bgzf) {
4691
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4692
0
                return -1;
4693
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4694
0
                return -1;
4695
0
        } else {
4696
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4697
0
                return -1;
4698
0
        }
4699
0
        return fp->line.l;
4700
0
    }
4701
4702
0
    default:
4703
0
        errno = EBADF;
4704
0
        return -1;
4705
8.06M
    }
4706
8.06M
}
4707
4708
/************************
4709
 *** Auxiliary fields ***
4710
 ************************/
4711
#ifndef HTS_LITTLE_ENDIAN
4712
// Copies len bytes of aux tag data from in to out, converting multi-byte
// values from host byte order to little-endian.  Only compiled when
// HTS_LITTLE_ENDIAN is not defined.
//
// type is the aux type code; for 'B' arrays the element type byte and
// 32-bit count are converted and then the payload is handled recursively.
// Returns 0 on success, -1 for an unknown type code, a 'B' value shorter
// than its 5-byte header, or a length that is not a multiple of the value
// size.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    const int width = aux_type2size(type);
    size_t off;

    // Fixed-width values must fill the buffer exactly
    if (width >= 2 && width <= 8 && (len & (width - 1)) != 0) return -1;

    switch (width) {
        case 'H': case 'Z': case 1:  // Trivial
            memcpy(out, in, len);
            break;

        case 2:
            for (off = 0; off < len; off += sizeof(uint16_t)) {
                uint16_t v;
                memcpy(&v, in + off, sizeof(v));
                u16_to_le(v, out + off);
            }
            break;

        case 4:
            for (off = 0; off < len; off += sizeof(uint32_t)) {
                uint32_t v;
                memcpy(&v, in + off, sizeof(v));
                u32_to_le(v, out + off);
            }
            break;

        case 8:
            for (off = 0; off < len; off += sizeof(uint64_t)) {
                uint64_t v;
                memcpy(&v, in + off, sizeof(v));
                u64_to_le(v, out + off);
            }
            break;

        case 'B': { // Recurse!
            uint32_t count;
            if (len < 5) return -1;
            memcpy(&count, in + 1, 4);
            out[0] = in[0];
            u32_to_le(count, out + 1);
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
        }

        default: // Unknown type code
            return -1;
    }

    return 0;
}
4754
#endif
4755
4756
// Appends a new aux tag to the end of record b.
//
// tag  - two-character tag name
// type - aux type code stored after the tag
// len  - number of bytes of value data; must not be negative
// data - the value bytes, in host byte order
//
// Returns 0 on success.  Returns -1 with errno = EINVAL for a negative len
// (or, on big-endian builds, data that cannot be converted), with
// errno = ENOMEM if the record would exceed INT32_MAX bytes, or when the
// record buffer cannot be grown.
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
{
    uint64_t new_len;

    assert(b->l_data >= 0);

    // A negative length could pass the size checks below and then be
    // handed to memcpy() as an enormous size_t.
    if (len < 0) {
        errno = EINVAL;
        return -1;
    }

    // Summed in 64 bits so the addition cannot overflow before the check.
    new_len = (uint64_t)b->l_data + 3 + (uint64_t)len;
    if (new_len > INT32_MAX) goto nomem;

    if (realloc_bam_data(b, new_len) < 0) return -1;

    b->data[b->l_data] = tag[0];
    b->data[b->l_data + 1] = tag[1];
    b->data[b->l_data + 2] = type;

#ifdef HTS_LITTLE_ENDIAN
    // Guarded so an empty value never passes a possibly-NULL data pointer
    // to memcpy().
    if (len > 0)
        memcpy(b->data + b->l_data + 3, data, len);
#else
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
        errno = EINVAL;
        return -1;
    }
#endif

    b->l_data = new_len;

    return 0;

 nomem:
    errno = ENOMEM;
    return -1;
}
4787
4788
// Steps over one aux value.
//
// s points at the value's type byte and end is one past the last byte of
// the record.  Returns a pointer just past the value; end if s is already at
// or beyond end, or if a 'Z' / 'H' string has no terminating NUL before end;
// NULL if the type is unknown or the value does not fit before end.
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        s = memchr(s, 0, end-s);
        return s ? s+1 : end;
    case 'B':
        // Element type byte plus 32-bit element count
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        // Compare in 64 bits: size * n as a 32-bit product can wrap for a
        // corrupt count, making an oversized array look as if it fits.
        if (size == 0 || (uint64_t)(end - s) < (uint64_t)size * n) return NULL;
        return s + (size_t)size * n;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4813
4814
uint8_t *bam_aux_first(const bam1_t *b)
4815
2.75M
{
4816
2.75M
    uint8_t *s = bam_get_aux(b);
4817
2.75M
    uint8_t *end = b->data + b->l_data;
4818
2.75M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4819
87.7k
    return s+2;
4820
2.75M
}
4821
4822
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4823
1.46M
{
4824
1.46M
    uint8_t *end = b->data + b->l_data;
4825
1.46M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4826
1.46M
    if (next == NULL) goto bad_aux;
4827
1.46M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4828
1.40M
    return next+2;
4829
4830
2
 bad_aux:
4831
2
    hts_log_error("Corrupted aux data for read %s flag %d",
4832
2
                  bam_get_qname(b), b->core.flag);
4833
2
    errno = EINVAL;
4834
2
    return NULL;
4835
1.46M
}
4836
4837
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4838
2.75M
{
4839
2.75M
    uint8_t *s;
4840
4.21M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4841
1.49M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4842
            // Check the tag value is valid and complete
4843
29.1k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4844
29.1k
            if (e == NULL) goto bad_aux;
4845
29.1k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4846
4847
29.1k
            return s;
4848
29.1k
        }
4849
4850
    // errno now as set by bam_aux_first()/bam_aux_next()
4851
2.72M
    return NULL;
4852
4853
0
 bad_aux:
4854
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4855
0
                  bam_get_qname(b), b->core.flag);
4856
0
    errno = EINVAL;
4857
0
    return NULL;
4858
2.75M
}
4859
4860
// Removes the aux tag whose type byte is at s from record b.
// Returns 0 if bam_aux_remove() succeeded or reported ENOENT (its result
// when the removed tag was the last one), -1 on any other failure.
int bam_aux_del(bam1_t *b, uint8_t *s)
{
    if (bam_aux_remove(b, s) != NULL)
        return 0;

    return errno == ENOENT ? 0 : -1;
}
4865
4866
// Removes the aux tag whose type byte is at s, closing the gap by moving the
// following tags down.
//
// Returns s, which then addresses the tag that followed the removed one.
// Returns NULL with errno = ENOENT if the removed tag was the last one, or
// NULL with errno = EINVAL (record unchanged) if the value cannot be skipped.
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
{
    uint8_t *data_end = b->data + b->l_data;
    uint8_t *tail = skip_aux(s, data_end);

    if (tail == NULL) {
        hts_log_error("Corrupted aux data for read %s flag %d",
                      bam_get_qname(b), b->core.flag);
        errno = EINVAL;
        return NULL;
    }

    // The tag occupies everything from its two name bytes up to tail.
    uint8_t *tag_start = s - 2;
    b->l_data -= tail - tag_start;

    if (tail < data_end) {
        memmove(tag_start, tail, data_end - tail);
        return s;
    }

    // Nothing followed the removed tag.
    errno = ENOENT;
    return NULL;
}
4884
4885
// Sets the 'Z' (string) aux tag `tag` on record b to `data`, replacing an
// existing value in place or appending a new tag at the end of the record.
//
// len is the number of bytes to store from data; a negative len means
// strlen(data) + 1.  A terminating NUL is added if the stored bytes do not
// already end in one.
// Returns 0 on success, or -1 if an existing tag is not of type 'Z'
// (errno = EINVAL), the aux data is corrupt, or the record cannot be grown.
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
{
    // FIXME: This is not at all efficient!
    const size_t data_len = len >= 0 ? len : strlen(data) + 1;
    const int add_nul = data_len == 0 || data[data_len - 1] != '\0';
    const int errno_on_entry = errno;
    size_t prev_len = 0;   // bytes of the existing value, including its NUL
    int header = 0;        // 3 when a new tag+type header must be written
    uint8_t *dest = bam_aux_get(b,tag);

    if (dest == NULL) {
        if (errno != ENOENT) // Invalid aux data, give up
            return -1;

        // Tag doesn't exist - put it on the end
        errno = errno_on_entry;
        dest = b->data + b->l_data;
        header = 3;
    } else {  // Replacing existing tag
        char type = *dest;
        if (type != 'Z') {
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
            errno = EINVAL;
            return -1;
        }

        uint8_t *value = dest + 1;
        uint8_t *nul = memchr(value, '\0', b->data + b->l_data - value);
        prev_len = (nul ? nul - value : b->data + b->l_data - value) + 1;
        dest -= 2;  // back up to the start of the two-byte tag name
    }

    if (prev_len < data_len + add_nul + header) {
        // Growing may move b->data, so re-derive dest from its offset.
        ptrdiff_t dest_offset = dest - b->data;
        if (possibly_expand_bam_data(b, data_len + add_nul + header - prev_len) < 0)
            return -1;
        dest = b->data + dest_offset;
    }

    if (!header) {
        // Slide the following tags to sit directly after the new value.
        memmove(dest + 3 + data_len + add_nul,
                dest + 3 + prev_len,
                b->l_data - (dest + 3 - b->data) - prev_len);
    }
    b->l_data += header + data_len + add_nul - prev_len;

    dest[0] = tag[0];
    dest[1] = tag[1];
    dest[2] = 'Z';
    memmove(dest+3,data,data_len);
    if (add_nul) dest[3 + data_len] = '\0';
    return 0;
}
4936
4937
// Sets the integer aux tag `tag` on record b to val, replacing an existing
// integer tag or appending a new one.
//
// val must lie in [INT32_MIN, UINT32_MAX], otherwise -1 is returned with
// errno = EOVERFLOW.  An existing value is widened when val needs more
// bytes; when it needs no more, the old width is kept and only the
// signedness of the type code is updated.
// Returns 0 on success, -1 if the existing tag is not an integer
// (errno = EINVAL), the aux data is corrupt, or the record cannot be grown.
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
{
    uint32_t sz, old_sz = 0, is_new = 0, grow = 0;
    uint8_t *s, type;

    if (val < INT32_MIN || val > UINT32_MAX) {
        errno = EOVERFLOW;
        return -1;
    }

    // Pick the type code and width used to store val.
    if (val >= 0) {
        if (val < UINT8_MAX)        { type = 'C'; sz = 1; }
        else if (val < UINT16_MAX)  { type = 'S'; sz = 2; }
        else                        { type = 'I'; sz = 4; }
    } else {
        if (val >= INT8_MIN)        { type = 'c'; sz = 1; }
        else if (val >= INT16_MIN)  { type = 's'; sz = 2; }
        else                        { type = 'i'; sz = 4; }
    }

    s = bam_aux_get(b, tag);
    if (s == NULL) {
        if (errno != ENOENT) // Invalid aux data, give up.
            return -1;

        // Tag doesn't exist - add a new one
        s = b->data + b->l_data;
        is_new = 1;
    } else {  // Tag present - how big was the old one?
        switch (*s) {
            case 'c': case 'C': old_sz = 1; break;
            case 's': case 'S': old_sz = 2; break;
            case 'i': case 'I': old_sz = 4; break;
            default: errno = EINVAL; return -1;  // Not an integer
        }
    }

    if (is_new || old_sz < sz) {
        // Make room for new tag; growing may move b->data.
        ptrdiff_t s_offset = s - b->data;
        grow = (is_new ? 3 : 0) + sz - old_sz;
        if (possibly_expand_bam_data(b, grow) < 0)
            return -1;
        s =  b->data + s_offset;
        if (is_new) { // Add tag id
            *s++ = tag[0];
            *s++ = tag[1];
        } else {   // Shift following data so we have space
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
        }
    } else {
        // Reuse old space.  Data value may be bigger than necessary but
        // we avoid having to move everything else
        sz = old_sz;
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
        assert(type > 0);
    }

    *s++ = type;
#ifdef HTS_LITTLE_ENDIAN
    memcpy(s, &val, sz);
#else
    switch (sz) {
        case 4:  u32_to_le(val, s); break;
        case 2:  u16_to_le(val, s); break;
        default: *s = val; break;
    }
#endif
    b->l_data += grow;
    return 0;
}
5002
5003
/*
 * Set aux tag 'tag' in record 'b' to the single-precision value 'val'.
 *
 * An existing 'f' tag is overwritten in place; an existing 'd' tag is
 * shrunk to an 'f' tag; a missing tag is appended to the record.
 *
 * Returns 0 on success;
 *        -1 on failure.  errno is EINVAL if the existing tag is neither
 *           'f' nor 'd'; otherwise it is left as set by bam_aux_get() or
 *           possibly_expand_bam_data().
 */
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
{
    uint8_t *p = bam_aux_get(b, tag);
    int is_new = 0, was_double = 0;

    if (p == NULL) {
        if (errno != ENOENT)
            return -1;              // Invalid aux data, give up.
        is_new = 1;                 // Tag doesn't exist - add a new one
    } else if (*p == 'd') {
        was_double = 1;
    } else if (*p != 'f') {
        errno = EINVAL;             // Not a float
        return -1;
    }

    if (is_new) {
        // Two tag bytes + type byte + four value bytes
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
            return -1;
        p = b->data + b->l_data;
        p[0] = tag[0];
        p[1] = tag[1];
        p += 2;
    } else if (was_double) {
        // Convert non-standard double tag to float: close the 4 byte gap
        memmove(p + 5, p + 9, b->l_data - ((p + 9) - b->data));
        b->l_data -= 4;
    }

    *p = 'f';
    float_to_le(val, p + 1);
    if (is_new)
        b->l_data += 7;

    return 0;
}
5038
5039
/*
 * Set aux tag 'tag' in record 'b' to a 'B' array of 'items' elements of
 * element type 'type', copied from 'data'.
 *
 * An existing 'B' tag is resized in place (following aux data is moved);
 * a missing tag is appended to the record.
 *
 * Returns 0 on success;
 *        -1 on failure.  errno is EINVAL if the existing tag is not an
 *           array or either element type does not have a size of 1 to 4
 *           bytes, and ENOMEM if the array would exceed INT32_MAX bytes.
 */
int bam_aux_update_array(bam1_t *b, const char tag[2],
                         uint8_t type, uint32_t items, void *data)
{
    uint8_t *p = bam_aux_get(b, tag);
    size_t old_bytes = 0, new_bytes;
    int is_new = 0;

    if (p == NULL) {
        if (errno != ENOENT)
            return -1;                  // Invalid aux data, give up.
        // Tag doesn't exist - add a new one at the end of the record
        p = b->data + b->l_data;
        is_new = 1;
    } else {
        if (*p != 'B') { errno = EINVAL; return -1; }
        old_bytes = aux_type2size(p[1]);
        if (old_bytes < 1 || old_bytes > 4) { errno = EINVAL; return -1; }
        old_bytes *= le_to_u32(p + 2);
    }

    new_bytes = aux_type2size(type);
    if (new_bytes < 1 || new_bytes > 4) { errno = EINVAL; return -1; }
    if (items > INT32_MAX / new_bytes) { errno = ENOMEM; return -1; }
    new_bytes *= items;

    if (is_new || old_bytes < new_bytes) {
        // Make room for new tag; b->data may move, so re-derive p
        ptrdiff_t offset = p - b->data;
        if (possibly_expand_bam_data(b, (is_new ? 8 : 0) + new_bytes - old_bytes) < 0)
            return -1;
        p = b->data + offset;
    }

    if (is_new) {
        // Add tag id and type
        *p++ = tag[0];
        *p++ = tag[1];
        *p = 'B';
        b->l_data += 8 + new_bytes;
    } else if (old_bytes != new_bytes) {
        // Shift following data to sit directly after the resized array
        memmove(p + 6 + new_bytes, p + 6 + old_bytes,
                b->l_data - ((p + 6 + old_bytes) - b->data));
        b->l_data -= old_bytes;
        b->l_data += new_bytes;
    }

    // p now points at the 'B' byte: element type, count, then the values
    p[1] = type;
    u32_to_le(items, p + 2);
    if (new_bytes > 0) {
#ifdef HTS_LITTLE_ENDIAN
        memcpy(p + 6, data, new_bytes);
#else
        return aux_to_le(type, p + 6, data, new_bytes);
#endif
    }
    return 0;
}
5095
5096
/*
 * Decode element 'idx' of a little-endian integer array starting at 's',
 * whose elements have aux type 'type' (one of c, C, s, S, i, I).
 *
 * Returns the value widened to int64_t; for any other type returns 0 and
 * sets errno to EINVAL.
 */
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    errno = EINVAL;
    return 0;
}
5111
5112
/*
 * Read an integer aux value.  's' points at the type byte of the tag,
 * with the value following it.
 *
 * Returns the value, or 0 with errno set to EINVAL if the type is not an
 * integer type.
 */
int64_t bam_aux2i(const uint8_t *s)
{
    const uint8_t *value = s + 1;
    return get_int_aux_val(s[0], value, 0);
}
5118
5119
/*
 * Read a numeric aux value as a double.  's' points at the type byte of
 * the tag.  'd' and 'f' tags are decoded as floating point; anything else
 * is handed to the integer decoder, which returns 0 and sets errno to
 * EINVAL for non-integer types.
 */
double bam_aux2f(const uint8_t *s)
{
    const uint8_t *value = s + 1;

    switch (s[0]) {
    case 'd':
        return le_to_double(value);
    case 'f':
        return le_to_float(value);
    default:
        return get_int_aux_val(s[0], value, 0);
    }
}
5127
5128
/*
 * Read a single-character ('A') aux value.  's' points at the type byte.
 *
 * Returns the character, or 0 with errno set to EINVAL if the tag is not
 * of type 'A'.
 */
char bam_aux2A(const uint8_t *s)
{
    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }
    return *(const char *)(s + 1);
}
5136
5137
/*
 * Get the string held in a 'Z' or 'H' aux tag.  's' points at the type
 * byte.
 *
 * Returns a pointer to the byte following the type (inside the caller's
 * buffer, not a copy), or NULL with errno set to EINVAL for any other
 * tag type.
 */
char *bam_aux2Z(const uint8_t *s)
{
    const int type = s[0];

    if (type != 'Z' && type != 'H') {
        errno = EINVAL;
        return 0;
    }
    return (char *)(s + 1);
}
5145
5146
/*
 * Get the element count of a 'B' array aux tag.  's' points at the type
 * byte; the count is the little-endian 32-bit value at offset 2.
 *
 * Returns the count, or 0 with errno set to EINVAL if the tag is not an
 * array.
 */
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);

    errno = EINVAL;
    return 0;
}
5154
5155
/*
 * Read element 'idx' of a 'B' array aux tag as an integer.
 *
 * Returns the value, or 0 on error.  errno is set to ERANGE when idx is
 * not below bam_auxB_len(s) (which includes the case where 's' is not an
 * array at all), and to EINVAL when the element type is not an integer.
 */
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    if (idx < bam_auxB_len(s))
        return get_int_aux_val(s[1], s + 6, idx);

    errno = ERANGE;
    return 0;
}
5164
5165
/*
 * Read element 'idx' of a 'B' array aux tag as a double.
 *
 * Float arrays are decoded directly; other element types go through the
 * integer decoder.  Returns 0.0 with errno set to ERANGE when idx is not
 * below bam_auxB_len(s).
 */
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    if (idx >= bam_auxB_len(s)) {
        errno = ERANGE;
        return 0.0;
    }

    // Kept as two separate returns so that the integer result is converted
    // straight to double rather than through float.
    if (s[1] != 'f')
        return get_int_aux_val(s[1], s + 6, idx);
    return le_to_float(s + 6 + 4 * idx);
}
5175
5176
int sam_open_mode(char *mode, const char *fn, const char *format)
5177
0
{
5178
    // TODO Parse "bam5" etc for compression level
5179
0
    if (format == NULL) {
5180
        // Try to pick a format based on the filename extension
5181
0
        char extension[HTS_MAX_EXT_LEN];
5182
0
        if (find_file_extension(fn, extension) < 0) return -1;
5183
0
        return sam_open_mode(mode, fn, extension);
5184
0
    }
5185
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5186
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5187
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5188
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5189
0
    else if (strcasecmp(format, "fastq") == 0 ||
5190
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5191
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5192
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5193
0
    else if (strcasecmp(format, "fasta") == 0 ||
5194
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5195
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5196
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5197
0
    else return -1;
5198
5199
0
    return 0;
5200
0
}
5201
5202
// A version of sam_open_mode that can handle ,key=value options.
5203
// The format string is allocated and returned, to be freed by the caller.
5204
// Prefix should be "r" or "w",
5205
char *sam_open_mode_opts(const char *fn,
5206
                         const char *mode,
5207
                         const char *format)
5208
0
{
5209
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5210
0
                             (mode   ? strlen(mode)   : 1) + 12);
5211
0
    char *opts, *cp;
5212
0
    int format_len;
5213
5214
0
    if (!mode_opts)
5215
0
        return NULL;
5216
5217
0
    strcpy(mode_opts, mode ? mode : "r");
5218
0
    cp = mode_opts + strlen(mode_opts);
5219
5220
0
    if (format == NULL) {
5221
        // Try to pick a format based on the filename extension
5222
0
        char extension[HTS_MAX_EXT_LEN];
5223
0
        if (find_file_extension(fn, extension) < 0) {
5224
0
            free(mode_opts);
5225
0
            return NULL;
5226
0
        }
5227
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5228
0
            return mode_opts;
5229
0
        } else {
5230
0
            free(mode_opts);
5231
0
            return NULL;
5232
0
        }
5233
0
    }
5234
5235
0
    if ((opts = strchr(format, ','))) {
5236
0
        format_len = opts-format;
5237
0
    } else {
5238
0
        opts="";
5239
0
        format_len = strlen(format);
5240
0
    }
5241
5242
0
    if (strncmp(format, "bam", format_len) == 0) {
5243
0
        *cp++ = 'b';
5244
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5245
0
        *cp++ = 'c';
5246
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5247
0
        *cp++ = 'c';
5248
0
        strcpy(cp, ",VERSION=2.1");
5249
0
        cp += 12;
5250
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5251
0
        *cp++ = 'c';
5252
0
        strcpy(cp, ",VERSION=3.0");
5253
0
        cp += 12;
5254
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5255
0
        ; // format mode=""
5256
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5257
0
        *cp++ = 'z';
5258
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5259
0
               strncmp(format, "fq", format_len) == 0) {
5260
0
        *cp++ = 'f';
5261
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5262
0
               strncmp(format, "fq.gz", format_len) == 0) {
5263
0
        *cp++ = 'f';
5264
0
        *cp++ = 'z';
5265
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5266
0
               strncmp(format, "fa", format_len) == 0) {
5267
0
        *cp++ = 'F';
5268
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5269
0
               strncmp(format, "fa", format_len) == 0) {
5270
0
        *cp++ = 'F';
5271
0
        *cp++ = 'z';
5272
0
    } else {
5273
0
        free(mode_opts);
5274
0
        return NULL;
5275
0
    }
5276
5277
0
    strcpy(cp, opts);
5278
5279
0
    return mode_opts;
5280
0
}
5281
5282
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5283
int bam_str2flag(const char *str)
5284
0
{
5285
0
    char *end, *beg = (char*) str;
5286
0
    long int flag = strtol(str, &end, 0);
5287
0
    if ( end!=str ) return flag;    // the conversion was successful
5288
0
    flag = 0;
5289
0
    while ( *str )
5290
0
    {
5291
0
        end = beg;
5292
0
        while ( *end && *end!=',' ) end++;
5293
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5294
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5295
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5296
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5297
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5298
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5299
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5300
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5301
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5302
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5303
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5304
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5305
0
        else return -1;
5306
0
        if ( !*end ) break;
5307
0
        beg = end + 1;
5308
0
    }
5309
0
    return flag;
5310
0
}
5311
5312
char *bam_flag2str(int flag)
5313
0
{
5314
0
    kstring_t str = {0,0,0};
5315
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5316
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5317
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5318
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5319
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5320
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5321
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5322
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5323
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5324
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5325
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5326
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5327
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5328
0
    return str.s;
5329
0
}
5330
5331
5332
/**************************
5333
 *** Pileup and Mpileup ***
5334
 **************************/
5335
5336
#if !defined(BAM_NO_PILEUP)
5337
5338
#include <assert.h>
5339
5340
/*******************
5341
 *** Memory pool ***
5342
 *******************/
5343
5344
typedef struct {
5345
    int k, y;
5346
    hts_pos_t x, end;
5347
} cstate_t;
5348
5349
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5350
5351
typedef struct __linkbuf_t {
5352
    bam1_t b;
5353
    hts_pos_t beg, end;
5354
    cstate_t s;
5355
    struct __linkbuf_t *next;
5356
    bam_pileup_cd cd;
5357
} lbnode_t;
5358
5359
typedef struct {
5360
    int cnt, n, max;
5361
    lbnode_t **buf;
5362
} mempool_t;
5363
5364
static mempool_t *mp_init(void)
5365
0
{
5366
0
    mempool_t *mp;
5367
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5368
0
    return mp;
5369
0
}
5370
/*
 * Free the pool, every spare node stored in it, and each such node's
 * record data.  Nodes currently handed out are not touched.
 */
static void mp_destroy(mempool_t *mp)
{
    int i = 0;

    while (i < mp->n) {
        lbnode_t *node = mp->buf[i++];
        free(node->b.data);
        free(node);
    }
    free(mp->buf);
    free(mp);
}
5380
/*
 * Get a node from the pool: a spare one if any is stored, otherwise a
 * freshly zero-filled allocation (NULL if that allocation fails).  The
 * outstanding-node counter is incremented in every case.
 */
static inline lbnode_t *mp_alloc(mempool_t *mp)
{
    mp->cnt++;

    if (mp->n != 0) {
        mp->n--;
        return mp->buf[mp->n];
    }
    return (lbnode_t *) calloc(1, sizeof(lbnode_t));
}
5386
/*
 * Return node 'p' to the pool for later reuse, growing the pool's slot
 * array (doubling, starting at 256) when it is full.
 *
 * If the slot array cannot be grown the node is released outright instead
 * of being stored.  The old slot array stays valid in that case, so the
 * nodes already pooled are unaffected.
 */
static inline void mp_free(mempool_t *mp, lbnode_t *p)
{
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
    if (mp->n == mp->max) {
        int new_max = mp->max ? mp->max << 1 : 256;
        lbnode_t **new_buf = realloc(mp->buf, sizeof(lbnode_t*) * new_max);
        if (!new_buf) {
            // Nowhere to park the node: free it the same way mp_destroy()
            // frees pooled nodes.
            free(p->b.data);
            free(p);
            return;
        }
        mp->buf = new_buf;
        mp->max = new_max;
    }
    mp->buf[mp->n++] = p;
}
5395
5396
/**********************
5397
 *** CIGAR resolver ***
5398
 **********************/
5399
5400
/* s->k: the index of the CIGAR operator that has just been processed.
5401
   s->x: the reference coordinate of the start of s->k
5402
   s->y: the query coordinate of the start of s->k
5403
 */
5404
// Moves the CIGAR state 's' of the read in 'p' on to reference position
// 'pos' and fills in the pileup fields of 'p': qpos, is_del, is_refskip,
// indel, is_head, is_tail and cigar_ind.  Every path through the function
// returns 1.
// NOTE(review): cigar[s->k] and cigar[s->k+1] are read with only assert()
// as a guard; this appears to rely on callers never asking for a position
// outside the read's alignment - confirm before calling from elsewhere.
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5405
0
{
5406
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5407
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5408
5409
0
    bam1_t *b = p->b;
5410
0
    bam1_core_t *c = &b->core;
5411
0
    uint32_t *cigar = bam_get_cigar(b);
5412
0
    int k;
5413
    // determine the current CIGAR operation
5414
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5415
0
    if (s->k == -1) { // never processed
5416
0
        p->qpos = 0;
5417
0
        if (c->n_cigar == 1) { // just one operation, save a loop
            // Only M, = and X initialise the state here; for any other
            // single operation s->k is left at -1.
5418
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5419
0
        } else { // find the first match or deletion
            // Insertions and soft clips before it advance the query
            // coordinate s->y; other skipped operations change nothing.
5420
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5421
0
                int op = _cop(cigar[k]);
5422
0
                int l = _cln(cigar[k]);
5423
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5424
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5425
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5426
0
            }
5427
0
            assert(k < c->n_cigar);
5428
0
            s->k = k;
5429
0
        }
5430
0
    } else { // the read has been processed before
5431
0
        int op, l = _cln(cigar[s->k]);
5432
0
        if (pos - s->x >= l) { // jump to the next operation
5433
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5434
0
            op = _cop(cigar[s->k+1]);
5435
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
                // The operation being left moves x by its length, and
                // moves y as well only when it is M, = or X.
5436
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5437
0
                s->x += l;
5438
0
                ++s->k;
5439
0
            } else { // find the next M/D/N/=/X
5440
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5441
0
                s->x += l;
5442
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5443
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5444
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5445
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5446
0
                }
5447
0
                s->k = k;
5448
0
            }
5449
0
            assert(s->k < c->n_cigar); // otherwise a bug
5450
0
        } // else, do nothing
5451
0
    }
5452
0
    { // collect pileup information
5453
0
        int op, l;
5454
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5455
0
        p->is_del = p->indel = p->is_refskip = 0;
        // pos is the last reference base covered by the current operation
        // and another operation follows, so an indel may start right here.
5456
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5457
0
            int op2 = _cop(cigar[s->k+1]);
5458
0
            int l2 = _cln(cigar[s->k+1]);
5459
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5460
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5461
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5462
                // and rely on is_del=1 as we would for 3D.
5463
0
                p->indel = -(int)l2;
5464
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5465
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5466
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5467
0
                    else break;
5468
0
                }
5469
0
            } else if (op2 == BAM_CINS) {
                // Sum the following insertions, stepping over any pads
                // between them; any other operation ends the run.
5470
0
                p->indel = l2;
5471
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5472
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5473
0
                    if (op2 == BAM_CINS) p->indel += l2;
5474
0
                    else if (op2 != BAM_CPAD) break;
5475
0
                }
5476
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
                // A pad comes first: count the insertions found before the
                // next M/D/N/=/X and report them only if there are any.
5477
0
                int l3 = 0;
5478
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5479
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5480
0
                    if (op2 == BAM_CINS) l3 += l2;
5481
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5482
0
                }
5483
0
                if (l3 > 0) p->indel = l3;
5484
0
            }
5485
0
        }
5486
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5487
0
            p->qpos = s->y + (pos - s->x);
5488
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5489
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5490
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5491
0
        } // cannot be other operations; otherwise a bug
5492
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5493
0
    }
5494
0
    p->cigar_ind = s->k;
5495
0
    return 1;
5496
0
}
5497
5498
/*******************************
5499
 *** Expansion of insertions ***
5500
 *******************************/
5501
5502
/*
5503
 * Fills out the kstring with the padded insertion sequence for the current
5504
 * location in 'p'.  If this is not an insertion site, the string is blank.
5505
 *
5506
 * This variant handles base modifications, but only when "m" is non-NULL.
5507
 *
5508
 * Returns the number of inserted bases on success, with string length being
5509
 *        accessible via ins->l;
5510
 *        -1 on failure.
5511
 */
5512
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5513
                          hts_base_mod_state *m,
5514
0
                          kstring_t *ins, int *del_len) {
5515
0
    int j, k, indel, nb = 0;
5516
0
    uint32_t *cigar;
5517
5518
0
    // Not an insertion site: hand back an empty string.  Note that
    // *del_len is not written on this path.
    if (p->indel <= 0) {
5519
0
        if (ks_resize(ins, 1) < 0)
5520
0
            return -1;
5521
0
        ins->l = 0;
5522
0
        ins->s[0] = '\0';
5523
0
        return 0;
5524
0
    }
5525
5526
0
    if (del_len)
5527
0
        *del_len = 0;
5528
5529
    // Measure indel length including pads
5530
0
    indel = 0;
5531
0
    k = p->cigar_ind+1;
5532
0
    cigar = bam_get_cigar(p->b);
5533
0
    while (k < p->b->core.n_cigar) {
5534
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5535
0
        case BAM_CPAD:
5536
0
        case BAM_CINS:
5537
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5538
0
            break;
5539
0
        default:
            // Any other operation ends the run: push k to the end so the
            // loop stops.
5540
0
            k = p->b->core.n_cigar;
5541
0
            break;
5542
0
        }
5543
0
        k++;
5544
0
    }
    // nb is the value returned: inserted bases plus pads.  ins->l may grow
    // further below when modification text is added.
5545
0
    nb = ins->l = indel;
5546
5547
    // Produce sequence
5548
0
    if (ks_resize(ins, indel+1) < 0)
5549
0
        return -1;
    // From here on 'indel' is reused as the write index into ins->s.
5550
0
    indel = 0;
5551
0
    k = p->cigar_ind+1;
    // j is the offset from p->qpos of the next inserted base in the query.
5552
0
    j = 1;
5553
0
    while (k < p->b->core.n_cigar) {
5554
0
        int l, c;
5555
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5556
0
        case BAM_CPAD:
            // Pads are written as '*' and consume no query bases.
5557
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5558
0
                ins->s[indel++] = '*';
5559
0
            break;
5560
0
        case BAM_CINS:
5561
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
                // Positions at or beyond l_qseq are written as 'N'.
5562
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5563
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5564
0
                                            p->qpos + j - p->is_del)]
5565
0
                    : 'N';
5566
0
                ins->s[indel++] = c;
5567
0
                int nm;
5568
0
                hts_base_mod mod[256];
                // With a modification state supplied, append this base's
                // modifications as "[...]".  Space is reserved at 16 bytes
                // per modification plus 3 for the brackets and terminator.
5569
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5570
0
                                                m, mod, 256)) > 0) {
5571
0
                    int o_indel = indel;
5572
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5573
0
                        return -1;
5574
0
                    ins->s[indel++] = '[';
                    // NOTE: this j shadows the outer query offset j for the
                    // rest of this block only.
5575
0
                    int j;
5576
0
                    for (j = 0; j < nm; j++) {
5577
0
                        char qual[20];
                        // A negative qual is written as no text at all.
5578
0
                        if (mod[j].qual >= 0)
5579
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5580
0
                        else
5581
0
                            *qual=0;
5582
0
                        if (mod[j].modified_base < 0)
5583
                            // ChEBI
5584
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5585
0
                                              "%c(%d)%s",
5586
0
                                              "+-"[mod[j].strand],
5587
0
                                              -mod[j].modified_base,
5588
0
                                              qual);
5589
0
                        else
5590
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5591
0
                                              "%c%c%s",
5592
0
                                              "+-"[mod[j].strand],
5593
0
                                              mod[j].modified_base,
5594
0
                                              qual);
5595
0
                    }
5596
0
                    ins->s[indel++] = ']';
5597
0
                    ins->l += indel - o_indel; // grow by amount we used
5598
0
                }
5599
0
            }
5600
0
            break;
5601
0
        case BAM_CDEL:
5602
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5603
0
            if (del_len)
5604
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5605
            // fall through
5606
0
        default:
5607
0
            k = p->b->core.n_cigar;
5608
0
            break;
5609
0
        }
5610
0
        k++;
5611
0
    }
5612
0
    ins->s[indel] = '\0';
5613
0
    ins->l = indel; // string length
5614
5615
0
    return nb;      // base length
5616
0
}
5617
5618
/*
5619
 * Fills out the kstring with the padded insertion sequence for the current
5620
 * location in 'p'.  If this is not an insertion site, the string is blank.
5621
 *
5622
 * This is the original interface with no capability for reporting base
5623
 * modifications.
5624
 *
5625
 * Returns the length of insertion string on success;
5626
 *        -1 on failure.
5627
 */
5628
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len)
{
    // Same as bam_plp_insertion_mod() with no base modification state.
    hts_base_mod_state *no_mods = NULL;

    return bam_plp_insertion_mod(p, no_mods, ins, del_len);
}
5631
5632
/***********************
5633
 *** Pileup iterator ***
5634
 ***********************/
5635
5636
// Dictionary of overlapping reads
5637
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5638
typedef khash_t(olap_hash) olap_hash_t;
5639
5640
struct bam_plp_s {
5641
    mempool_t *mp;
5642
    lbnode_t *head, *tail;
5643
    int32_t tid, max_tid;
5644
    hts_pos_t pos, max_pos;
5645
    int is_eof, max_plp, error, maxcnt;
5646
    uint64_t id;
5647
    bam_pileup1_t *plp;
5648
    // for the "auto" interface only
5649
    bam1_t *b;
5650
    bam_plp_auto_f func;
5651
    void *data;
5652
    olap_hash_t *overlaps;
5653
5654
    // For notification of creation and destruction events
5655
    // and associated client-owned pointer.
5656
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5657
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5658
};
5659
5660
/*
 * Create a pileup iterator.
 *
 * 'func' and 'data' are stored for the "auto" interface; when 'func' is
 * non-NULL a record buffer is allocated for it as well.  The iterator
 * starts with an empty read list and maxcnt set to 8000.
 *
 * Returns the new iterator, or NULL if any allocation fails (nothing is
 * leaked in that case).  Free with bam_plp_destroy().
 */
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
{
    bam_plp_t iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
    if (!iter)
        return NULL;

    iter->mp = mp_init();
    if (!iter->mp)
        goto fail;

    // A single empty node acts as both head and tail of the read list
    iter->head = iter->tail = mp_alloc(iter->mp);
    if (!iter->head)
        goto fail;

    iter->max_tid = iter->max_pos = -1;
    iter->maxcnt = 8000;
    if (func) {
        iter->func = func;
        iter->data = data;
        iter->b = bam_init1();
        if (!iter->b)
            goto fail;
    }
    return iter;

 fail:
    // The pool was empty, so head (if set) is a fresh zero-filled node
    // that owns no record data; freeing the node alone is enough.
    free(iter->head);
    if (iter->mp)
        mp_destroy(iter->mp);
    free(iter);
    return NULL;
}
5675
5676
/*
 * Create the hash used for tweaking the quality of bases in overlapping
 * reads and attach it to 'iter'.
 *
 * Returns 0 on success, -1 if the hash could not be created.
 */
int bam_plp_init_overlaps(bam_plp_t iter)
{
    iter->overlaps = kh_init(olap_hash);
    if (iter->overlaps == NULL)
        return -1;
    return 0;
}
5681
5682
/*
 * Free a pileup iterator and everything it owns.
 *
 * The destructor hook, if one was registered, is called for every
 * buffered read (all list nodes except the tail) before the nodes are
 * released.  Passing NULL is a no-op.
 */
void bam_plp_destroy(bam_plp_t iter)
{
    lbnode_t *p, *pnext;
    if ( !iter ) return;
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
    for (p = iter->head; p != NULL; p = pnext) {
        if (iter->plp_destruct && p != iter->tail)
            iter->plp_destruct(iter->data, &p->b, &p->cd);
        pnext = p->next;    // read before mp_free(), which clears p->next
        mp_free(iter->mp, p);
    }
    mp_destroy(iter->mp);
    if (iter->b) bam_destroy1(iter->b);
    free(iter->plp);
    free(iter);
}
5697
5698
void bam_plp_constructor(bam_plp_t plp,
5699
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5700
0
    plp->plp_construct = func;
5701
0
}
5702
5703
void bam_plp_destructor(bam_plp_t plp,
5704
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5705
0
    plp->plp_destruct = func;
5706
0
}
5707
5708
//---------------------------------
5709
//---  Tweak overlapping reads
5710
//---------------------------------
5711
5712
/**
5713
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5714
 *  cigar_iref2iseq_next() - get the next CMATCH base
5715
 *  @cigar:       pointer to current cigar block (rw)
5716
 *  @cigar_max:   pointer just beyond the last cigar block
5717
 *  @icig:        position within the current cigar block (rw)
5718
 *  @iseq:        position in the sequence (rw)
5719
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5720
 *
5721
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5722
 *  or -2 on error.
5723
 */
5724
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5725
                                      const uint32_t *cigar_max,
5726
                                      hts_pos_t *icig,
5727
                                      hts_pos_t *iseq,
5728
                                      hts_pos_t *iref)
5729
0
{
5730
0
    hts_pos_t pos = *iref;
5731
0
    if ( pos < 0 ) return -1;
5732
0
    *icig = 0;
5733
0
    *iseq = 0;
5734
0
    *iref = 0;
5735
0
    while ( *cigar<cigar_max )
5736
0
    {
5737
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5738
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5739
5740
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5741
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5742
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5743
0
        {
5744
0
            pos -= ncig;
5745
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5746
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5747
0
            continue;
5748
0
        }
5749
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5750
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5751
0
        {
5752
0
            pos -= ncig;
5753
0
            if ( pos<0 ) pos = 0;
5754
0
            (*cigar)++; *icig = 0; *iref += ncig;
5755
0
            continue;
5756
0
        }
5757
0
        hts_log_error("Unexpected cigar %d", cig);
5758
0
        return -2;
5759
0
    }
5760
0
    *iseq = -1;
5761
0
    return -1;
5762
0
}
5763
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5764
                                       const uint32_t *cigar_max,
5765
                                       hts_pos_t *icig,
5766
                                       hts_pos_t *iseq,
5767
                                       hts_pos_t *iref)
5768
0
{
5769
0
    while ( *cigar < cigar_max )
5770
0
    {
5771
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5772
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5773
5774
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5775
0
        {
5776
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5777
0
            (*iseq)++; (*icig)++; (*iref)++;
5778
0
            return BAM_CMATCH;
5779
0
        }
5780
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5781
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5782
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5783
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5784
0
        hts_log_error("Unexpected cigar %d", cig);
5785
0
        return -2;
5786
0
    }
5787
0
    *iseq = -1;
5788
0
    *iref = -1;
5789
0
    return -1;
5790
0
}
5791
5792
// Given overlapping read 'a' (left) and 'b' (right) on the same
5793
// template, adjust quality values to zero for either a or b.
5794
// Note versions 1.12 and earlier always removed quality from 'b' for
5795
// matching bases.  Now we select a or b semi-randomly based on name hash.
5796
// Returns 0 on success,
5797
//        -1 on failure
5798
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5799
0
{
5800
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5801
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5802
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5803
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5804
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5805
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5806
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5807
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5808
5809
0
    hts_pos_t iref   = b->core.pos;
5810
0
    hts_pos_t a_iref = iref - a->core.pos;
5811
0
    hts_pos_t b_iref = iref - b->core.pos;
5812
5813
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5814
0
                                    &a_icig, &a_iseq, &a_iref);
5815
0
    if ( a_ret<0 )
5816
        // no overlap or error
5817
0
        return a_ret<-1 ? -1:0;
5818
5819
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5820
0
                                    &b_icig, &b_iseq, &b_iref);
5821
0
    if ( b_ret<0 )
5822
        // no overlap or error
5823
0
        return b_ret<-1 ? -1:0;
5824
5825
    // Determine which seq is the one getting modified qualities.
5826
0
    uint8_t amul, bmul;
5827
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5828
0
        amul = 1;
5829
0
        bmul = 0;
5830
0
    } else {
5831
0
        amul = 0;
5832
0
        bmul = 1;
5833
0
    }
5834
5835
    // Loop over the overlapping region nulling qualities in either
5836
    // seq a or b.
5837
0
    int err = 0;
5838
0
    while ( 1 ) {
5839
        // Step to next matching reference position in a and b
5840
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5841
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5842
0
                                         &a_icig, &a_iseq, &a_iref);
5843
0
        if ( a_ret<0 ) { // done
5844
0
            err = a_ret<-1?-1:0;
5845
0
            break;
5846
0
        }
5847
5848
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5849
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5850
0
                                         &b_iseq, &b_iref);
5851
0
        if ( b_ret<0 ) { // done
5852
0
            err = b_ret<-1?-1:0;
5853
0
            break;
5854
0
        }
5855
5856
0
        if ( iref < a_iref + a->core.pos )
5857
0
            iref = a_iref + a->core.pos;
5858
5859
0
        if ( iref < b_iref + b->core.pos )
5860
0
            iref = b_iref + b->core.pos;
5861
5862
0
        iref++;
5863
5864
        // If A or B has a deletion then we catch up the other to this point.
5865
        // We also amend quality values using the same rules for mismatch.
5866
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5867
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5868
0
                && b_cigar > bam_get_cigar(b)
5869
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5870
                // Del in B means it's moved on further than A
5871
0
                do {
5872
0
                    a_qual[a_iseq] = amul
5873
0
                        ? a_qual[a_iseq]*0.8
5874
0
                        : 0;
5875
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5876
0
                                                 &a_icig, &a_iseq, &a_iref);
5877
0
                    if (a_ret < 0)
5878
0
                        return -(a_ret<-1); // 0 or -1
5879
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5880
0
            } else if (a_cigar > bam_get_cigar(a)
5881
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5882
                // Del in A means it's moved on further than B
5883
0
                do {
5884
0
                    b_qual[b_iseq] = bmul
5885
0
                        ? b_qual[b_iseq]*0.8
5886
0
                        : 0;
5887
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5888
0
                                                 &b_icig, &b_iseq, &b_iref);
5889
0
                    if (b_ret < 0)
5890
0
                        return -(b_ret<-1); // 0 or -1
5891
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5892
0
            } else {
5893
                // Anything else, eg ref-skip, we don't support here
5894
0
                continue;
5895
0
            }
5896
0
        }
5897
5898
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5899
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5900
        //         a_cigar-bam_get_cigar(a), a_icig,
5901
        //         b_cigar-bam_get_cigar(b), b_icig,
5902
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5903
        //         a_iseq, b_iseq);
5904
5905
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5906
            // Fell off end of sequence, bad CIGAR?
5907
0
            return -1;
5908
5909
        // We're finally at the same ref base in both a and b.
5910
        // Check if the bases match (confident) or mismatch
5911
        // (not so confident).
5912
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5913
            // We are very confident about this base.  Use sum of quals
5914
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5915
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5916
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5917
0
        } else {
5918
            // Not so confident about anymore given the mismatch.
5919
            // Reduce qual for lowest quality base.
5920
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5921
                // A highest qual base; keep
5922
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5923
0
                b_qual[b_iseq] = 0;
5924
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5925
                // B highest qual base; keep
5926
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5927
0
                a_qual[a_iseq] = 0;
5928
0
            } else {
5929
                // Both equal, so pick randomly
5930
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5931
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5932
0
            }
5933
0
        }
5934
0
    }
5935
5936
0
    return err;
5937
0
}
5938
5939
// Fix overlapping reads. Simple soft-clipping did not give good results.
5940
// Lowering qualities of unwanted bases is more selective and works better.
5941
//
5942
// Returns 0 on success, -1 on failure
5943
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5944
0
{
5945
0
    if ( !iter->overlaps ) return 0;
5946
5947
    // mapped mates and paired reads only
5948
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5949
5950
    // no overlap possible, unless some wild cigar
5951
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5952
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5953
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5954
0
       ) return 0;
5955
5956
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5957
0
    if ( kitr==kh_end(iter->overlaps) )
5958
0
    {
5959
        // Only add reads where the mate is still to arrive
5960
0
        if (node->b.core.mpos >= node->b.core.pos ||
5961
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5962
0
            int ret;
5963
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5964
0
            if (ret < 0) return -1;
5965
0
            kh_value(iter->overlaps, kitr) = node;
5966
0
        }
5967
0
    }
5968
0
    else
5969
0
    {
5970
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5971
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5972
0
        kh_del(olap_hash, iter->overlaps, kitr);
5973
0
        assert(a->end-1 == a->s.end);
5974
0
        return err;
5975
0
    }
5976
0
    return 0;
5977
0
}
5978
5979
// Drop entries from the overlap hash.
//
// With a non-NULL 'b' only the entry keyed by that read's query name is
// removed (if present); with b == NULL every entry is removed.
// Does nothing if overlap detection is not enabled.
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
{
    if (!iter->overlaps)
        return;

    if (!b) {
        // remove all
        khiter_t k = kh_begin(iter->overlaps);
        while (k < kh_end(iter->overlaps)) {
            if (kh_exist(iter->overlaps, k))
                kh_del(olap_hash, iter->overlaps, k);
            k++;
        }
        return;
    }

    khiter_t hit = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
    if (hit != kh_end(iter->overlaps))
        kh_del(olap_hash, iter->overlaps, hit);
}
5997
5998
5999
6000
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
6001
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
6002
// buffer yet (the current position is still the maximum position across all buffered reads).
6003
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6004
0
{
6005
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6006
0
    *_n_plp = 0;
6007
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6008
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6009
0
        int n_plp = 0;
6010
        // write iter->plp at iter->pos
6011
0
        lbnode_t **pptr = &iter->head;
6012
0
        while (*pptr != iter->tail) {
6013
0
            lbnode_t *p = *pptr;
6014
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6015
0
                overlap_remove(iter, &p->b);
6016
0
                if (iter->plp_destruct)
6017
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6018
0
                *pptr = p->next; mp_free(iter->mp, p);
6019
0
            }
6020
0
            else {
6021
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6022
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6023
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6024
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6025
0
                    }
6026
0
                    iter->plp[n_plp].b = &p->b;
6027
0
                    iter->plp[n_plp].cd = p->cd;
6028
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6029
0
                }
6030
0
                pptr = &(*pptr)->next;
6031
0
            }
6032
0
        }
6033
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6034
        // update iter->tid and iter->pos
6035
0
        if (iter->head != iter->tail) {
6036
0
            if (iter->tid > iter->head->b.core.tid) {
6037
0
                hts_log_error("Unsorted input. Pileup aborts");
6038
0
                iter->error = 1;
6039
0
                *_n_plp = -1;
6040
0
                return NULL;
6041
0
            }
6042
0
        }
6043
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6044
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6045
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6046
0
            iter->pos = iter->head->beg; // jump to the next position
6047
0
        } else ++iter->pos; // scan contiguously
6048
        // return
6049
0
        if (n_plp) return iter->plp;
6050
0
        if (iter->is_eof && iter->head == iter->tail) break;
6051
0
    }
6052
0
    return NULL;
6053
0
}
6054
6055
// 32-bit position wrapper around bam_plp64_next().
//
// Behaves like bam_plp64_next() but stores the position in an int.  If
// the 64-bit position is INT_MAX or larger it is reported as an error:
// *_pos is set to INT_MAX, *_n_plp to -1, iter->error is set and NULL is
// returned.
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
    hts_pos_t wide_pos = 0;
    const bam_pileup1_t *pileup = bam_plp64_next(iter, _tid, &wide_pos, _n_plp);

    if (wide_pos >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", wide_pos);
        *_pos = INT_MAX;
        iter->error = 1;
        *_n_plp = -1;
        return NULL;
    }

    *_pos = wide_pos;
    return pileup;
}
6070
6071
// Add one alignment record to the pileup buffer, or signal end of input.
//
// Passing b == NULL marks the iterator as having reached end of input.
// Records with no reference (tid < 0), unmapped records, and records
// starting at the current pileup position once the buffer holds more
// than maxcnt nodes are dropped (and removed from the overlap hash).
// Input must be sorted by (tid, pos); a violation sets iter->error.
//
// Returns 0 on success, -1 on failure.
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
    if (iter->error)
        return -1;

    if (!b) {
        iter->is_eof = 1;
        return 0;
    }

    // Skip only unmapped reads here, any additional filtering must be done in iter->func
    if (b->core.tid < 0
        || (b->core.flag & BAM_FUNMAP)
        || (iter->tid == b->core.tid && iter->pos == b->core.pos
            && iter->mp->cnt > iter->maxcnt)) {
        overlap_remove(iter, b);
        return 0;
    }

    // The tail node is always a spare; fill it in with this record.
    lbnode_t *slot = iter->tail;
    if (bam_copy1(&slot->b, b) == NULL)
        return -1;
    slot->b.id = iter->id++;
    slot->beg = b->core.pos;
    // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
    slot->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
    // initialize cstate_t
    slot->s = g_cstate_null;
    slot->s.end = slot->end - 1;

    if (b->core.tid < iter->max_tid) {
        hts_log_error("The input is not sorted (chromosomes out of order)");
        iter->error = 1;
        return -1;
    }
    if (b->core.tid == iter->max_tid && slot->beg < iter->max_pos) {
        hts_log_error("The input is not sorted (reads out of order)");
        iter->error = 1;
        return -1;
    }
    iter->max_tid = b->core.tid;
    iter->max_pos = slot->beg;

    // A record lying entirely before the current position is not linked
    // in; the spare node will simply be overwritten by the next push.
    if (slot->end <= iter->pos && slot->b.core.tid <= iter->tid)
        return 0;

    lbnode_t *spare = mp_alloc(iter->mp);
    if (!spare) {
        iter->error = 1;
        return -1;
    }

    if ((iter->plp_construct
         && iter->plp_construct(iter->data, &slot->b, &slot->cd) < 0)
        || overlap_push(iter, slot) < 0) {
        mp_free(iter->mp, spare);
        iter->error = 1;
        return -1;
    }

    // Commit the record and make the fresh node the new spare tail.
    slot->next = spare;
    iter->tail = spare;
    return 0;
}
6126
6127
// Return the next pileup position, reading more alignments through the
// user supplied iter->func callback as required.
//
// Returns the piled-up records for the next position, or 0 (NULL) when
// the input is exhausted or an error occurred.  *_n_plp is the number of
// records, 0 at end of data, or -1 on error.  An error raised while
// building a pileup is reported immediately rather than being reset to
// "no data".  A callback return value below -1 is treated as an error
// and stored in iter->error; -1 is treated as end of input.
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
{
    const bam_pileup1_t *plp;
    int ret;

    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }

    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
    // NULL may mean "need more reads" or "failed"; don't let the reset
    // of *_n_plp below hide a failure (notably when is_eof is set).
    if (iter->error) { *_n_plp = -1; return 0; }

    // no pileup line can be obtained; read alignments
    *_n_plp = 0;
    if (iter->is_eof) return 0;

    while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
        if (bam_plp_push(iter, iter->b) < 0) {
            *_n_plp = -1;
            return 0;
        }
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
        if (iter->error) { *_n_plp = -1; return 0; }
        // otherwise no pileup line can be returned; read the next alignment.
    }
    if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }

    // End of input: flag EOF and flush whatever is still buffered.
    if (bam_plp_push(iter, 0) < 0) {
        *_n_plp = -1;
        return 0;
    }
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
    return 0;
}
6153
6154
// 32-bit position wrapper around bam_plp64_auto().
//
// Behaves like bam_plp64_auto() but stores the position in an int.  If
// the 64-bit position is INT_MAX or larger it is reported as an error:
// *_pos is set to INT_MAX, *_n_plp to -1, iter->error is set and NULL is
// returned.
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
    hts_pos_t wide_pos = 0;
    const bam_pileup1_t *pileup = bam_plp64_auto(iter, _tid, &wide_pos, _n_plp);

    if (wide_pos >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", wide_pos);
        *_pos = INT_MAX;
        iter->error = 1;
        *_n_plp = -1;
        return NULL;
    }

    *_pos = wide_pos;
    return pileup;
}
6169
6170
// Return the iterator to its initial empty state so that it can be used
// on a new region or file: clears the overlap hash, resets the current
// and maximum (tid, pos) and the EOF flag, and releases every buffered
// read.  If a client-data destructor has been registered it is invoked
// for each released read, matching what happens when reads are retired
// during normal iteration, so per-read client data is not leaked.
void bam_plp_reset(bam_plp_t iter)
{
    overlap_remove(iter, NULL);
    iter->max_tid = iter->max_pos = -1;
    iter->tid = iter->pos = 0;
    iter->is_eof = 0;
    while (iter->head != iter->tail) {
        lbnode_t *p = iter->head;
        iter->head = p->next;
        if (iter->plp_destruct)
            iter->plp_destruct(iter->data, &p->b, &p->cd);
        mp_free(iter->mp, p);
    }
}
6182
6183
// Set the buffered-read threshold used by bam_plp_push(): once more than
// 'maxcnt' nodes are allocated, further reads starting at the current
// pileup position are dropped.
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
{
    iter->maxcnt = maxcnt;
}
6187
6188
/************************
6189
 *** Mpileup iterator ***
6190
 ************************/
6191
6192
struct bam_mplp_s {
6193
    int n;
6194
    int32_t min_tid, *tid;
6195
    hts_pos_t min_pos, *pos;
6196
    bam_plp_t *iter;
6197
    int *n_plp;
6198
    const bam_pileup1_t **plp;
6199
};
6200
6201
// Create a multi-input pileup iterator over 'n' inputs.
//
//  n     number of inputs
//  func  callback used to fetch the next alignment for an input
//  data  array of n opaque pointers, data[i] being passed to 'func'
//        for input i
//
// Returns the new iterator, or NULL if memory could not be allocated.
// The result should be released with bam_mplp_destroy().
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
{
    int i;
    bam_mplp_t iter;

    iter = calloc(1, sizeof(struct bam_mplp_s));
    if (!iter)
        return NULL;

    iter->pos = calloc(n, sizeof(hts_pos_t));
    iter->tid = calloc(n, sizeof(int32_t));
    iter->n_plp = calloc(n, sizeof(int));
    iter->plp = calloc(n, sizeof(bam_pileup1_t*));
    iter->iter = calloc(n, sizeof(bam_plp_t));

    // calloc() of zero elements may legitimately return NULL, so a NULL
    // array only counts as a failure when there is something to store.
    if (n > 0 && (!iter->pos || !iter->tid || !iter->n_plp
                  || !iter->plp || !iter->iter)) {
        free(iter->iter);
        free(iter->plp);
        free(iter->n_plp);
        free(iter->tid);
        free(iter->pos);
        free(iter);
        return NULL;
    }

    iter->n = n;
    // "Nothing seen yet" sentinels; every input starts on them so that
    // the first bam_mplp64_auto() call fetches from all inputs.
    iter->min_pos = HTS_POS_MAX;
    iter->min_tid = (uint32_t)-1;
    for (i = 0; i < n; ++i) {
        iter->iter[i] = bam_plp_init(func, data[i]);
        iter->pos[i] = iter->min_pos;
        iter->tid[i] = iter->min_tid;
    }
    return iter;
}
6221
6222
// Enable overlap detection on every per-input iterator.  All iterators
// are attempted even if an earlier one fails.
//
// Returns 0 if every call returned 0, otherwise -1.
int bam_mplp_init_overlaps(bam_mplp_t iter)
{
    int failed = 0;

    for (int idx = 0; idx < iter->n; idx++)
        failed |= bam_plp_init_overlaps(iter->iter[idx]);

    return failed ? -1 : 0;
}
6229
6230
// Apply the same buffered-read threshold to every per-input iterator.
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
{
    int idx = 0;

    while (idx < iter->n) {
        iter->iter[idx]->maxcnt = maxcnt;
        idx++;
    }
}
6236
6237
// Destroy every per-input iterator, then free the bookkeeping arrays and
// the multi-iterator itself.
void bam_mplp_destroy(bam_mplp_t iter)
{
    for (int idx = 0; idx < iter->n; idx++)
        bam_plp_destroy(iter->iter[idx]);

    free(iter->iter);
    free(iter->pos);
    free(iter->tid);
    free(iter->n_plp);
    free(iter->plp);
    free(iter);
}
6245
6246
// Advance the multi-input pileup to the next position.
//
// Every input whose held pileup sits at the previously reported minimum
// (tid, pos) is advanced; the new minimum over all inputs still holding
// a pileup is then reported through *_tid / *_pos.  For each input i,
// n_plp[i] / plp[i] receive that input's pileup if it lies on the
// reported position, otherwise 0 / NULL.
//
// Returns the number of inputs with a pileup at the reported position,
//         0 when all inputs are exhausted,
//        -1 on error.
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
{
    int k;
    int n_here = 0;
    hts_pos_t best_pos = HTS_POS_MAX;
    uint32_t best_tid = (uint32_t)-1;

    for (k = 0; k < iter->n; k++) {
        // Inputs that supplied the previous position need refreshing.
        if (iter->pos[k] == iter->min_pos && iter->tid[k] == iter->min_tid) {
            int next_tid;
            hts_pos_t next_pos;

            iter->plp[k] = bam_plp64_auto(iter->iter[k], &next_tid, &next_pos,
                                          &iter->n_plp[k]);
            if (iter->iter[k]->error)
                return -1;

            if (iter->plp[k]) {
                iter->tid[k] = next_tid;
                iter->pos[k] = next_pos;
            } else {
                iter->tid[k] = 0;
                iter->pos[k] = 0;
            }
        }

        if (!iter->plp[k])
            continue;

        // Track the smallest (tid, pos) among inputs with data.
        if (iter->tid[k] < best_tid) {
            best_tid = iter->tid[k];
            best_pos = iter->pos[k];
        } else if (iter->tid[k] == best_tid && iter->pos[k] < best_pos) {
            best_pos = iter->pos[k];
        }
    }

    iter->min_pos = best_pos;
    iter->min_tid = best_tid;
    if (best_pos == HTS_POS_MAX)
        return 0;

    *_tid = best_tid;
    *_pos = best_pos;

    for (k = 0; k < iter->n; k++) {
        if (iter->pos[k] != iter->min_pos || iter->tid[k] != iter->min_tid) {
            n_plp[k] = 0;
            plp[k] = 0;
            continue;
        }
        n_plp[k] = iter->n_plp[k];
        plp[k] = iter->plp[k];
        n_here++;
    }
    return n_here;
}
6286
6287
// 32-bit position wrapper around bam_mplp64_auto().
//
// Returns the bam_mplp64_auto() result.  When that call succeeds but the
// position is INT_MAX or larger, *_pos is set to INT_MAX and -1 is
// returned instead.  *_pos is left untouched if bam_mplp64_auto() fails.
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
{
    hts_pos_t wide_pos = 0;
    int n_here = bam_mplp64_auto(iter, _tid, &wide_pos, n_plp, plp);

    if (n_here < 0)
        return n_here;

    if (wide_pos >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", wide_pos);
        *_pos = INT_MAX;
        return -1;
    }

    *_pos = wide_pos;
    return n_here;
}
6302
6303
// Reset the multi-input iterator and each of its per-input iterators to
// the state they have immediately after bam_mplp_init(): no held pileups
// and all positions on the "nothing seen yet" sentinels.
void bam_mplp_reset(bam_mplp_t iter)
{
    iter->min_tid = (uint32_t)-1;
    iter->min_pos = HTS_POS_MAX;

    for (int idx = 0; idx < iter->n; idx++) {
        bam_plp_reset(iter->iter[idx]);
        iter->tid[idx] = (uint32_t)-1;
        iter->pos[idx] = HTS_POS_MAX;
        iter->plp[idx] = NULL;
        iter->n_plp[idx] = 0;
    }
}
6316
6317
void bam_mplp_constructor(bam_mplp_t iter,
6318
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6319
0
    int i;
6320
0
    for (i = 0; i < iter->n; ++i)
6321
0
        bam_plp_constructor(iter->iter[i], func);
6322
0
}
6323
6324
void bam_mplp_destructor(bam_mplp_t iter,
6325
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6326
0
    int i;
6327
0
    for (i = 0; i < iter->n; ++i)
6328
0
        bam_plp_destructor(iter->iter[i], func);
6329
0
}
6330
6331
#endif // ~!defined(BAM_NO_PILEUP)