Coverage Report

Created: 2025-11-16 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/sam.c
Line
Count
Source
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
#include <regex.h>
40
41
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
42
#include "fuzz_settings.h"
43
#endif
44
45
// Suppress deprecation message for cigar_tab, which we initialise
46
#include "htslib/hts_defs.h"
47
#undef HTS_DEPRECATED
48
#define HTS_DEPRECATED(message)
49
50
#include "htslib/sam.h"
51
#include "htslib/bgzf.h"
52
#include "cram/cram.h"
53
#include "hts_internal.h"
54
#include "sam_internal.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "htslib/hts_expr.h"
58
#include "header.h"
59
60
#include "htslib/khash.h"
61
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
62
KHASH_SET_INIT_INT(tag)
63
64
#ifndef EFTYPE
65
0
#define EFTYPE ENOEXEC
66
#endif
67
#ifndef EOVERFLOW
68
#define EOVERFLOW ERANGE
69
#endif
70
71
/**********************
72
 *** BAM header I/O ***
73
 **********************/
74
75
HTSLIB_EXPORT
76
const int8_t bam_cigar_table[256] = {
77
    // 0 .. 47
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
81
82
    // 48 .. 63  (including =)
83
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
84
85
    // 64 .. 79  (including MIDNHB)
86
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
87
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
88
89
    // 80 .. 95  (including SPX)
90
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
91
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
92
93
    // 96 .. 127
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96
97
    // 128 .. 255
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
105
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
106
};
107
108
sam_hdr_t *sam_hdr_init(void)
109
6.66k
{
110
6.66k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
111
6.66k
    if (bh == NULL) return NULL;
112
113
6.66k
    bh->cigar_tab = bam_cigar_table;
114
6.66k
    return bh;
115
6.66k
}
116
117
void sam_hdr_destroy(sam_hdr_t *bh)
118
15.2k
{
119
15.2k
    int32_t i;
120
121
15.2k
    if (bh == NULL) return;
122
123
8.53k
    if (bh->ref_count > 0) {
124
1.86k
        --bh->ref_count;
125
1.86k
        return;
126
1.86k
    }
127
128
6.66k
    if (bh->target_name) {
129
7.37k
        for (i = 0; i < bh->n_targets; ++i)
130
4.17k
            free(bh->target_name[i]);
131
3.19k
        free(bh->target_name);
132
3.19k
        free(bh->target_len);
133
3.19k
    }
134
6.66k
    free(bh->text);
135
6.66k
    if (bh->hrecs)
136
4.14k
        sam_hrecs_free(bh->hrecs);
137
6.66k
    if (bh->sdict)
138
288
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
139
6.66k
    free(bh);
140
6.66k
}
141
142
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
143
// references before sam_hdr_t::hrecs is populated
144
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
145
0
{
146
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
147
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
148
0
    int i;
149
0
    if (!dest_long_refs) return -1;
150
151
0
    for (i = 0; i < h->n_targets; i++) {
152
0
        int ret;
153
0
        khiter_t ksrc, kdest;
154
0
        if (h->target_len[i] < UINT32_MAX) continue;
155
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
156
0
        if (ksrc == kh_end(src_long_refs)) continue;
157
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
158
0
        if (ret < 0) {
159
0
            kh_destroy(s2i, dest_long_refs);
160
0
            return -1;
161
0
        }
162
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
163
0
    }
164
165
0
    h->sdict = dest_long_refs;
166
0
    return 0;
167
0
}
168
169
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
170
3.87k
{
171
3.87k
    if (h0 == NULL) return NULL;
172
3.87k
    sam_hdr_t *h;
173
3.87k
    if ((h = sam_hdr_init()) == NULL) return NULL;
174
    // copy the simple data
175
3.87k
    h->n_targets = 0;
176
3.87k
    h->ignore_sam_err = h0->ignore_sam_err;
177
3.87k
    h->l_text = 0;
178
179
    // Then the pointery stuff
180
181
3.87k
    if (!h0->hrecs) {
182
3
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
183
3
        if (!h->target_len) goto fail;
184
3
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
185
3
        if (!h->target_name) goto fail;
186
187
3
        int i;
188
3
        for (i = 0; i < h0->n_targets; ++i) {
189
0
            h->target_len[i] = h0->target_len[i];
190
0
            h->target_name[i] = strdup(h0->target_name[i]);
191
0
            if (!h->target_name[i]) break;
192
0
        }
193
3
        h->n_targets = i;
194
3
        if (i < h0->n_targets) goto fail;
195
196
3
        if (h0->sdict) {
197
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
198
0
        }
199
3
    }
200
201
3.87k
    if (h0->hrecs) {
202
3.87k
        kstring_t tmp = { 0, 0, NULL };
203
3.87k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
204
0
            free(ks_release(&tmp));
205
0
            goto fail;
206
0
        }
207
208
3.87k
        h->l_text = tmp.l;
209
3.87k
        h->text   = ks_release(&tmp);
210
211
3.87k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
212
0
            goto fail;
213
3.87k
    } else {
214
3
        h->l_text = h0->text ? h0->l_text : 0;
215
3
        h->text = malloc(h->l_text + 1);
216
3
        if (!h->text) goto fail;
217
3
        if (h0->text)
218
3
            memcpy(h->text, h0->text, h->l_text);
219
3
        h->text[h->l_text] = '\0';
220
3
    }
221
222
3.87k
    return h;
223
224
0
 fail:
225
0
    sam_hdr_destroy(h);
226
0
    return NULL;
227
3.87k
}
228
229
sam_hdr_t *bam_hdr_read(BGZF *fp)
230
42
{
231
42
    sam_hdr_t *h;
232
42
    uint8_t buf[4];
233
42
    int magic_len, has_EOF;
234
42
    int32_t i, name_len, num_names = 0;
235
42
    size_t bufsize;
236
42
    ssize_t bytes;
237
    // check EOF
238
42
    has_EOF = bgzf_check_EOF(fp);
239
42
    if (has_EOF < 0) {
240
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
241
42
    } else if (has_EOF == 0) {
242
42
        hts_log_warning("EOF marker is absent. The input is probably truncated");
243
42
    }
244
    // read "BAM1"
245
42
    magic_len = bgzf_read(fp, buf, 4);
246
42
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
247
0
        hts_log_error("Invalid BAM binary header");
248
0
        return 0;
249
0
    }
250
42
    h = sam_hdr_init();
251
42
    if (!h) goto nomem;
252
253
    // read plain text and the number of reference sequences
254
42
    bytes = bgzf_read(fp, buf, 4);
255
42
    if (bytes != 4) goto read_err;
256
42
    h->l_text = le_to_u32(buf);
257
258
42
    bufsize = h->l_text + 1;
259
42
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
260
42
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
261
42
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
262
42
#endif
263
42
    h->text = (char*)malloc(bufsize);
264
42
    if (!h->text) goto nomem;
265
42
    h->text[h->l_text] = 0; // make sure it is NULL terminated
266
42
    bytes = bgzf_read(fp, h->text, h->l_text);
267
42
    if (bytes != h->l_text) goto read_err;
268
269
42
    bytes = bgzf_read(fp, &h->n_targets, 4);
270
42
    if (bytes != 4) goto read_err;
271
42
    if (fp->is_be) ed_swap_4p(&h->n_targets);
272
273
42
    if (h->n_targets < 0) goto invalid;
274
275
    // read reference sequence names and lengths
276
42
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
277
42
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
278
0
        goto nomem;
279
42
#endif
280
42
    if (h->n_targets > 0) {
281
21
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
282
21
        if (!h->target_name) goto nomem;
283
21
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
284
21
        if (!h->target_len) goto nomem;
285
21
    }
286
21
    else {
287
21
        h->target_name = NULL;
288
21
        h->target_len = NULL;
289
21
    }
290
291
57
    for (i = 0; i != h->n_targets; ++i) {
292
24
        bytes = bgzf_read(fp, &name_len, 4);
293
24
        if (bytes != 4) goto read_err;
294
24
        if (fp->is_be) ed_swap_4p(&name_len);
295
24
        if (name_len <= 0) goto invalid;
296
297
24
        h->target_name[i] = (char*)malloc(name_len);
298
24
        if (!h->target_name[i]) goto nomem;
299
24
        num_names++;
300
301
24
        bytes = bgzf_read(fp, h->target_name[i], name_len);
302
24
        if (bytes != name_len) goto read_err;
303
304
15
        if (h->target_name[i][name_len - 1] != '\0') {
305
            /* Fix missing NUL-termination.  Is this being too nice?
306
               We could alternatively bail out with an error. */
307
15
            char *new_name;
308
15
            if (name_len == INT32_MAX) goto invalid;
309
15
            new_name = realloc(h->target_name[i], name_len + 1);
310
15
            if (new_name == NULL) goto nomem;
311
15
            h->target_name[i] = new_name;
312
15
            h->target_name[i][name_len] = '\0';
313
15
        }
314
315
15
        bytes = bgzf_read(fp, &h->target_len[i], 4);
316
15
        if (bytes != 4) goto read_err;
317
15
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
318
15
    }
319
33
    return h;
320
321
0
 nomem:
322
0
    hts_log_error("Out of memory");
323
0
    goto clean;
324
325
9
 read_err:
326
9
    if (bytes < 0) {
327
9
        hts_log_error("Error reading BGZF stream");
328
9
    } else {
329
0
        hts_log_error("Truncated BAM header");
330
0
    }
331
9
    goto clean;
332
333
0
 invalid:
334
0
    hts_log_error("Invalid BAM binary header");
335
336
9
 clean:
337
9
    if (h != NULL) {
338
9
        h->n_targets = num_names; // ensure we free only allocated target_names
339
9
        sam_hdr_destroy(h);
340
9
    }
341
9
    return NULL;
342
0
}
343
344
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
345
837
{
346
837
    int32_t i, name_len, x;
347
837
    kstring_t hdr_ks = { 0, 0, NULL };
348
837
    char *text;
349
837
    uint32_t l_text;
350
351
837
    if (!h) return -1;
352
353
837
    if (h->hrecs) {
354
836
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
355
836
        if (hdr_ks.l > UINT32_MAX) {
356
0
            hts_log_error("Header too long for BAM format");
357
0
            free(hdr_ks.s);
358
0
            return -1;
359
836
        } else if (hdr_ks.l > INT32_MAX) {
360
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
361
0
            hts_log_warning("Output file may not be portable");
362
0
        }
363
836
        text = hdr_ks.s;
364
836
        l_text = hdr_ks.l;
365
836
    } else {
366
1
        if (h->l_text > UINT32_MAX) {
367
0
            hts_log_error("Header too long for BAM format");
368
0
            return -1;
369
1
        } else if (h->l_text > INT32_MAX) {
370
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
371
0
            hts_log_warning("Output file may not be portable");
372
0
        }
373
1
        text = h->text;
374
1
        l_text = h->l_text;
375
1
    }
376
    // write "BAM1"
377
837
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
378
    // write plain text and the number of reference sequences
379
837
    if (fp->is_be) {
380
0
        x = ed_swap_4(l_text);
381
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
382
0
        if (l_text) {
383
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
384
0
        }
385
0
        x = ed_swap_4(h->n_targets);
386
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
387
837
    } else {
388
837
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
389
837
        if (l_text) {
390
489
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
391
489
        }
392
837
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
393
837
    }
394
837
    free(hdr_ks.s);
395
    // write sequence names and lengths
396
1.37k
    for (i = 0; i != h->n_targets; ++i) {
397
539
        char *p = h->target_name[i];
398
539
        name_len = strlen(p) + 1;
399
539
        if (fp->is_be) {
400
0
            x = ed_swap_4(name_len);
401
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
402
539
        } else {
403
539
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
404
539
        }
405
539
        if (bgzf_write(fp, p, name_len) < 0) return -1;
406
539
        if (fp->is_be) {
407
0
            x = ed_swap_4(h->target_len[i]);
408
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
409
539
        } else {
410
539
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
411
539
        }
412
539
    }
413
837
    if (bgzf_flush(fp) < 0) return -1;
414
837
    return 0;
415
837
}
416
417
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
418
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
419
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
420
0
}
421
422
/*************************
423
 *** BAM alignment I/O ***
424
 *************************/
425
426
bam1_t *bam_init1(void)
427
536k
{
428
536k
    return (bam1_t*)calloc(1, sizeof(bam1_t));
429
536k
}
430
431
int sam_realloc_bam_data(bam1_t *b, size_t desired)
432
557k
{
433
557k
    uint32_t new_m_data;
434
557k
    uint8_t *new_data;
435
557k
    new_m_data = desired;
436
557k
    kroundup32(new_m_data); // next power of 2
437
557k
    new_m_data += 32; // reduces malloc arena migrations?
438
557k
    if (new_m_data < desired) {
439
0
        errno = ENOMEM; // Not strictly true but we can't store the size
440
0
        return -1;
441
0
    }
442
557k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
443
557k
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
444
2
        errno = ENOMEM;
445
2
        return -1;
446
2
    }
447
557k
#endif
448
557k
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
449
557k
        new_data = realloc(b->data, new_m_data);
450
557k
    } else {
451
0
        if ((new_data = malloc(new_m_data)) != NULL) {
452
0
            if (b->l_data > 0)
453
0
                memcpy(new_data, b->data,
454
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
455
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
456
0
        }
457
0
    }
458
557k
    if (!new_data) return -1;
459
557k
    b->data = new_data;
460
557k
    b->m_data = new_m_data;
461
557k
    return 0;
462
557k
}
463
464
void bam_destroy1(bam1_t *b)
465
5.35M
{
466
5.35M
    if (b == 0) return;
467
536k
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
468
536k
        free(b->data);
469
536k
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
470
            // In case of reuse
471
0
            b->data = NULL;
472
0
            b->m_data = 0;
473
0
            b->l_data = 0;
474
0
        }
475
536k
    }
476
477
536k
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
478
536k
        free(b);
479
536k
}
480
481
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
482
3.79M
{
483
3.79M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
484
3.79M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
485
3.79M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
486
3.79M
    bdst->l_data = bsrc->l_data;
487
3.79M
    bdst->id = bsrc->id;
488
3.79M
    return bdst;
489
3.79M
}
490
491
bam1_t *bam_dup1(const bam1_t *bsrc)
492
533k
{
493
533k
    if (bsrc == NULL) return NULL;
494
533k
    bam1_t *bdst = bam_init1();
495
533k
    if (bdst == NULL) return NULL;
496
533k
    if (bam_copy1(bdst, bsrc) == NULL) {
497
0
        bam_destroy1(bdst);
498
0
        return NULL;
499
0
    }
500
533k
    return bdst;
501
533k
}
502
503
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
504
                             hts_pos_t *rlen, hts_pos_t *qlen)
505
189
{
506
189
    int k;
507
189
    *rlen = *qlen = 0;
508
627
    for (k = 0; k < n_cigar; ++k) {
509
438
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
510
438
        int len = bam_cigar_oplen(cigar[k]);
511
438
        if (type & 1) *qlen += len;
512
438
        if (type & 2) *rlen += len;
513
438
    }
514
189
}
515
516
static int subtract_check_underflow(size_t length, size_t *limit)
517
56.5M
{
518
56.5M
    if (length <= *limit) {
519
56.5M
        *limit -= length;
520
56.5M
        return 0;
521
56.5M
    }
522
523
0
    return -1;
524
56.5M
}
525
526
int bam_set1(bam1_t *bam,
527
             size_t l_qname, const char *qname,
528
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
529
             size_t n_cigar, const uint32_t *cigar,
530
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
531
             size_t l_seq, const char *seq, const char *qual,
532
             size_t l_aux)
533
11.3M
{
534
    // use a default qname "*" if none is provided
535
11.3M
    if (l_qname == 0) {
536
10.1M
        l_qname = 1;
537
10.1M
        qname = "*";
538
10.1M
    }
539
540
    // note: the qname is stored nul terminated and padded as described in the
541
    // documentation for the bam1_t struct.
542
11.3M
    size_t qname_nuls = 4 - l_qname % 4;
543
544
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
545
    // can't use bam_endpos() directly as some fields not yet set up.
546
11.3M
    hts_pos_t rlen = 0, qlen = 0;
547
11.3M
    if (!(flag & BAM_FUNMAP)) {
548
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
549
0
    }
550
11.3M
    if (rlen == 0) {
551
11.3M
        rlen = 1;
552
11.3M
    }
553
554
    // validate parameters
555
11.3M
    if (l_qname > 254) {
556
48
        hts_log_error("Query name too long");
557
48
        errno = EINVAL;
558
48
        return -1;
559
48
    }
560
11.3M
    if (HTS_POS_MAX - rlen <= pos) {
561
0
        hts_log_error("Read ends beyond highest supported position");
562
0
        errno = EINVAL;
563
0
        return -1;
564
0
    }
565
11.3M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
566
0
        hts_log_error("Mapped query must have a CIGAR");
567
0
        errno = EINVAL;
568
0
        return -1;
569
0
    }
570
11.3M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
571
0
        hts_log_error("CIGAR and query sequence are of different length");
572
0
        errno = EINVAL;
573
0
        return -1;
574
0
    }
575
576
11.3M
    size_t limit = INT32_MAX;
577
11.3M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
578
11.3M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
579
11.3M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
580
11.3M
    u    += subtract_check_underflow(l_seq, &limit);
581
11.3M
    u    += subtract_check_underflow(l_aux, &limit);
582
11.3M
    if (u != 0) {
583
0
        hts_log_error("Size overflow");
584
0
        errno = EINVAL;
585
0
        return -1;
586
0
    }
587
588
    // re-allocate the data buffer as needed.
589
11.3M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
590
11.3M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
591
0
        return -1;
592
0
    }
593
594
11.3M
    bam->l_data = (int)data_len;
595
11.3M
    bam->core.pos = pos;
596
11.3M
    bam->core.tid = tid;
597
11.3M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
598
11.3M
    bam->core.qual = mapq;
599
11.3M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
600
11.3M
    bam->core.flag = flag;
601
11.3M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
602
11.3M
    bam->core.n_cigar = (uint32_t)n_cigar;
603
11.3M
    bam->core.l_qseq = (int32_t)l_seq;
604
11.3M
    bam->core.mtid = mtid;
605
11.3M
    bam->core.mpos = mpos;
606
11.3M
    bam->core.isize = isize;
607
608
11.3M
    uint8_t *cp = bam->data;
609
11.3M
    strncpy((char *)cp, qname, l_qname);
610
11.3M
    int i;
611
44.8M
    for (i = 0; i < qname_nuls; i++) {
612
33.5M
        cp[l_qname + i] = '\0';
613
33.5M
    }
614
11.3M
    cp += l_qname + qname_nuls;
615
616
11.3M
    if (n_cigar > 0) {
617
0
        memcpy(cp, cigar, n_cigar * 4);
618
0
    }
619
11.3M
    cp += n_cigar * 4;
620
621
455M
#define NN 16
622
11.3M
    const uint8_t *useq = (uint8_t *)seq;
623
48.3M
    for (i = 0; i + NN < l_seq; i += NN) {
624
37.0M
        int j;
625
37.0M
        const uint8_t *u2 = useq+i;
626
333M
        for (j = 0; j < NN/2; j++)
627
296M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
628
37.0M
        cp += NN/2;
629
37.0M
    }
630
12.5M
    for (; i + 1 < l_seq; i += 2) {
631
1.20M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
632
1.20M
    }
633
634
11.4M
    for (; i < l_seq; i++) {
635
128k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
636
128k
    }
637
638
11.3M
    if (qual) {
639
486
        memcpy(cp, qual, l_seq);
640
486
    }
641
11.3M
    else {
642
11.3M
        memset(cp, '\xff', l_seq);
643
11.3M
    }
644
645
11.3M
    return (int)data_len;
646
11.3M
}
647
648
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
649
3.74M
{
650
3.74M
    int k;
651
3.74M
    hts_pos_t l;
652
4.48M
    for (k = l = 0; k < n_cigar; ++k)
653
734k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
654
649k
            l += bam_cigar_oplen(cigar[k]);
655
3.74M
    return l;
656
3.74M
}
657
658
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
659
66.5k
{
660
66.5k
    int k;
661
66.5k
    hts_pos_t l;
662
2.61M
    for (k = l = 0; k < n_cigar; ++k)
663
2.54M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
664
2.34M
            l += bam_cigar_oplen(cigar[k]);
665
66.5k
    return l;
666
66.5k
}
667
668
hts_pos_t bam_endpos(const bam1_t *b)
669
390
{
670
390
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
671
390
    if (rlen == 0) rlen = 1;
672
390
    return b->core.pos + rlen;
673
390
}
674
675
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
676
91.8k
{
677
91.8k
    bam1_core_t *c = &b->core;
678
679
    // Bail out as fast as possible for the easy case
680
91.8k
    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
681
91.8k
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
682
69.2k
        return 0;
683
684
    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
685
    // but this is much less likely so do as a secondary check.
686
22.5k
    if (c->tid < 0 || c->pos < 0)
687
10.9k
        return 0;
688
689
    // Do we have a CG tag?
690
11.5k
    uint8_t *CG = bam_aux_get(b, "CG");
691
11.5k
    int saved_errno = errno;
692
11.5k
    if (!CG) {
693
11.0k
        if (errno != ENOENT) return -1;  // Bad aux data
694
11.0k
        errno = saved_errno; // restore errno on expected no-CG-tag case
695
11.0k
        return 0;
696
11.0k
    }
697
698
    // Now we start with the serious work migrating CG to CIGAR
699
512
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
700
512
        *cigar0, CG_len, fake_bytes;
701
512
    cigar0 = bam_get_cigar(b);
702
512
    fake_bytes = c->n_cigar * 4;
703
512
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
704
122
        return 0; // not of type B,I
705
390
    CG_len = le_to_u32(CG + 2);
706
    // don't move if the real CIGAR length is shorter than the fake cigar length
707
390
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
708
709
    // move from the CG tag to the right position
710
390
    cigar_st = (uint8_t*)cigar0 - b->data;
711
390
    c->n_cigar = CG_len;
712
390
    n_cigar4 = c->n_cigar * 4;
713
390
    CG_st = CG - b->data - 2;
714
390
    CG_en = CG_st + 8 + n_cigar4;
715
390
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
716
    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
717
390
    b->l_data = b->l_data - fake_bytes + n_cigar4;
718
    // insert c->n_cigar-fake_bytes empty space to make room
719
390
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
720
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
721
390
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
722
390
    if (ori_len > CG_en) // move data after the CG tag
723
43
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
724
390
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
725
390
    if (recal_bin)
726
390
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
727
390
    if (give_warning)
728
390
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
729
390
    return 1;
730
390
}
731
732
static inline int aux_type2size(uint8_t type)
733
1.48M
{
734
1.48M
    switch (type) {
735
822k
    case 'A': case 'c': case 'C':
736
822k
        return 1;
737
89.8k
    case 's': case 'S':
738
89.8k
        return 2;
739
280k
    case 'i': case 'I': case 'f':
740
280k
        return 4;
741
4.86k
    case 'd':
742
4.86k
        return 8;
743
290k
    case 'Z': case 'H': case 'B':
744
290k
        return type;
745
47
    default:
746
47
        return 0;
747
1.48M
    }
748
1.48M
}
749
750
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
751
0
{
752
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
753
0
    uint32_t i;
754
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
755
0
}
756
757
// Fix bad records where qname is not terminated correctly.
758
97
static int fixup_missing_qname_nul(bam1_t *b) {
759
97
    bam1_core_t *c = &b->core;
760
761
    // Note this is called before c->l_extranul is added to c->l_qname
762
97
    if (c->l_extranul > 0) {
763
97
        b->data[c->l_qname++] = '\0';
764
97
        c->l_extranul--;
765
97
    } else {
766
0
        if (b->l_data > INT_MAX - 4) return -1;
767
0
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
768
0
        b->l_data += 4;
769
0
        b->data[c->l_qname++] = '\0';
770
0
        c->l_extranul = 3;
771
0
    }
772
97
    return 0;
773
97
}
774
775
/*
776
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
777
 * in multi-threaded handling.  This may be worth considering for htslib2.
778
 */
779
int bam_read1(BGZF *fp, bam1_t *b)
780
209
{
781
209
    bam1_core_t *c = &b->core;
782
209
    int32_t block_len, ret, i;
783
209
    uint32_t new_l_data;
784
209
    uint8_t tmp[32], *x;
785
786
209
    b->l_data = 0;
787
788
209
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
789
0
        if (ret == 0) return -1; // normal end-of-file
790
0
        else return -2; // truncated
791
0
    }
792
209
    if (fp->is_be)
793
0
        ed_swap_4p(&block_len);
794
209
    if (block_len < 32) return -4;  // block_len includes core data
795
209
    if (fp->block_length - fp->block_offset > 32) {
796
        // Avoid bgzf_read and a temporary copy to a local buffer
797
207
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
798
207
        fp->block_offset += 32;
799
207
    } else {
800
2
        x = tmp;
801
2
        if (bgzf_read(fp, x, 32) != 32) return -3;
802
2
    }
803
804
207
    c->tid        = le_to_u32(x);
805
207
    c->pos        = le_to_i32(x+4);
806
207
    uint32_t x2   = le_to_u32(x+8);
807
207
    c->bin        = x2>>16;
808
207
    c->qual       = x2>>8&0xff;
809
207
    c->l_qname    = x2&0xff;
810
207
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
811
207
    uint32_t x3   = le_to_u32(x+12);
812
207
    c->flag       = x3>>16;
813
207
    c->n_cigar    = x3&0xffff;
814
207
    c->l_qseq     = le_to_u32(x+16);
815
207
    c->mtid       = le_to_u32(x+20);
816
207
    c->mpos       = le_to_i32(x+24);
817
207
    c->isize      = le_to_i32(x+28);
818
819
207
    new_l_data = block_len - 32 + c->l_extranul;
820
207
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
821
202
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
822
202
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
823
5
        return -4;
824
197
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
825
195
    b->l_data = new_l_data;
826
827
195
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
828
193
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
829
97
        if (fixup_missing_qname_nul(b) < 0) return -4;
830
97
    }
831
362
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
832
193
    c->l_qname += c->l_extranul;
833
193
    if (b->l_data < c->l_qname ||
834
193
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
835
4
        return -4;
836
189
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
837
189
    if (bam_tag2cigar(b, 0, 0) < 0)
838
0
        return -4;
839
840
    // TODO: consider making this conditional
841
189
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
842
189
        hts_pos_t rlen, qlen;
843
189
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
844
189
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
845
189
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
846
        // Sanity check for broken CIGAR alignments
847
189
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
848
2
            hts_log_error("CIGAR and query sequence lengths differ for %s",
849
2
                    bam_get_qname(b));
850
2
            return -4;
851
2
        }
852
189
    }
853
854
187
    return 4 + block_len;
855
189
}
856
857
int bam_write1(BGZF *fp, const bam1_t *b)
858
3.79M
{
859
3.79M
    const bam1_core_t *c = &b->core;
860
3.79M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
861
3.79M
    int i, ok;
862
3.79M
    if (c->l_qname - c->l_extranul > 255) {
863
0
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
864
0
        errno = EOVERFLOW;
865
0
        return -1;
866
0
    }
867
3.79M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
868
3.79M
    if (c->pos > INT_MAX ||
869
3.79M
        c->mpos > INT_MAX ||
870
3.79M
        c->isize < INT_MIN || c->isize > INT_MAX) {
871
28
        hts_log_error("Positional data is too large for BAM format");
872
28
        return -1;
873
28
    }
874
3.79M
    x[0] = c->tid;
875
3.79M
    x[1] = c->pos;
876
3.79M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
877
3.79M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
878
3.79M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
879
3.79M
    x[4] = c->l_qseq;
880
3.79M
    x[5] = c->mtid;
881
3.79M
    x[6] = c->mpos;
882
3.79M
    x[7] = c->isize;
883
3.79M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
884
3.79M
    if (fp->is_be) {
885
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
886
0
        y = block_len;
887
0
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
888
0
        swap_data(c, b->l_data, b->data, 1);
889
3.79M
    } else {
890
3.79M
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
891
3.79M
    }
892
3.79M
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
893
3.79M
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
894
3.79M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
895
3.79M
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
896
3.79M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
897
6
        uint8_t buf[8];
898
6
        uint32_t cigar_st, cigar_en, cigar[2];
899
6
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
900
6
        if (cigreflen >= (1<<28)) {
901
            // Length of reference covered is greater than the biggest
902
            // CIGAR operation currently allowed.
903
0
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
904
0
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
905
0
                          bam_get_qname(b), c->n_cigar, cigreflen);
906
0
            return -1;
907
0
        }
908
6
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
909
6
        cigar_en = cigar_st + c->n_cigar * 4;
910
6
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
911
6
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
912
6
        u32_to_le(cigar[0], buf);
913
6
        u32_to_le(cigar[1], buf + 4);
914
6
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
915
6
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
916
6
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
917
6
        u32_to_le(c->n_cigar, buf);
918
6
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
919
6
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
920
6
    }
921
3.79M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
922
3.79M
    return ok? 4 + block_len : -1;
923
3.79M
}
924
925
/*
926
 * Write a BAM file and append to the in-memory index simultaneously.
927
 */
928
3.79M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
929
3.79M
    BGZF *bfp = fp->fp.bgzf;
930
931
3.79M
    if (!fp->idx)
932
3.79M
        return bam_write1(bfp, b);
933
934
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
935
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
936
0
        return -1;
937
0
    if (!bfp->mt)
938
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
939
940
0
    int ret = bam_write1(bfp, b);
941
0
    if (ret < 0)
942
0
        return -1;
943
944
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
945
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
946
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
947
0
        ret = -1;
948
0
    }
949
950
0
    return ret;
951
0
}
952
953
/*
954
 * Set the qname in a BAM record
955
 */
956
int bam_set_qname(bam1_t *rec, const char *qname)
957
0
{
958
0
    if (!rec) return -1;
959
0
    if (!qname || !*qname) return -1;
960
961
0
    size_t old_len = rec->core.l_qname;
962
0
    size_t new_len = strlen(qname) + 1;
963
0
    if (new_len < 1 || new_len > 255) return -1;
964
965
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
966
967
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
968
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
969
970
    // Make room
971
0
    if (new_len + extranul != rec->core.l_qname)
972
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
973
    // Copy in new name and pad if needed
974
0
    memcpy(rec->data, qname, new_len);
975
0
    int n;
976
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
977
978
0
    rec->l_data = new_data_len;
979
0
    rec->core.l_qname = new_len + extranul;
980
0
    rec->core.l_extranul = extranul;
981
982
0
    return 0;
983
0
}
984
985
/********************
986
 *** BAM indexing ***
987
 ********************/
988
989
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
990
0
{
991
0
    int n_lvls, i, fmt, ret;
992
0
    bam1_t *b;
993
0
    hts_idx_t *idx;
994
0
    sam_hdr_t *h;
995
0
    h = sam_hdr_read(fp);
996
0
    if (h == NULL) return NULL;
997
0
    if (min_shift > 0) {
998
0
        hts_pos_t max_len = 0, s;
999
0
        for (i = 0; i < h->n_targets; ++i) {
1000
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
1001
0
            if (max_len < len) max_len = len;
1002
0
        }
1003
0
        max_len += 256;
1004
0
        for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1005
0
        fmt = HTS_FMT_CSI;
1006
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1007
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1008
0
    b = bam_init1();
1009
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
1010
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
1011
0
        if (ret < 0) { // unsorted or doesn't fit
1012
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
1013
0
            goto err;
1014
0
        }
1015
0
    }
1016
0
    if (ret < -1) goto err; // corrupted BAM file
1017
1018
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
1019
0
    sam_hdr_destroy(h);
1020
0
    bam_destroy1(b);
1021
0
    return idx;
1022
1023
0
err:
1024
0
    bam_destroy1(b);
1025
0
    hts_idx_destroy(idx);
1026
0
    return NULL;
1027
0
}
1028
1029
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1030
0
{
1031
0
    hts_idx_t *idx;
1032
0
    htsFile *fp;
1033
0
    int ret = 0;
1034
1035
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1036
0
    if (nthreads)
1037
0
        hts_set_threads(fp, nthreads);
1038
1039
0
    switch (fp->format.format) {
1040
0
    case cram:
1041
1042
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1043
0
        break;
1044
1045
0
    case bam:
1046
0
    case sam:
1047
0
        if (fp->format.compression != bgzf) {
1048
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1049
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1050
0
            ret = -1;
1051
0
            break;
1052
0
        }
1053
0
        idx = sam_index(fp, min_shift);
1054
0
        if (idx) {
1055
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1056
0
            if (ret < 0) ret = -4;
1057
0
            hts_idx_destroy(idx);
1058
0
        }
1059
0
        else ret = -1;
1060
0
        break;
1061
1062
0
    default:
1063
0
        ret = -3;
1064
0
        break;
1065
0
    }
1066
0
    hts_close(fp);
1067
1068
0
    return ret;
1069
0
}
1070
1071
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1072
0
{
1073
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1074
0
}
1075
1076
int sam_index_build(const char *fn, int min_shift)
1077
0
{
1078
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1079
0
}
1080
1081
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1082
#undef bam_index_build
1083
int bam_index_build(const char *fn, int min_shift)
1084
0
{
1085
0
    return sam_index_build2(fn, NULL, min_shift);
1086
0
}
1087
1088
// Initialise fp->idx for the current format type.
1089
// This must be called after the header has been written but no other data.
1090
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1091
0
    fp->fnidx = fnidx;
1092
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1093
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1094
0
        int n_lvls, fmt = HTS_FMT_CSI;
1095
0
        if (min_shift > 0) {
1096
0
            int64_t max_len = 0, s;
1097
0
            int i;
1098
0
            for (i = 0; i < h->n_targets; ++i)
1099
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1100
0
            max_len += 256;
1101
0
            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1102
1103
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1104
1105
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1106
0
        return fp->idx ? 0 : -1;
1107
0
    }
1108
1109
0
    if (fp->format.format == cram) {
1110
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1111
0
        return fp->fp.cram->idxfp ? 0 : -1;
1112
0
    }
1113
1114
0
    return -1;
1115
0
}
1116
1117
// Finishes an index. Call after the last record has been written.
1118
// Returns 0 on success, <0 on failure.
1119
0
int sam_idx_save(htsFile *fp) {
1120
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1121
0
        fp->format.format == vcf || fp->format.format == sam) {
1122
0
        int ret;
1123
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1124
0
            errno = -ret;
1125
0
            return -1;
1126
0
        }
1127
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1128
0
            return -1;
1129
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1130
1131
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1132
0
            return -1;
1133
1134
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1135
1136
0
    } else if (fp->format.format == cram) {
1137
        // flushed and closed by cram_close
1138
0
    }
1139
1140
0
    return 0;
1141
0
}
1142
1143
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1144
0
{
1145
0
    htsFile *fp = (htsFile *)fpv;
1146
0
    bam1_t *b = bv;
1147
0
    fp->line.l = 0;
1148
0
    int ret = sam_read1(fp, fp->bam_header, b);
1149
0
    if (ret >= 0) {
1150
0
        *tid = b->core.tid;
1151
0
        *beg = b->core.pos;
1152
0
        *end = bam_endpos(b);
1153
0
    }
1154
0
    return ret;
1155
0
}
1156
1157
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1158
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1159
0
{
1160
0
    htsFile *fp = (htsFile *)fpv;
1161
0
    bam1_t *b = bv;
1162
0
    fp->line.l = 0;
1163
0
    int ret = sam_read1(fp, fp->bam_header, b);
1164
0
    return ret;
1165
0
}
1166
1167
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1168
// samtools/bam.c.
1169
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1170
0
{
1171
0
    const char *rg;
1172
0
    kstring_t lib = { 0, 0, NULL };
1173
0
    rg = (char *)bam_aux_get(b, "RG");
1174
1175
0
    if (!rg)
1176
0
        return NULL;
1177
0
    else
1178
0
        rg++;
1179
1180
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1181
0
        return NULL;
1182
1183
0
    static char LB_text[1024];
1184
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1185
1186
0
    memcpy(LB_text, lib.s, len);
1187
0
    LB_text[len] = 0;
1188
1189
0
    free(lib.s);
1190
1191
0
    return LB_text;
1192
0
}
1193
1194
1195
// Bam record pointer and SAM header combined
1196
typedef struct {
1197
    const sam_hdr_t *h;
1198
    const bam1_t *b;
1199
} hb_pair;
1200
1201
// Looks up variable names in str and replaces them with their value.
1202
// Also supports aux tags.
1203
//
1204
// Note the expression parser deliberately overallocates str size so it
1205
// is safe to use memcmp over strcmp.
1206
static int bam_sym_lookup(void *data, char *str, char **end,
1207
0
                          hts_expr_val_t *res) {
1208
0
    hb_pair *hb = (hb_pair *)data;
1209
0
    const bam1_t *b = hb->b;
1210
1211
0
    res->is_str = 0;
1212
0
    switch(*str) {
1213
0
    case 'c':
1214
0
        if (memcmp(str, "cigar", 5) == 0) {
1215
0
            *end = str+5;
1216
0
            res->is_str = 1;
1217
0
            ks_clear(&res->s);
1218
0
            uint32_t *cigar = bam_get_cigar(b);
1219
0
            int i, n = b->core.n_cigar, r = 0;
1220
0
            if (n) {
1221
0
                for (i = 0; i < n; i++) {
1222
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1223
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1224
0
                }
1225
0
                r |= kputs("", &res->s) < 0;
1226
0
            } else {
1227
0
                r |= kputs("*", &res->s) < 0;
1228
0
            }
1229
0
            return r ? -1 : 0;
1230
0
        }
1231
0
        break;
1232
1233
0
    case 'e':
1234
0
        if (memcmp(str, "endpos", 6) == 0) {
1235
0
            *end = str+6;
1236
0
            res->d = bam_endpos(b);
1237
0
            return 0;
1238
0
        }
1239
0
        break;
1240
1241
0
    case 'f':
1242
0
        if (memcmp(str, "flag", 4) == 0) {
1243
0
            str = *end = str+4;
1244
0
            if (*str != '.') {
1245
0
                res->d = b->core.flag;
1246
0
                return 0;
1247
0
            } else {
1248
0
                str++;
1249
0
                if (!memcmp(str, "paired", 6)) {
1250
0
                    *end = str+6;
1251
0
                    res->d = b->core.flag & BAM_FPAIRED;
1252
0
                    return 0;
1253
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1254
0
                    *end = str+11;
1255
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1256
0
                    return 0;
1257
0
                } else if (!memcmp(str, "unmap", 5)) {
1258
0
                    *end = str+5;
1259
0
                    res->d = b->core.flag & BAM_FUNMAP;
1260
0
                    return 0;
1261
0
                } else if (!memcmp(str, "munmap", 6)) {
1262
0
                    *end = str+6;
1263
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1264
0
                    return 0;
1265
0
                } else if (!memcmp(str, "reverse", 7)) {
1266
0
                    *end = str+7;
1267
0
                    res->d = b->core.flag & BAM_FREVERSE;
1268
0
                    return 0;
1269
0
                } else if (!memcmp(str, "mreverse", 8)) {
1270
0
                    *end = str+8;
1271
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1272
0
                    return 0;
1273
0
                } else if (!memcmp(str, "read1", 5)) {
1274
0
                    *end = str+5;
1275
0
                    res->d = b->core.flag & BAM_FREAD1;
1276
0
                    return 0;
1277
0
                } else if (!memcmp(str, "read2", 5)) {
1278
0
                    *end = str+5;
1279
0
                    res->d = b->core.flag & BAM_FREAD2;
1280
0
                    return 0;
1281
0
                } else if (!memcmp(str, "secondary", 9)) {
1282
0
                    *end = str+9;
1283
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1284
0
                    return 0;
1285
0
                } else if (!memcmp(str, "qcfail", 6)) {
1286
0
                    *end = str+6;
1287
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1288
0
                    return 0;
1289
0
                } else if (!memcmp(str, "dup", 3)) {
1290
0
                    *end = str+3;
1291
0
                    res->d = b->core.flag & BAM_FDUP;
1292
0
                    return 0;
1293
0
                } else if (!memcmp(str, "supplementary", 13)) {
1294
0
                    *end = str+13;
1295
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1296
0
                    return 0;
1297
0
                } else {
1298
0
                    hts_log_error("Unrecognised flag string");
1299
0
                    return -1;
1300
0
                }
1301
0
            }
1302
0
        }
1303
0
        break;
1304
1305
0
    case 'h':
1306
0
        if (memcmp(str, "hclen", 5) == 0) {
1307
0
            int hclen = 0;
1308
0
            uint32_t *cigar = bam_get_cigar(b);
1309
0
            uint32_t ncigar = b->core.n_cigar;
1310
1311
            // left
1312
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1313
0
                hclen = bam_cigar_oplen(cigar[0]);
1314
1315
            // right
1316
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1317
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1318
1319
0
            *end = str+5;
1320
0
            res->d = hclen;
1321
0
            return 0;
1322
0
        }
1323
0
        break;
1324
1325
0
    case 'l':
1326
0
        if (memcmp(str, "library", 7) == 0) {
1327
0
            *end = str+7;
1328
0
            res->is_str = 1;
1329
0
            const char *lib = bam_get_library(hb->h, b);
1330
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1331
0
            return 0;
1332
0
        }
1333
0
        break;
1334
1335
0
    case 'm':
1336
0
        if (memcmp(str, "mapq", 4) == 0) {
1337
0
            *end = str+4;
1338
0
            res->d = b->core.qual;
1339
0
            return 0;
1340
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1341
0
            *end = str+4;
1342
0
            res->d = b->core.mpos+1;
1343
0
            return 0;
1344
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1345
0
            *end = str+6;
1346
0
            res->is_str = 1;
1347
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1348
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1349
0
            return 0;
1350
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1351
0
            *end = str+6;
1352
0
            res->d = b->core.mtid;
1353
0
            return 0;
1354
0
        }
1355
0
        break;
1356
1357
0
    case 'n':
1358
0
        if (memcmp(str, "ncigar", 6) == 0) {
1359
0
            *end = str+6;
1360
0
            res->d = b->core.n_cigar;
1361
0
            return 0;
1362
0
        }
1363
0
        break;
1364
1365
0
    case 'p':
1366
0
        if (memcmp(str, "pos", 3) == 0) {
1367
0
            *end = str+3;
1368
0
            res->d = b->core.pos+1;
1369
0
            return 0;
1370
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1371
0
            *end = str+5;
1372
0
            res->d = b->core.mpos+1;
1373
0
            return 0;
1374
0
        }
1375
0
        break;
1376
1377
0
    case 'q':
1378
0
        if (memcmp(str, "qlen", 4) == 0) {
1379
0
            *end = str+4;
1380
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1381
0
            return 0;
1382
0
        } else if (memcmp(str, "qname", 5) == 0) {
1383
0
            *end = str+5;
1384
0
            res->is_str = 1;
1385
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1386
0
            return 0;
1387
0
        } else if (memcmp(str, "qual", 4) == 0) {
1388
0
            *end = str+4;
1389
0
            ks_clear(&res->s);
1390
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1391
0
                return -1;
1392
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1393
0
            res->s.l = b->core.l_qseq;
1394
0
            res->is_str = 1;
1395
0
            return 0;
1396
0
        }
1397
0
        break;
1398
1399
0
    case 'r':
1400
0
        if (memcmp(str, "rlen", 4) == 0) {
1401
0
            *end = str+4;
1402
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1403
0
            return 0;
1404
0
        } else if (memcmp(str, "rname", 5) == 0) {
1405
0
            *end = str+5;
1406
0
            res->is_str = 1;
1407
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1408
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1409
0
            return 0;
1410
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1411
0
            *end = str+5;
1412
0
            res->is_str = 1;
1413
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1414
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1415
0
            return 0;
1416
0
        } else if (memcmp(str, "refid", 5) == 0) {
1417
0
            *end = str+5;
1418
0
            res->d = b->core.tid;
1419
0
            return 0;
1420
0
        }
1421
0
        break;
1422
1423
0
    case 's':
1424
0
        if (memcmp(str, "seq", 3) == 0) {
1425
0
            *end = str+3;
1426
0
            ks_clear(&res->s);
1427
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1428
0
                return -1;
1429
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1430
0
            res->s.s[b->core.l_qseq] = 0;
1431
0
            res->s.l = b->core.l_qseq;
1432
0
            res->is_str = 1;
1433
0
            return 0;
1434
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1435
0
            int sclen = 0;
1436
0
            uint32_t *cigar = bam_get_cigar(b);
1437
0
            int ncigar = b->core.n_cigar;
1438
0
            int left = 0;
1439
1440
            // left
1441
0
            if (ncigar > 0
1442
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1443
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1444
0
            else if (ncigar > 1
1445
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1446
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1447
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1448
1449
            // right
1450
0
            if (ncigar-1 > left
1451
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1452
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1453
0
            else if (ncigar-2 > left
1454
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1455
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1456
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1457
1458
0
            *end = str+5;
1459
0
            res->d = sclen;
1460
0
            return 0;
1461
0
        }
1462
0
        break;
1463
1464
0
    case 't':
1465
0
        if (memcmp(str, "tlen", 4) == 0) {
1466
0
            *end = str+4;
1467
0
            res->d = b->core.isize;
1468
0
            return 0;
1469
0
        }
1470
0
        break;
1471
1472
0
    case '[':
1473
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1474
            /* aux tags */
1475
0
            *end = str+4;
1476
1477
0
            uint8_t *aux = bam_aux_get(b, str+1);
1478
0
            if (aux) {
1479
                // we define the truth of a tag to be its presence, even if 0.
1480
0
                res->is_true = 1;
1481
0
                switch (*aux) {
1482
0
                case 'Z':
1483
0
                case 'H':
1484
0
                    res->is_str = 1;
1485
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1486
0
                    break;
1487
1488
0
                case 'A':
1489
0
                    res->is_str = 1;
1490
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1491
0
                    break;
1492
1493
0
                case 'i': case 'I':
1494
0
                case 's': case 'S':
1495
0
                case 'c': case 'C':
1496
0
                    res->is_str = 0;
1497
0
                    res->d = bam_aux2i(aux);
1498
0
                    break;
1499
1500
0
                case 'f':
1501
0
                case 'd':
1502
0
                    res->is_str = 0;
1503
0
                    res->d = bam_aux2f(aux);
1504
0
                    break;
1505
1506
0
                default:
1507
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1508
0
                                  *aux);
1509
0
                    return -1;
1510
0
                }
1511
0
                return 0;
1512
1513
0
            } else {
1514
                // hence absent tags are always false (and strings)
1515
0
                res->is_str = 1;
1516
0
                res->s.l = 0;
1517
0
                res->d = 0;
1518
0
                res->is_true = 0;
1519
0
                return 0;
1520
0
            }
1521
0
        }
1522
0
        break;
1523
0
    }
1524
1525
    // All successful matches in switch should return 0.
1526
    // So if we didn't match, it's a parse error.
1527
0
    return -1;
1528
0
}
1529
1530
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1531
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1532
0
{
1533
0
    hb_pair hb = {h, b};
1534
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1535
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1536
0
        hts_log_error("Couldn't process filter expression");
1537
0
        hts_expr_val_free(&res);
1538
0
        return -1;
1539
0
    }
1540
1541
0
    int t = res.is_true;
1542
0
    hts_expr_val_free(&res);
1543
1544
0
    return t;
1545
0
}
1546
1547
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1548
0
{
1549
0
    htsFile *fp = fpv;
1550
0
    bam1_t *b = bv;
1551
0
    int pass_filter, ret;
1552
1553
0
    do {
1554
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1555
0
        if (ret < 0)
1556
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1557
1558
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1559
0
            return -2;
1560
1561
0
        *tid = b->core.tid;
1562
0
        *beg = b->core.pos;
1563
0
        *end = bam_endpos(b);
1564
1565
0
        if (fp->filter) {
1566
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1567
0
            if (pass_filter < 0)
1568
0
                return -2;
1569
0
        } else {
1570
0
            pass_filter = 1;
1571
0
        }
1572
0
    } while (pass_filter == 0);
1573
1574
0
    return ret;
1575
0
}
1576
1577
static int cram_pseek(void *fp, int64_t offset, int whence)
1578
0
{
1579
0
    cram_fd *fd =  (cram_fd *)fp;
1580
1581
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1582
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1583
0
        return -1;
1584
1585
0
    fd->curr_position = offset;
1586
1587
0
    if (fd->ctr) {
1588
0
        cram_free_container(fd->ctr);
1589
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1590
0
            cram_free_container(fd->ctr_mt);
1591
1592
0
        fd->ctr = NULL;
1593
0
        fd->ctr_mt = NULL;
1594
0
        fd->ooc = 0;
1595
0
    }
1596
1597
0
    return 0;
1598
0
}
1599
1600
/*
1601
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1602
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1603
 *   container previously fetched. It was designed like this to integrate with the functionality
1604
 *   of the iterator stepping logic.
1605
 */
1606
1607
static int64_t cram_ptell(void *fp)
1608
0
{
1609
0
    cram_fd *fd = (cram_fd *)fp;
1610
0
    cram_container *c;
1611
0
    cram_slice *s;
1612
0
    int64_t ret = -1L;
1613
1614
0
    if (fd) {
1615
0
        if ((c = fd->ctr) != NULL) {
1616
0
            if ((s = c->slice) != NULL && s->max_rec) {
1617
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1618
0
                    fd->curr_position += c->offset + c->length;
1619
0
            }
1620
0
        }
1621
0
        ret = fd->curr_position;
1622
0
    }
1623
1624
0
    return ret;
1625
0
}
1626
1627
static int bam_pseek(void *fp, int64_t offset, int whence)
1628
0
{
1629
0
    BGZF *fd = (BGZF *)fp;
1630
1631
0
    return bgzf_seek(fd, offset, whence);
1632
0
}
1633
1634
static int64_t bam_ptell(void *fp)
1635
0
{
1636
0
    BGZF *fd = (BGZF *)fp;
1637
0
    if (!fd)
1638
0
        return -1L;
1639
1640
0
    return bgzf_tell(fd);
1641
0
}
1642
1643
1644
1645
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1646
0
{
1647
0
    switch (fp->format.format) {
1648
0
    case bam:
1649
0
    case sam:
1650
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1651
1652
0
    case cram: {
1653
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1654
1655
        // Cons up a fake "index" just pointing at the associated cram_fd:
1656
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1657
0
        if (idx == NULL) return NULL;
1658
0
        idx->fmt = HTS_FMT_CRAI;
1659
0
        idx->cram = fp->fp.cram;
1660
0
        return (hts_idx_t *) idx;
1661
0
        }
1662
1663
0
    default:
1664
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1665
0
    }
1666
0
}
1667
1668
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1669
0
{
1670
0
    return index_load(fp, fn, fnidx, flags);
1671
0
}
1672
1673
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1674
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1675
0
}
1676
1677
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1678
0
{
1679
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1680
0
}
1681
1682
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1683
0
{
1684
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1685
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1686
0
    if (iter == NULL) return NULL;
1687
1688
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1689
    // the readrec function:
1690
0
    iter->is_cram = 1;
1691
0
    iter->read_rest = 1;
1692
0
    iter->off = NULL;
1693
0
    iter->bins.a = NULL;
1694
0
    iter->readrec = readrec;
1695
1696
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1697
0
        cram_range r = { tid, beg+1, end };
1698
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1699
1700
0
        iter->curr_off = 0;
1701
        // The following fields are not required by hts_itr_next(), but are
1702
        // filled in in case user code wants to look at them.
1703
0
        iter->tid = tid;
1704
0
        iter->beg = beg;
1705
0
        iter->end = end;
1706
1707
0
        switch (ret) {
1708
0
        case 0:
1709
0
            break;
1710
1711
0
        case -2:
1712
            // No data vs this ref, so mark iterator as completed.
1713
            // Same as HTS_IDX_NONE.
1714
0
            iter->finished = 1;
1715
0
            break;
1716
1717
0
        default:
1718
0
            free(iter);
1719
0
            return NULL;
1720
0
        }
1721
0
    }
1722
0
    else switch (tid) {
1723
0
    case HTS_IDX_REST:
1724
0
        iter->curr_off = 0;
1725
0
        break;
1726
0
    case HTS_IDX_NONE:
1727
0
        iter->curr_off = 0;
1728
0
        iter->finished = 1;
1729
0
        break;
1730
0
    default:
1731
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1732
0
        abort();
1733
0
        break;
1734
0
    }
1735
1736
0
    return iter;
1737
0
}
1738
1739
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1740
0
{
1741
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1742
0
    if (idx == NULL)
1743
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1744
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1745
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1746
0
    else
1747
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1748
0
}
1749
1750
static int cram_name2id(void *fdv, const char *ref)
1751
0
{
1752
0
    cram_fd *fd = (cram_fd *) fdv;
1753
0
    return sam_hdr_name2tid(fd->header, ref);
1754
0
}
1755
1756
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1757
0
{
1758
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1759
0
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1760
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1761
0
                          sam_readrec);
1762
0
}
1763
1764
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1765
0
{
1766
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1767
0
    hts_reglist_t *r_list = NULL;
1768
0
    int r_count = 0;
1769
1770
0
    if (!cidx || !hdr)
1771
0
        return NULL;
1772
1773
0
    hts_itr_t *itr = NULL;
1774
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1775
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1776
0
        if (!r_list)
1777
0
            return NULL;
1778
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1779
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1780
0
    } else {
1781
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1782
0
        if (!r_list)
1783
0
            return NULL;
1784
0
        itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1785
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1786
0
    }
1787
1788
0
    if (!itr)
1789
0
        hts_reglist_free(r_list, r_count);
1790
1791
0
    return itr;
1792
0
}
1793
1794
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1795
0
{
1796
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1797
1798
0
    if(!cidx || !hdr || !reglist)
1799
0
        return NULL;
1800
1801
0
    if (cidx->fmt == HTS_FMT_CRAI)
1802
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1803
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1804
0
    else
1805
0
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1806
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1807
0
}
1808
1809
/**********************
1810
 *** SAM header I/O ***
1811
 **********************/
1812
1813
#include "htslib/kseq.h"
1814
#include "htslib/kstring.h"
1815
1816
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1817
0
{
1818
0
    sam_hdr_t *bh = sam_hdr_init();
1819
0
    if (!bh) return NULL;
1820
1821
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1822
0
        sam_hdr_destroy(bh);
1823
0
        return NULL;
1824
0
    }
1825
1826
0
    return bh;
1827
0
}
1828
1829
// Minimal sanitisation of a header to ensure.
1830
// - null terminated string.
1831
// - all lines start with @ (also implies no blank lines).
1832
//
1833
// Much more could be done, but currently is not, including:
1834
// - checking header types are known (HD, SQ, etc).
1835
// - syntax (eg checking tab separated fields).
1836
// - validating n_targets matches @SQ records.
1837
// - validating target lengths against @SQ records.
1838
1.87k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1839
1.87k
    if (!h)
1840
9
        return NULL;
1841
1842
    // Special case for empty headers.
1843
1.86k
    if (h->l_text == 0)
1844
405
        return h;
1845
1846
1.45k
    size_t i;
1847
1.45k
    unsigned int lnum = 0;
1848
1.45k
    char *cp = h->text, last = '\n';
1849
18.0M
    for (i = 0; i < h->l_text; i++) {
1850
        // NB: l_text excludes terminating nul.  This finds early ones.
1851
18.0M
        if (cp[i] == 0)
1852
564
            break;
1853
1854
        // Error on \n[^@], including duplicate newlines
1855
18.0M
        if (last == '\n') {
1856
106k
            lnum++;
1857
106k
            if (cp[i] != '@') {
1858
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1859
0
                sam_hdr_destroy(h);
1860
0
                return NULL;
1861
0
            }
1862
106k
        }
1863
1864
18.0M
        last = cp[i];
1865
18.0M
    }
1866
1867
1.45k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1868
564
        size_t j = i;
1869
3.65k
        while (j < h->l_text && cp[j] == '\0') j++;
1870
564
        if (j < h->l_text)
1871
561
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1872
564
    }
1873
1874
    // Add trailing newline and/or trailing nul if required.
1875
1.45k
    if (last != '\n') {
1876
561
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1877
1878
561
        if (h->l_text < 2 || i >= h->l_text - 2) {
1879
81
            if (h->l_text >= SIZE_MAX - 2) {
1880
0
                hts_log_error("No room for extra newline");
1881
0
                sam_hdr_destroy(h);
1882
0
                return NULL;
1883
0
            }
1884
1885
81
            cp = realloc(h->text, (size_t) h->l_text+2);
1886
81
            if (!cp) {
1887
0
                sam_hdr_destroy(h);
1888
0
                return NULL;
1889
0
            }
1890
81
            h->text = cp;
1891
81
        }
1892
561
        cp[i++] = '\n';
1893
1894
        // l_text may be larger already due to multiple nul padding
1895
561
        if (h->l_text < i)
1896
0
            h->l_text = i;
1897
561
        cp[h->l_text] = '\0';
1898
561
    }
1899
1900
1.45k
    return h;
1901
1.45k
}
1902
1903
1.32k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1904
1.32k
    sam_hdr_t* h = sam_hdr_init();
1905
1.32k
    if (!h)
1906
0
        return NULL;
1907
1908
1.32k
    if (sam_hdr_build_from_sam_file(h, fp) != 0) {
1909
78
        sam_hdr_destroy(h);
1910
78
        return NULL;
1911
78
    }
1912
1913
1.24k
    if (fp->bam_header)
1914
0
        sam_hdr_destroy(fp->bam_header);
1915
1.24k
    fp->bam_header = sam_hdr_sanitise(h);
1916
1.24k
    fp->bam_header->ref_count = 1;
1917
1918
1.24k
    return fp->bam_header;
1919
1.32k
}
1920
1921
sam_hdr_t *sam_hdr_read(htsFile *fp)
1922
2.59k
{
1923
2.59k
    sam_hdr_t *h = NULL;
1924
2.59k
    if (!fp) {
1925
0
        errno = EINVAL;
1926
0
        return NULL;
1927
0
    }
1928
1929
2.59k
    switch (fp->format.format) {
1930
42
    case bam:
1931
42
        h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1932
42
        break;
1933
1934
582
    case cram:
1935
582
        h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1936
582
        break;
1937
1938
1.32k
    case sam:
1939
1.32k
        h = sam_hdr_create(fp);
1940
1.32k
        break;
1941
1942
24
    case fastq_format:
1943
648
    case fasta_format:
1944
648
        return sam_hdr_init();
1945
1946
0
    case empty_format:
1947
0
        errno = EPIPE;
1948
0
        return NULL;
1949
1950
0
    default:
1951
0
        errno = EFTYPE;
1952
0
        return NULL;
1953
2.59k
    }
1954
    //only sam,bam and cram reaches here
1955
1.95k
    if (h && !fp->bam_header) { //set except for sam which already has it
1956
        //for cram, it is the o/p header as for rest and not the internal header
1957
615
        fp->bam_header = h;
1958
615
        sam_hdr_incr_ref(fp->bam_header);
1959
615
    }
1960
1.95k
    return h;
1961
2.59k
}
1962
1963
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1964
2.51k
{
1965
2.51k
    if (!fp || !h) {
1966
0
        errno = EINVAL;
1967
0
        return -1;
1968
0
    }
1969
1970
2.51k
    switch (fp->format.format) {
1971
837
    case binary_format:
1972
837
        fp->format.category = sequence_data;
1973
837
        fp->format.format = bam;
1974
        /* fall-through */
1975
837
    case bam:
1976
837
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1977
837
        break;
1978
1979
837
    case cram: {
1980
837
        cram_fd *fd = fp->fp.cram;
1981
837
        if (cram_set_header2(fd, h) < 0) return -1;
1982
784
        if (fp->fn_aux)
1983
0
            cram_load_reference(fd, fp->fn_aux);
1984
784
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1985
784
        }
1986
784
        break;
1987
1988
837
    case text_format:
1989
837
        fp->format.category = sequence_data;
1990
837
        fp->format.format = sam;
1991
        /* fall-through */
1992
837
    case sam: {
1993
837
        if (!h->hrecs && !h->text)
1994
0
            return 0;
1995
837
        char *text;
1996
837
        kstring_t hdr_ks = { 0, 0, NULL };
1997
837
        size_t l_text;
1998
837
        ssize_t bytes;
1999
837
        int r = 0, no_sq = 0;
2000
2001
837
        if (h->hrecs) {
2002
836
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2003
0
                return -1;
2004
836
            text = hdr_ks.s;
2005
836
            l_text = hdr_ks.l;
2006
836
        } else {
2007
1
            const char *p = NULL;
2008
1
            do {
2009
1
                const char *q = p == NULL ? h->text : p + 4;
2010
1
                p = strstr(q, "@SQ\t");
2011
1
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2012
1
            no_sq = p == NULL;
2013
1
            text = h->text;
2014
1
            l_text = h->l_text;
2015
1
        }
2016
2017
837
        if (fp->is_bgzf) {
2018
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2019
837
        } else {
2020
837
            bytes = hwrite(fp->fp.hfile, text, l_text);
2021
837
        }
2022
837
        free(hdr_ks.s);
2023
837
        if (bytes != l_text)
2024
0
            return -1;
2025
2026
837
        if (no_sq) {
2027
1
            int i;
2028
1
            for (i = 0; i < h->n_targets; ++i) {
2029
0
                fp->line.l = 0;
2030
0
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2031
0
                r |= kputs(h->target_name[i], &fp->line) < 0;
2032
0
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2033
0
                r |= kputw(h->target_len[i], &fp->line) < 0;
2034
0
                r |= kputc('\n', &fp->line) < 0;
2035
0
                if (r != 0)
2036
0
                    return -1;
2037
2038
0
                if (fp->is_bgzf) {
2039
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2040
0
                } else {
2041
0
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2042
0
                }
2043
0
                if (bytes != fp->line.l)
2044
0
                    return -1;
2045
0
            }
2046
1
        }
2047
837
        if (fp->is_bgzf) {
2048
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2049
837
        } else {
2050
837
            if (hflush(fp->fp.hfile) != 0) return -1;
2051
837
        }
2052
837
        }
2053
837
        break;
2054
2055
837
    case fastq_format:
2056
0
    case fasta_format:
2057
        // Nothing to output; FASTQ has no file headers.
2058
0
        return 0;
2059
0
        break;
2060
2061
0
    default:
2062
0
        errno = EBADF;
2063
0
        return -1;
2064
2.51k
    }
2065
    //only sam,bam and cram reaches here
2066
2.45k
    if (h) {    //the new header
2067
2.45k
        sam_hdr_t *tmp = fp->bam_header;
2068
2.45k
        fp->bam_header = sam_hdr_dup(h);
2069
2.45k
        sam_hdr_destroy(tmp);
2070
2.45k
        if (!fp->bam_header && h)
2071
0
            return -1;  //failed to duplicate
2072
2.45k
    }
2073
2.45k
    return 0;
2074
2.45k
}
2075
2076
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2077
0
{
2078
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2079
0
    size_t new_l_text;
2080
0
    if (!h || !key)
2081
0
        return -1;
2082
2083
0
    if (h->l_text > 3) {
2084
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2085
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2086
0
            *p = '\0'; // for strstr call
2087
2088
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2089
2090
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2091
0
                *p = '\n'; // change back
2092
2093
                // mark the key:val
2094
0
                beg = q;
2095
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2096
0
                end = q;
2097
2098
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2099
0
                    && strlen(val) == end - beg - 4)
2100
0
                     return 0; // val is the same, no need to change
2101
2102
0
            } else {
2103
0
                beg = end = p;
2104
0
                *p = '\n';
2105
0
            }
2106
0
        }
2107
0
    }
2108
0
    if (beg == NULL) { // no @HD
2109
0
        new_l_text = h->l_text;
2110
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2111
0
            return -1;
2112
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2113
0
        if (val) {
2114
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2115
0
                return -1;
2116
0
            new_l_text += strlen(val) + 4;
2117
0
        }
2118
0
        newtext = (char*)malloc(new_l_text + 1);
2119
0
        if (!newtext) return -1;
2120
2121
0
        if (val)
2122
0
            snprintf(newtext, new_l_text + 1,
2123
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2124
0
        else
2125
0
            snprintf(newtext, new_l_text + 1,
2126
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2127
0
    } else { // has @HD but different or no key
2128
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2129
0
        if (val) {
2130
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2131
0
                return -1;
2132
0
            new_l_text += strlen(val) + 4;
2133
0
        }
2134
0
        newtext = (char*)malloc(new_l_text + 1);
2135
0
        if (!newtext) return -1;
2136
2137
0
        if (val) {
2138
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2139
0
                    (int) (beg - h->text), h->text, key, val, end);
2140
0
        } else { //delete key
2141
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2142
0
                    (int) (beg - h->text), h->text, end);
2143
0
        }
2144
0
    }
2145
0
    free(h->text);
2146
0
    h->text = newtext;
2147
0
    h->l_text = new_l_text;
2148
0
    return 0;
2149
0
}
2150
2151
2152
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2153
0
{
2154
0
    if (!h || !key)
2155
0
        return -1;
2156
2157
0
    if (!h->hrecs)
2158
0
        return old_sam_hdr_change_HD(h, key, val);
2159
2160
0
    if (val) {
2161
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2162
0
            return -1;
2163
0
    } else {
2164
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2165
0
            return -1;
2166
0
    }
2167
0
    return sam_hdr_rebuild(h);
2168
0
}
2169
2170
/* releases existing header and sets new one; increments ref count if not
2171
duplicating */
2172
int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate)
2173
0
{
2174
0
    if (!fp)
2175
0
        return -1;
2176
2177
0
    if (duplicate) {
2178
0
        sam_hdr_t *tmp = fp->bam_header;
2179
0
        fp->bam_header = sam_hdr_dup(h);
2180
0
        sam_hdr_destroy(tmp);
2181
0
        if (!fp->bam_header && h)
2182
0
            return -1;  //duplicate failed
2183
0
    } else {
2184
0
        if (fp->bam_header != h) {  //if not the same
2185
0
            sam_hdr_destroy(fp->bam_header);
2186
0
            fp->bam_header = h;
2187
0
            sam_hdr_incr_ref(fp->bam_header);
2188
0
        }
2189
0
    }
2190
2191
0
    return 0;
2192
0
}
2193
2194
//return the bam_header, user has to use sam_hdr_incr_ref where ever required
2195
sam_hdr_t* sam_hdr_get(samFile* fp)
2196
0
{
2197
0
    if (!fp)
2198
0
        return NULL;
2199
0
    return fp->bam_header;
2200
0
}
2201
2202
/**********************
2203
 *** SAM record I/O ***
2204
 **********************/
2205
2206
// The speed of this code can vary considerably depending on minor code
2207
// changes elsewhere as some of the tight loops are particularly prone to
2208
// speed changes when the instruction blocks are split over a 32-byte
2209
// boundary.  To protect against this, we explicitly specify an alignment
2210
// for this function.  If this is insufficient, we may also wish to
2211
// consider alignment of blocks within this function via
2212
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2213
// However it's not very portable.
2214
// Instead we break into separate functions so we can explicitly specify
2215
// use __attribute__((aligned(32))) instead and force consistent loop
2216
// alignment.
2217
19.7k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2218
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2219
19.7k
    if (*n > INT32_MAX*0.666) {
2220
0
        errno = ENOMEM;
2221
0
        return -1;
2222
0
    }
2223
2224
19.7k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2225
19.7k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2226
0
        hts_log_error("Out of memory");
2227
0
        return -1;
2228
0
    }
2229
2230
19.7k
    (*n)+=*n>>1;
2231
19.7k
    return 0;
2232
19.7k
}
2233
2234
2235
// This ensures that q always ends up at the next comma after
2236
// reading a number even if it's followed by junk.  It
2237
// prevents the possibility of trying to read more than n items.
2238
2.33M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2239
2240
HTS_ALIGN32
2241
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2242
1.65k
                               uint32_t *nalloc, int *overflow) {
2243
51.0k
    while (*q == ',') {
2244
49.4k
        if ((*nused)++ >= (*nalloc)) {
2245
54
            if (grow_B_array(b, nalloc, 1) < 0)
2246
0
                return NULL;
2247
54
        }
2248
49.4k
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2249
49.4k
        b->l_data++;
2250
49.4k
    }
2251
1.65k
    return q;
2252
1.65k
}
2253
2254
HTS_ALIGN32
2255
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2256
19.0k
                               uint32_t *nalloc, int *overflow) {
2257
137k
    while (*q == ',') {
2258
118k
        if ((*nused)++ >= (*nalloc)) {
2259
1.97k
            if (grow_B_array(b, nalloc, 1) < 0)
2260
0
                return NULL;
2261
1.97k
        }
2262
118k
        if (q[1] != '-') {
2263
109k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2264
109k
            b->l_data++;
2265
109k
        } else {
2266
8.62k
            *overflow = 1;
2267
8.62k
            q++;
2268
8.62k
            skip_to_comma_(q);
2269
8.62k
        }
2270
118k
    }
2271
19.0k
    return q;
2272
19.0k
}
2273
2274
HTS_ALIGN32
2275
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2276
4.44k
                               uint32_t *nalloc, int *overflow) {
2277
46.1k
    while (*q == ',') {
2278
41.7k
        if ((*nused)++ >= (*nalloc)) {
2279
1.49k
            if (grow_B_array(b, nalloc, 2) < 0)
2280
0
                return NULL;
2281
1.49k
        }
2282
41.7k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2283
41.7k
                  b->data + b->l_data);
2284
41.7k
        b->l_data += 2;
2285
41.7k
    }
2286
4.44k
    return q;
2287
4.44k
}
2288
2289
HTS_ALIGN32
2290
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2291
1.98k
                               uint32_t *nalloc, int *overflow) {
2292
1.82M
    while (*q == ',') {
2293
1.82M
        if ((*nused)++ >= (*nalloc)) {
2294
2.24k
            if (grow_B_array(b, nalloc, 2) < 0)
2295
0
                return NULL;
2296
2.24k
        }
2297
1.82M
        if (q[1] != '-') {
2298
1.79M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2299
1.79M
                      b->data + b->l_data);
2300
1.79M
            b->l_data += 2;
2301
1.79M
        } else {
2302
27.3k
            *overflow = 1;
2303
27.3k
            q++;
2304
27.3k
            skip_to_comma_(q);
2305
27.3k
        }
2306
1.82M
    }
2307
1.98k
    return q;
2308
1.98k
}
2309
2310
HTS_ALIGN32
2311
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2312
2.07k
                               uint32_t *nalloc, int *overflow) {
2313
2.07M
    while (*q == ',') {
2314
2.07M
        if ((*nused)++ >= (*nalloc)) {
2315
123
            if (grow_B_array(b, nalloc, 4) < 0)
2316
0
                return NULL;
2317
123
        }
2318
2.07M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2319
2.07M
                  b->data + b->l_data);
2320
2.07M
        b->l_data += 4;
2321
2.07M
    }
2322
2.07k
    return q;
2323
2.07k
}
2324
2325
HTS_ALIGN32
2326
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2327
6.84k
                               uint32_t *nalloc, int *overflow) {
2328
139k
    while (*q == ',') {
2329
132k
        if ((*nused)++ >= (*nalloc)) {
2330
11.6k
            if (grow_B_array(b, nalloc, 4) < 0)
2331
0
                return NULL;
2332
11.6k
        }
2333
132k
        if (q[1] != '-') {
2334
126k
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2335
126k
                      b->data + b->l_data);
2336
126k
            b->l_data += 4;
2337
126k
        } else {
2338
6.14k
            *overflow = 1;
2339
6.14k
            q++;
2340
6.14k
            skip_to_comma_(q);
2341
6.14k
        }
2342
132k
    }
2343
6.84k
    return q;
2344
6.84k
}
2345
2346
HTS_ALIGN32
2347
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2348
2.20k
                               uint32_t *nalloc, int *overflow) {
2349
20.7k
    while (*q == ',') {
2350
18.5k
        if ((*nused)++ >= (*nalloc)) {
2351
2.16k
            if (grow_B_array(b, nalloc, 4) < 0)
2352
0
                return NULL;
2353
2.16k
        }
2354
18.5k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2355
18.5k
        b->l_data += 4;
2356
18.5k
    }
2357
2.20k
    return q;
2358
2.20k
}
2359
2360
HTS_ALIGN32
2361
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2362
                              char **end, bam1_t *b,
2363
38.4k
                              int *ctr) {
2364
    // Protect against infinite recursion when dealing with invalid input.
2365
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2366
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2367
    //
2368
    // Loop detection is the safest solution incase there are other
2369
    // strange corner cases with malformed inputs.
2370
38.4k
    if (++(*ctr) > 2) {
2371
0
        hts_log_error("Malformed data in B:%c array", type);
2372
0
        return -1;
2373
0
    }
2374
2375
38.4k
    int orig_l = b->l_data;
2376
38.4k
    char *q = in;
2377
38.4k
    int32_t size;
2378
38.4k
    size_t bytes;
2379
38.4k
    int overflow = 0;
2380
2381
38.4k
    size = aux_type2size(type);
2382
38.4k
    if (size <= 0 || size > 4) {
2383
2
        hts_log_error("Unrecognized type B:%c", type);
2384
2
        return -1;
2385
2
    }
2386
2387
    // Ensure space for type + values.
2388
    // The first pass through here we don't know the number of entries and
2389
    // nalloc == 0.  We start with a small working set and then parse the
2390
    // data, growing as needed.
2391
    //
2392
    // If we have a second pass through we do know the number of entries
2393
    // and nalloc is already known.  We have no need to expand the bam data.
2394
38.4k
    if (!nalloc)
2395
33.8k
         nalloc=7;
2396
2397
    // Ensure allocated memory is big enough (for current nalloc estimate)
2398
38.4k
    bytes = (size_t) nalloc * (size_t) size;
2399
38.4k
    if (bytes / size != nalloc
2400
38.4k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2401
0
        hts_log_error("Out of memory");
2402
0
        return -1;
2403
0
    }
2404
2405
38.4k
    uint32_t nused = 0;
2406
2407
38.4k
    b->data[b->l_data++] = 'B';
2408
38.4k
    b->data[b->l_data++] = type;
2409
    // 32-bit B-array length is inserted later once we know it.
2410
38.4k
    int b_len_idx = b->l_data;
2411
38.4k
    b->l_data += sizeof(uint32_t);
2412
2413
38.4k
    if (type == 'c') {
2414
1.65k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2415
0
            return -1;
2416
36.8k
    } else if (type == 'C') {
2417
19.0k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2418
0
            return -1;
2419
19.0k
    } else if (type == 's') {
2420
4.44k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2421
0
            return -1;
2422
13.3k
    } else if (type == 'S') {
2423
1.98k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2424
0
            return -1;
2425
11.3k
    } else if (type == 'i') {
2426
2.07k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2427
0
            return -1;
2428
9.29k
    } else if (type == 'I') {
2429
6.84k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2430
0
            return -1;
2431
6.84k
    } else if (type == 'f') {
2432
2.20k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2433
0
            return -1;
2434
2.20k
    }
2435
38.4k
    if (*q != '\t' && *q != '\0') {
2436
        // Unknown B array type or junk in the numbers
2437
21
        hts_log_error("Malformed B:%c", type);
2438
21
        return -1;
2439
21
    }
2440
38.4k
    i32_to_le(nused, b->data + b_len_idx);
2441
2442
38.4k
    if (!overflow) {
2443
33.8k
        *end = q;
2444
33.8k
        return 0;
2445
33.8k
    } else {
2446
4.57k
        int64_t max = 0, min = 0, val;
2447
        // Given type was incorrect.  Try to rescue the situation.
2448
4.57k
        char *r = q;
2449
4.57k
        q = in;
2450
4.57k
        overflow = 0;
2451
4.57k
        b->l_data = orig_l;
2452
        // Find out what range of values is present
2453
2.12M
        while (q < r) {
2454
2.12M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2455
2.12M
            if (max < val) max = val;
2456
2.12M
            if (min > val) min = val;
2457
2.12M
            skip_to_comma_(q);
2458
2.12M
        }
2459
        // Retry with appropriate type
2460
4.57k
        if (!overflow) {
2461
4.56k
            if (min < 0) {
2462
4.26k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2463
1.31k
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2464
2.95k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2465
892
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2466
2.06k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2467
2.05k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2468
2.05k
                }
2469
4.26k
            } else {
2470
304
                if (max < UINT8_MAX) {
2471
0
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2472
304
                } else if (max <= UINT16_MAX) {
2473
0
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2474
304
                } else if (max <= UINT32_MAX) {
2475
301
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2476
301
                }
2477
304
            }
2478
4.56k
        }
2479
        // If here then at least one of the values is too big to store
2480
14
        hts_log_error("Numeric value in B array out of allowed range");
2481
14
        return -1;
2482
4.57k
    }
2483
38.4k
#undef skip_to_comma_
2484
38.4k
}
2485
2486
HTS_ALIGN32
2487
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2488
33.8k
{
2489
33.8k
    int ctr = 0;
2490
33.8k
    uint32_t nalloc = 0;
2491
33.8k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2492
33.8k
}
2493
2494
92.2k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2495
92.2k
    if (*v >= '1' && *v <= '9') {
2496
15.9k
        return hts_str2uint(v, rv, 16, overflow);
2497
15.9k
    }
2498
76.3k
    else if (*v == '0') {
2499
        // handle single-digit "0" directly; otherwise it's hex or octal
2500
21.1k
        if (v[1] == '\t') { *rv = v+1; return 0; }
2501
15
        else {
2502
15
            unsigned long val = strtoul(v, rv, 0);
2503
15
            if (val > 65535) { *overflow = 1; return 65535; }
2504
15
            return val;
2505
15
        }
2506
21.1k
    }
2507
55.1k
    else {
2508
        // TODO implement symbolic flag letters
2509
55.1k
        *rv = v;
2510
55.1k
        return 0;
2511
55.1k
    }
2512
92.2k
}
2513
2514
// Parse tag line and append to bam object b.
2515
// Shared by both SAM and FASTQ parsers.
2516
//
2517
// The difference between the two is how lenient we are to recognising
2518
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2519
// non-SAM looking strings.
2520
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2521
91.7k
                            khash_t(tag) *tag_whitelist) {
2522
91.7k
    int overflow = 0;
2523
91.7k
    int checkpoint;
2524
91.7k
    char logbuf[40];
2525
91.7k
    char *q = start, *p = end;
2526
2527
91.7k
#define _parse_err(cond, ...)                   \
2528
3.24M
    do {                                        \
2529
6.99M
        if (cond) {                             \
2530
90
            if (lenient) {                      \
2531
0
                while (q < p && !isspace_c(*q))   \
2532
0
                    q++;                        \
2533
0
                while (q < p && isspace_c(*q))    \
2534
0
                    q++;                        \
2535
0
                b->l_data = checkpoint;         \
2536
0
                goto loop;                      \
2537
90
            } else {                            \
2538
90
                hts_log_error(__VA_ARGS__);     \
2539
90
                goto err_ret;                   \
2540
90
            }                                   \
2541
90
        }                                       \
2542
3.24M
    } while (0)
2543
2544
3.11M
    while (q < p) loop: {
2545
3.11M
        char type;
2546
3.11M
        checkpoint = b->l_data;
2547
3.11M
        if (p - q < 5) {
2548
9
            if (lenient) {
2549
0
                break;
2550
9
            } else {
2551
9
                hts_log_error("Incomplete aux field");
2552
9
                goto err_ret;
2553
9
            }
2554
9
        }
2555
1.55M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2556
2557
1.55M
        if (lenient && (q[2] | q[4]) != ':') {
2558
0
            while (q < p && !isspace_c(*q))
2559
0
                q++;
2560
0
            while (q < p && isspace_c(*q))
2561
0
                q++;
2562
0
            continue;
2563
0
        }
2564
2565
1.55M
        if (tag_whitelist) {
2566
0
            int tt = q[0]*256 + q[1];
2567
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2568
0
                while (q < p && *q != '\t')
2569
0
                    q++;
2570
0
                continue;
2571
0
            }
2572
0
        }
2573
2574
        // Copy over id
2575
1.55M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2576
1.55M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2577
1.55M
        q += 3; type = *q++; ++q; // q points to value
2578
1.55M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2579
1.24M
            _parse_err(*q <= '\t', "incomplete aux field");
2580
2581
        // Ensure enough space for a double + type allocated.
2582
1.55M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2583
2584
1.55M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2585
457k
            b->data[b->l_data++] = 'A';
2586
457k
            b->data[b->l_data++] = *q++;
2587
1.10M
        } else if (type == 'i' || type == 'I') {
2588
716k
            if (*q == '-') {
2589
617k
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2590
617k
                if (x >= INT8_MIN) {
2591
328k
                    b->data[b->l_data++] = 'c';
2592
328k
                    b->data[b->l_data++] = x;
2593
328k
                } else if (x >= INT16_MIN) {
2594
78.8k
                    b->data[b->l_data++] = 's';
2595
78.8k
                    i16_to_le(x, b->data + b->l_data);
2596
78.8k
                    b->l_data += 2;
2597
210k
                } else {
2598
210k
                    b->data[b->l_data++] = 'i';
2599
210k
                    i32_to_le(x, b->data + b->l_data);
2600
210k
                    b->l_data += 4;
2601
210k
                }
2602
617k
            } else {
2603
99.0k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2604
99.0k
                if (x <= UINT8_MAX) {
2605
59.5k
                    b->data[b->l_data++] = 'C';
2606
59.5k
                    b->data[b->l_data++] = x;
2607
59.5k
                } else if (x <= UINT16_MAX) {
2608
28.4k
                    b->data[b->l_data++] = 'S';
2609
28.4k
                    u16_to_le(x, b->data + b->l_data);
2610
28.4k
                    b->l_data += 2;
2611
28.4k
                } else {
2612
11.0k
                    b->data[b->l_data++] = 'I';
2613
11.0k
                    u32_to_le(x, b->data + b->l_data);
2614
11.0k
                    b->l_data += 4;
2615
11.0k
                }
2616
99.0k
            }
2617
716k
        } else if (type == 'f') {
2618
23.6k
            b->data[b->l_data++] = 'f';
2619
23.6k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2620
23.6k
            b->l_data += sizeof(float);
2621
359k
        } else if (type == 'd') {
2622
14.6k
            b->data[b->l_data++] = 'd';
2623
14.6k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2624
14.6k
            b->l_data += sizeof(double);
2625
345k
        } else if (type == 'Z' || type == 'H') {
2626
311k
            char *end = strchr(q, '\t');
2627
311k
            if (!end) end = q + strlen(q);
2628
311k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2629
311k
                       "hex field does not have an even number of digits");
2630
311k
            b->data[b->l_data++] = type;
2631
311k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2632
311k
            memcpy(b->data + b->l_data, q, end - q);
2633
311k
            b->l_data += end - q;
2634
311k
            b->data[b->l_data++] = '\0';
2635
311k
            q = end;
2636
311k
        } else if (type == 'B') {
2637
33.8k
            type = *q++; // q points to the first ',' following the typing byte
2638
33.8k
            _parse_err(*q && *q != ',' && *q != '\t',
2639
33.8k
                       "B aux field type not followed by ','");
2640
2641
33.8k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2642
37
                goto err_ret;
2643
33.8k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2644
2645
8.34M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2646
1.55M
        q++;
2647
1.55M
    }
2648
2649
91.6k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2650
91.6k
#undef _parse_err
2651
2652
91.6k
    return 0;
2653
2654
136
err_ret:
2655
136
    return -2;
2656
91.6k
}
2657
2658
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2659
92.3k
{
2660
383k
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2661
2662
92.3k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2663
2664
// Macro that operates on 64-bits at a time.
2665
92.3k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2666
92.3k
    do {                                                        \
2667
77.2k
        uint64_u *from8 = (uint64_u *)(from);                   \
2668
77.2k
        uint64_u *to8 = (uint64_u *)(to);                       \
2669
77.2k
        uint64_t uflow = 0;                                     \
2670
77.2k
        size_t l8 = (l)>>3, i;                                  \
2671
77.2k
        for (i = 0; i < l8; i++) {                              \
2672
0
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2673
0
            uflow |= to8[i];                                    \
2674
0
        }                                                       \
2675
77.7k
        for (i<<=3; i < (l); ++i) {                             \
2676
480
            to[i] = from[i] - (n);                              \
2677
480
            uflow |= to[i];                                     \
2678
480
        }                                                       \
2679
77.2k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2680
77.2k
    } while (0)
2681
2682
#else
2683
2684
// Basic version which operates a byte at a time
2685
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2686
        uint8_t uflow = 0;                                   \
2687
        for (i = 0; i < (l); ++i) {                          \
2688
            (to)[i] = (from)[i] - (n);                       \
2689
            uflow |= (uint8_t) (to)[i];                      \
2690
        }                                                    \
2691
        failed = (uflow & 0x80) > 0;                         \
2692
    } while (0)
2693
2694
#endif
2695
2696
153k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2697
1.14M
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2698
324k
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2699
2700
92.3k
    uint8_t *t;
2701
2702
92.3k
    char *p = s->s, *q;
2703
92.3k
    int i, overflow = 0;
2704
92.3k
    char logbuf[40];
2705
92.3k
    hts_pos_t cigreflen;
2706
92.3k
    bam1_core_t *c = &b->core;
2707
2708
92.3k
    b->l_data = 0;
2709
92.3k
    memset(c, 0, 32);
2710
2711
    // qname
2712
92.3k
    q = _read_token(p);
2713
2714
92.2k
    _parse_warn(p - q <= 1, "empty query name");
2715
92.2k
    _parse_err(p - q > 255, "query name too long");
2716
    // resize large enough for name + extranul
2717
92.2k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2718
92.2k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2719
2720
92.2k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2721
92.2k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2722
92.2k
    b->l_data += c->l_extranul;
2723
2724
92.2k
    c->l_qname = p - q + c->l_extranul;
2725
2726
    // flag
2727
92.2k
    c->flag = parse_sam_flag(p, &p, &overflow);
2728
92.2k
    if (*p++ != '\t') goto err_ret; // malformated flag
2729
2730
    // chr
2731
92.1k
    q = _read_token(p);
2732
92.1k
    if (strcmp(q, "*")) {
2733
77.7k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2734
77.7k
        c->tid = bam_name2id(h, q);
2735
77.7k
        _parse_err(c->tid < -1, "failed to parse header");
2736
77.7k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2737
77.7k
    } else c->tid = -1;
2738
2739
    // pos
2740
92.1k
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
2741
92.1k
    if (*p++ != '\t') goto err_ret;
2742
92.1k
    if (c->pos < 0 && c->tid >= 0) {
2743
5.72k
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2744
5.72k
        c->tid = -1;
2745
5.72k
    }
2746
92.1k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2747
2748
    // mapq
2749
92.1k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2750
92.1k
    if (*p++ != '\t') goto err_ret;
2751
    // cigar
2752
92.1k
    if (*p != '*') {
2753
77.1k
        uint32_t *cigar = NULL;
2754
77.1k
        int old_l_data = b->l_data;
2755
77.1k
        int n_cigar = bam_parse_cigar(p, &p, b);
2756
77.1k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2757
76.9k
        cigar = (uint32_t *)(b->data + old_l_data);
2758
2759
        // can't use bam_endpos() directly as some fields not yet set up
2760
76.9k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2761
76.9k
        if (cigreflen == 0) cigreflen = 1;
2762
76.9k
    } else {
2763
14.9k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2764
14.9k
        c->flag |= BAM_FUNMAP;
2765
14.9k
        q = _read_token(p);
2766
14.9k
        cigreflen = 1;
2767
14.9k
    }
2768
91.8k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2769
91.8k
               "read ends beyond highest supported position");
2770
91.8k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2771
    // mate chr
2772
91.8k
    q = _read_token(p);
2773
91.8k
    if (strcmp(q, "=") == 0) {
2774
3
        c->mtid = c->tid;
2775
91.8k
    } else if (strcmp(q, "*") == 0) {
2776
0
        c->mtid = -1;
2777
91.8k
    } else {
2778
91.8k
        c->mtid = bam_name2id(h, q);
2779
91.8k
        _parse_err(c->mtid < -1, "failed to parse header");
2780
91.8k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2781
91.8k
    }
2782
    // mpos
2783
91.8k
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
2784
91.8k
    if (*p++ != '\t') goto err_ret;
2785
91.8k
    if (c->mpos < 0 && c->mtid >= 0) {
2786
42.0k
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2787
42.0k
        c->mtid = -1;
2788
42.0k
    }
2789
    // tlen
2790
91.8k
    c->isize = hts_str2int(p, &p, 63, &overflow);
2791
91.8k
    if (*p++ != '\t') goto err_ret;
2792
91.8k
    _parse_err(overflow, "number outside allowed range");
2793
    // seq
2794
91.8k
    q = _read_token(p);
2795
91.8k
    if (strcmp(q, "*")) {
2796
62.1k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2797
62.1k
        c->l_qseq = p - q - 1;
2798
62.1k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2799
62.1k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2800
62.0k
        i = (c->l_qseq + 1) >> 1;
2801
62.0k
        _get_mem(uint8_t, &t, b, i);
2802
2803
62.0k
        unsigned int lqs2 = c->l_qseq&~1, i;
2804
62.3k
        for (i = 0; i < lqs2; i+=2)
2805
263
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2806
76.8k
        for (; i < c->l_qseq; ++i)
2807
14.7k
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2808
62.0k
    } else c->l_qseq = 0;
2809
    // qual
2810
183k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2811
183k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2812
14.5k
        memset(t, 0xff, c->l_qseq);
2813
14.5k
        p += 2;
2814
77.2k
    } else {
2815
77.2k
        int failed = 0;
2816
77.2k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2817
77.2k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2818
77.2k
                   "SEQ and QUAL are of different length");
2819
77.2k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2820
77.2k
        _parse_err(failed, "invalid QUAL character");
2821
77.2k
        p += c->l_qseq + 1;
2822
77.2k
    }
2823
2824
    // aux
2825
91.7k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2826
136
        goto err_ret;
2827
2828
91.6k
    if (bam_tag2cigar(b, 1, 1) < 0)
2829
0
        return -2;
2830
91.6k
    return 0;
2831
2832
0
#undef _parse_warn
2833
0
#undef _parse_err
2834
0
#undef _get_mem
2835
0
#undef _read_token
2836
700
err_ret:
2837
700
    return -2;
2838
91.6k
}
2839
2840
77.1k
static uint32_t read_ncigar(const char *q) {
2841
77.1k
    uint32_t n_cigar = 0;
2842
1.24M
    for (; *q && *q != '\t'; ++q)
2843
1.16M
        if (!isdigit_c(*q)) ++n_cigar;
2844
77.1k
    if (!n_cigar) {
2845
31
        hts_log_error("No CIGAR operations");
2846
31
        return 0;
2847
31
    }
2848
77.1k
    if (n_cigar >= 2147483647) {
2849
0
        hts_log_error("Too many CIGAR operations");
2850
0
        return 0;
2851
0
    }
2852
2853
77.1k
    return n_cigar;
2854
77.1k
}
2855
2856
/*! @function
2857
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2858
 @param  in      [in]  pointer to the source string
2859
 @param  a_cigar [out]  address of the destination uint32_t buffer
2860
 @return         number of processed input characters; 0 on error
2861
 */
2862
77.1k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2863
77.1k
    int i, overflow = 0;
2864
77.1k
    const char *p = in;
2865
229k
    for (i = 0; i < n_cigar; i++) {
2866
152k
        uint32_t len;
2867
152k
        int op;
2868
152k
        char *q;
2869
152k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2870
152k
        if (q == p) {
2871
36
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2872
36
            return 0;
2873
36
        }
2874
152k
        if (overflow) {
2875
21
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2876
21
            return 0;
2877
21
        }
2878
152k
        p = q;
2879
152k
        op = bam_cigar_table[(unsigned char)*p++];
2880
152k
        if (op < 0) {
2881
103
            hts_log_error("Unrecognized CIGAR operator");
2882
103
            return 0;
2883
103
        }
2884
152k
        a_cigar[i] = len;
2885
152k
        a_cigar[i] |= op;
2886
152k
    }
2887
2888
76.9k
    return p-in;
2889
77.1k
}
2890
2891
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2892
0
    size_t n_cigar = 0;
2893
0
    int diff;
2894
2895
0
    if (!in || !a_cigar || !a_mem) {
2896
0
        hts_log_error("NULL pointer arguments");
2897
0
        return -1;
2898
0
    }
2899
0
    if (end) *end = (char *)in;
2900
2901
0
    if (*in == '*') {
2902
0
        if (end) (*end)++;
2903
0
        return 0;
2904
0
    }
2905
0
    n_cigar = read_ncigar(in);
2906
0
    if (!n_cigar) return 0;
2907
0
    if (n_cigar > *a_mem) {
2908
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2909
0
        if (a_tmp) {
2910
0
            *a_cigar = a_tmp;
2911
0
            *a_mem = n_cigar;
2912
0
        } else {
2913
0
            hts_log_error("Memory allocation error");
2914
0
            return -1;
2915
0
        }
2916
0
    }
2917
2918
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2919
0
    if (end) *end = (char *)in+diff;
2920
2921
0
    return n_cigar;
2922
0
}
2923
2924
77.1k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2925
77.1k
    size_t n_cigar = 0;
2926
77.1k
    int diff;
2927
2928
77.1k
    if (!in || !b) {
2929
0
        hts_log_error("NULL pointer arguments");
2930
0
        return -1;
2931
0
    }
2932
77.1k
    if (end) *end = (char *)in;
2933
2934
77.1k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
2935
77.1k
    if (!n_cigar && b->core.n_cigar == 0) {
2936
31
        if (end) *end = (char *)in+1;
2937
31
        return 0;
2938
31
    }
2939
2940
77.1k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
2941
77.1k
    if (cig_diff > 0 &&
2942
77.1k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
2943
0
        hts_log_error("Memory allocation error");
2944
0
        return -1;
2945
0
    }
2946
2947
77.1k
    uint32_t *cig = bam_get_cigar(b);
2948
77.1k
    if ((uint8_t *)cig != b->data + b->l_data) {
2949
        // Modifying an BAM existing BAM record
2950
0
        uint8_t  *seq = bam_get_seq(b);
2951
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
2952
0
    }
2953
2954
77.1k
    if (n_cigar) {
2955
77.1k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
2956
160
            return -1;
2957
77.1k
    } else {
2958
0
        diff = 1; // handle "*"
2959
0
    }
2960
2961
76.9k
    b->l_data += cig_diff * sizeof(uint32_t);
2962
76.9k
    b->core.n_cigar = n_cigar;
2963
76.9k
    if (end) *end = (char *)in + diff;
2964
2965
76.9k
    return n_cigar;
2966
77.1k
}
2967
2968
/*
2969
 * -----------------------------------------------------------------------------
2970
 * SAM threading
2971
 */
2972
// Size of SAM text block (reading)
2973
0
#define SAM_NBYTES 240000
2974
2975
// Number of BAM records (writing, up to NB_mem in size)
2976
0
#define SAM_NBAM 1000
2977
2978
struct SAM_state;
2979
2980
// Output job - a block of BAM records
2981
typedef struct sp_bams {
2982
    struct sp_bams *next;
2983
    int serial;
2984
2985
    bam1_t *bams;
2986
    int nbams, abams; // used and alloc for bams[] array
2987
    size_t bam_mem;   // very approximate total size
2988
2989
    struct SAM_state *fd;
2990
} sp_bams;
2991
2992
// Input job - a block of SAM text
2993
typedef struct sp_lines {
2994
    struct sp_lines *next;
2995
    int serial;
2996
2997
    char *data;
2998
    int data_size;
2999
    int alloc;
3000
3001
    struct SAM_state *fd;
3002
    sp_bams *bams;
3003
} sp_lines;
3004
3005
enum sam_cmd {
3006
    SAM_NONE = 0,
3007
    SAM_CLOSE,
3008
    SAM_CLOSE_DONE,
3009
    SAM_AT_EOF,
3010
};
3011
3012
typedef struct SAM_state {
3013
    sam_hdr_t *h;
3014
3015
    hts_tpool *p;
3016
    int own_pool;
3017
    pthread_mutex_t lines_m;
3018
    hts_tpool_process *q;
3019
    pthread_t dispatcher;
3020
    int dispatcher_set;
3021
3022
    sp_lines *lines;
3023
    sp_bams *bams;
3024
3025
    sp_bams *curr_bam;
3026
    int curr_idx;
3027
    int serial;
3028
3029
    // Be warned: moving these mutexes around in this struct can reduce
3030
    // threading performance by up to 70%!
3031
    pthread_mutex_t command_m;
3032
    pthread_cond_t command_c;
3033
    enum sam_cmd command;
3034
3035
    // One of the E* errno codes
3036
    int errcode;
3037
3038
    htsFile *fp;
3039
} SAM_state;
3040
3041
// Returns a SAM_state struct from a generic hFILE.
3042
//
3043
// Returns NULL on failure.
3044
0
static SAM_state *sam_state_create(htsFile *fp) {
3045
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3046
    // be a redirect call with an additional 'S' mode.  This in turn would
3047
    // correctly set the designed format to sam instead of a generic
3048
    // text_format.
3049
0
    if (fp->format.format != sam && fp->format.format != text_format)
3050
0
        return NULL;
3051
3052
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3053
0
    if (!fd)
3054
0
        return NULL;
3055
3056
0
    fp->state = fd;
3057
0
    fd->fp = fp;
3058
3059
0
    return fd;
3060
0
}
3061
3062
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3063
static void *sam_format_worker(void *arg);
3064
3065
0
static void sam_state_err(SAM_state *fd, int errcode) {
3066
0
    pthread_mutex_lock(&fd->command_m);
3067
0
    if (!fd->errcode)
3068
0
        fd->errcode = errcode;
3069
0
    pthread_mutex_unlock(&fd->command_m);
3070
0
}
3071
3072
0
static void sam_free_sp_bams(sp_bams *b) {
3073
0
    if (!b)
3074
0
        return;
3075
3076
0
    if (b->bams) {
3077
0
        int i;
3078
0
        for (i = 0; i < b->abams; i++) {
3079
0
            if (b->bams[i].data)
3080
0
                free(b->bams[i].data);
3081
0
        }
3082
0
        free(b->bams);
3083
0
    }
3084
0
    free(b);
3085
0
}
3086
3087
// Destroys the state produce by sam_state_create.
3088
2.60k
int sam_state_destroy(htsFile *fp) {
3089
2.60k
    int ret = 0;
3090
3091
2.60k
    if (!fp->state)
3092
2.60k
        return 0;
3093
3094
0
    SAM_state *fd = fp->state;
3095
0
    if (fd->p) {
3096
0
        if (fd->h) {
3097
            // Notify sam_dispatcher we're closing
3098
0
            pthread_mutex_lock(&fd->command_m);
3099
0
            if (fd->command != SAM_CLOSE_DONE)
3100
0
                fd->command = SAM_CLOSE;
3101
0
            pthread_cond_signal(&fd->command_c);
3102
0
            ret = -fd->errcode;
3103
0
            if (fd->q)
3104
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3105
3106
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3107
0
                for (;;) {
3108
                    // Avoid deadlocks with dispatcher
3109
0
                    if (fd->command == SAM_CLOSE_DONE)
3110
0
                        break;
3111
0
                    hts_tpool_wake_dispatch(fd->q);
3112
0
                    pthread_mutex_unlock(&fd->command_m);
3113
0
                    hts_usleep(10000);
3114
0
                    pthread_mutex_lock(&fd->command_m);
3115
0
                }
3116
0
            }
3117
0
            pthread_mutex_unlock(&fd->command_m);
3118
3119
0
            if (fp->is_write) {
3120
                // Dispatch the last partial block.
3121
0
                sp_bams *gb = fd->curr_bam;
3122
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3123
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3124
3125
                // Flush and drain output
3126
0
                if (fd->q)
3127
0
                    hts_tpool_process_flush(fd->q);
3128
0
                pthread_mutex_lock(&fd->command_m);
3129
0
                if (!ret) ret = -fd->errcode;
3130
0
                pthread_mutex_unlock(&fd->command_m);
3131
3132
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3133
0
                    hts_usleep(10000);
3134
0
                    pthread_mutex_lock(&fd->command_m);
3135
0
                    ret = -fd->errcode;
3136
                    // not empty but shutdown implies error
3137
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3138
0
                        ret = EIO;
3139
0
                    pthread_mutex_unlock(&fd->command_m);
3140
0
                }
3141
0
                if (fd->q)
3142
0
                    hts_tpool_process_shutdown(fd->q);
3143
0
            }
3144
3145
            // Wait for it to acknowledge
3146
0
            if (fd->dispatcher_set)
3147
0
                pthread_join(fd->dispatcher, NULL);
3148
0
            if (!ret) ret = -fd->errcode;
3149
0
        }
3150
3151
        // Tidy up memory
3152
0
        if (fd->q)
3153
0
            hts_tpool_process_destroy(fd->q);
3154
3155
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3156
0
            hts_tpool_destroy(fd->p);
3157
0
            fd->p = NULL;
3158
0
        }
3159
0
        pthread_mutex_destroy(&fd->lines_m);
3160
0
        pthread_mutex_destroy(&fd->command_m);
3161
0
        pthread_cond_destroy(&fd->command_c);
3162
3163
0
        sp_lines *l = fd->lines;
3164
0
        while (l) {
3165
0
            sp_lines *n = l->next;
3166
0
            free(l->data);
3167
0
            free(l);
3168
0
            l = n;
3169
0
        }
3170
3171
0
        sp_bams *b = fd->bams;
3172
0
        while (b) {
3173
0
            if (fd->curr_bam == b)
3174
0
                fd->curr_bam = NULL;
3175
0
            sp_bams *n = b->next;
3176
0
            sam_free_sp_bams(b);
3177
0
            b = n;
3178
0
        }
3179
3180
0
        if (fd->curr_bam)
3181
0
            sam_free_sp_bams(fd->curr_bam);
3182
3183
        // Decrement counter by one, maybe destroying too.
3184
        // This is to permit the caller using bam_hdr_destroy
3185
        // before sam_close without triggering decode errors
3186
        // in the background threads.
3187
0
        bam_hdr_destroy(fd->h);
3188
0
    }
3189
3190
0
    free(fp->state);
3191
0
    fp->state = NULL;
3192
0
    return ret;
3193
2.60k
}
3194
3195
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3196
0
static void cleanup_sp_lines(void *arg) {
3197
0
    sp_lines *gl = (sp_lines *)arg;
3198
0
    if (!gl) return;
3199
3200
    // Should always be true for lines passed to / from thread workers.
3201
0
    assert(gl->next == NULL);
3202
3203
0
    free(gl->data);
3204
0
    sam_free_sp_bams(gl->bams);
3205
0
    free(gl);
3206
0
}
3207
3208
// Run from one of the worker threads.
3209
// Convert a passed in array of lines to array of BAMs, returning
3210
// the result back to the thread queue.
3211
0
static void *sam_parse_worker(void *arg) {
3212
0
    sp_lines *gl = (sp_lines *)arg;
3213
0
    sp_bams *gb = NULL;
3214
0
    char *lines = gl->data;
3215
0
    int i;
3216
0
    bam1_t *b;
3217
0
    SAM_state *fd = gl->fd;
3218
3219
    // Use a block of BAM structs we had earlier if available.
3220
0
    pthread_mutex_lock(&fd->lines_m);
3221
0
    if (fd->bams) {
3222
0
        gb = fd->bams;
3223
0
        fd->bams = gb->next;
3224
0
    }
3225
0
    pthread_mutex_unlock(&fd->lines_m);
3226
3227
0
    if (gb == NULL) {
3228
0
        gb = calloc(1, sizeof(*gb));
3229
0
        if (!gb) {
3230
0
            return NULL;
3231
0
        }
3232
0
        gb->abams = 100;
3233
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3234
0
        if (!gb->bams) {
3235
0
            sam_state_err(fd, ENOMEM);
3236
0
            goto err;
3237
0
        }
3238
0
        gb->nbams = 0;
3239
0
        gb->bam_mem = 0;
3240
0
    }
3241
0
    gb->serial = gl->serial;
3242
0
    gb->next = NULL;
3243
3244
0
    b = (bam1_t *)gb->bams;
3245
0
    if (!b) {
3246
0
        sam_state_err(fd, ENOMEM);
3247
0
        goto err;
3248
0
    }
3249
3250
0
    i = 0;
3251
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3252
0
    while (cp < cp_end) {
3253
0
        if (i >= gb->abams) {
3254
0
            int old_abams = gb->abams;
3255
0
            gb->abams *= 2;
3256
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3257
0
            if (!b) {
3258
0
                gb->abams /= 2;
3259
0
                sam_state_err(fd, ENOMEM);
3260
0
                goto err;
3261
0
            }
3262
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3263
0
            gb->bams = b;
3264
0
        }
3265
3266
        // Ideally we'd get sam_parse1 to return the number of
3267
        // bytes decoded and to be able to stop on newline as
3268
        // well as \0.
3269
        //
3270
        // We can then avoid the additional strchr loop.
3271
        // It's around 6% of our CPU cost, albeit threadable.
3272
        //
3273
        // However this is an API change so for now we copy.
3274
3275
0
        char *nl = strchr(cp, '\n');
3276
0
        char *line_end;
3277
0
        if (nl) {
3278
0
            line_end = nl;
3279
0
            if (line_end > cp && *(line_end - 1) == '\r')
3280
0
                line_end--;
3281
0
            nl++;
3282
0
        } else {
3283
0
            nl = line_end = cp_end;
3284
0
        }
3285
0
        *line_end = '\0';
3286
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3287
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3288
0
            sam_state_err(fd, errno ? errno : EIO);
3289
0
            cleanup_sp_lines(gl);
3290
0
            goto err;
3291
0
        }
3292
3293
0
        cp = nl;
3294
0
        i++;
3295
0
    }
3296
0
    gb->nbams = i;
3297
3298
0
    pthread_mutex_lock(&fd->lines_m);
3299
0
    gl->next = fd->lines;
3300
0
    fd->lines = gl;
3301
0
    pthread_mutex_unlock(&fd->lines_m);
3302
0
    return gb;
3303
3304
0
 err:
3305
0
    sam_free_sp_bams(gb);
3306
0
    return NULL;
3307
0
}
3308
3309
0
static void *sam_parse_eof(void *arg) {
3310
0
    return NULL;
3311
0
}
3312
3313
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3314
0
static void cleanup_sp_bams(void *arg) {
3315
0
    sam_free_sp_bams((sp_bams *) arg);
3316
0
}
3317
3318
// Runs in its own thread.
3319
// Reads a block of text (SAM) and sends a new job to the thread queue to
3320
// translate this to BAM.
3321
0
static void *sam_dispatcher_read(void *vp) {
3322
0
    htsFile *fp = vp;
3323
0
    kstring_t line = {0};
3324
0
    int line_frag = 0;
3325
0
    SAM_state *fd = fp->state;
3326
0
    sp_lines *l = NULL;
3327
3328
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3329
    // matter as it will grow if necessary).
3330
0
    if (ks_resize(&line, 1000) < 0)
3331
0
        goto err;
3332
3333
0
    for (;;) {
3334
        // Check for command
3335
0
        pthread_mutex_lock(&fd->command_m);
3336
0
        switch (fd->command) {
3337
3338
0
        case SAM_CLOSE:
3339
0
            pthread_cond_signal(&fd->command_c);
3340
0
            pthread_mutex_unlock(&fd->command_m);
3341
0
            hts_tpool_process_shutdown(fd->q);
3342
0
            goto tidyup;
3343
3344
0
        default:
3345
0
            break;
3346
0
        }
3347
0
        pthread_mutex_unlock(&fd->command_m);
3348
3349
0
        pthread_mutex_lock(&fd->lines_m);
3350
0
        if (fd->lines) {
3351
            // reuse existing line buffer
3352
0
            l = fd->lines;
3353
0
            fd->lines = l->next;
3354
0
        }
3355
0
        pthread_mutex_unlock(&fd->lines_m);
3356
3357
0
        if (l == NULL) {
3358
            // none to reuse, to create a new one
3359
0
            l = calloc(1, sizeof(*l));
3360
0
            if (!l)
3361
0
                goto err;
3362
0
            l->alloc = SAM_NBYTES;
3363
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3364
0
            if (!l->data) {
3365
0
                free(l);
3366
0
                l = NULL;
3367
0
                goto err;
3368
0
            }
3369
0
            l->fd = fd;
3370
0
        }
3371
0
        l->next = NULL;
3372
3373
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3374
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3375
0
            if (!rp)
3376
0
                goto err;
3377
0
            l->alloc = line_frag+SAM_NBYTES/2;
3378
0
            l->data = rp;
3379
0
        }
3380
0
        memcpy(l->data, line.s, line_frag);
3381
3382
0
        l->data_size = line_frag;
3383
0
        ssize_t nbytes;
3384
0
    longer_line:
3385
0
        if (fp->is_bgzf)
3386
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3387
0
        else
3388
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3389
0
        if (nbytes < 0) {
3390
0
            sam_state_err(fd, errno ? errno : EIO);
3391
0
            goto err;
3392
0
        } else if (nbytes == 0)
3393
0
            break; // EOF
3394
0
        l->data_size += nbytes;
3395
3396
        // trim to last \n. Maybe \r\n, but that's still fine
3397
0
        if (nbytes == l->alloc - line_frag) {
3398
0
            char *cp_end = l->data + l->data_size;
3399
0
            char *cp = cp_end-1;
3400
3401
0
            while (cp > (char *)l->data && *cp != '\n')
3402
0
                cp--;
3403
3404
            // entire buffer is part of a single line
3405
0
            if (cp == l->data) {
3406
0
                line_frag = l->data_size;
3407
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3408
0
                if (!rp)
3409
0
                    goto err;
3410
0
                l->alloc *= 2;
3411
0
                l->data = rp;
3412
0
                assert(l->alloc >= l->data_size);
3413
0
                assert(l->alloc >= line_frag);
3414
0
                assert(l->alloc >= l->alloc - line_frag);
3415
0
                goto longer_line;
3416
0
            }
3417
0
            cp++;
3418
3419
            // line holds the remainder of our line.
3420
0
            if (ks_resize(&line, cp_end - cp) < 0)
3421
0
                goto err;
3422
0
            memcpy(line.s, cp, cp_end - cp);
3423
0
            line_frag = cp_end - cp;
3424
0
            l->data_size = l->alloc - line_frag;
3425
0
        } else {
3426
            // out of buffer
3427
0
            line_frag = 0;
3428
0
        }
3429
3430
0
        l->serial = fd->serial++;
3431
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3432
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3433
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3434
0
            goto err;
3435
0
        pthread_mutex_lock(&fd->command_m);
3436
0
        if (fd->command == SAM_CLOSE) {
3437
0
            pthread_mutex_unlock(&fd->command_m);
3438
0
            l = NULL;
3439
0
            goto tidyup;
3440
0
        }
3441
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3442
0
        pthread_mutex_unlock(&fd->command_m);
3443
0
    }
3444
3445
    // Submit a NULL sp_bams entry to act as an EOF marker
3446
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3447
0
        goto err;
3448
3449
    // At EOF, wait for close request.
3450
    // (In future if we add support for seek, this is where we need to catch it.)
3451
0
    for (;;) {
3452
0
        pthread_mutex_lock(&fd->command_m);
3453
0
        if (fd->command == SAM_NONE)
3454
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3455
0
        switch (fd->command) {
3456
0
        case SAM_CLOSE:
3457
0
            pthread_cond_signal(&fd->command_c);
3458
0
            pthread_mutex_unlock(&fd->command_m);
3459
0
            hts_tpool_process_shutdown(fd->q);
3460
0
            goto tidyup;
3461
3462
0
        default:
3463
0
            pthread_mutex_unlock(&fd->command_m);
3464
0
            break;
3465
0
        }
3466
0
    }
3467
3468
0
 tidyup:
3469
0
    pthread_mutex_lock(&fd->command_m);
3470
0
    fd->command = SAM_CLOSE_DONE;
3471
0
    pthread_cond_signal(&fd->command_c);
3472
0
    pthread_mutex_unlock(&fd->command_m);
3473
3474
0
    if (l) {
3475
0
        pthread_mutex_lock(&fd->lines_m);
3476
0
        l->next = fd->lines;
3477
0
        fd->lines = l;
3478
0
        pthread_mutex_unlock(&fd->lines_m);
3479
0
    }
3480
0
    free(line.s);
3481
3482
0
    return NULL;
3483
3484
0
 err:
3485
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3486
0
    hts_tpool_process_shutdown(fd->q);
3487
0
    goto tidyup;
3488
0
}
3489
3490
// Runs in its own thread.
3491
// Takes encoded blocks of SAM off the thread results queue and writes them
3492
// to our output stream.
3493
0
static void *sam_dispatcher_write(void *vp) {
3494
0
    htsFile *fp = vp;
3495
0
    SAM_state *fd = fp->state;
3496
0
    hts_tpool_result *r;
3497
3498
    // Iterates until result queue is shutdown, where it returns NULL.
3499
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3500
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3501
0
        if (!gl) {
3502
0
            sam_state_err(fd, ENOMEM);
3503
0
            goto err;
3504
0
        }
3505
3506
0
        if (fp->idx) {
3507
0
            sp_bams *gb = gl->bams;
3508
0
            int i = 0, count = 0;
3509
0
            while (i < gl->data_size) {
3510
0
                int j = i;
3511
0
                while (i < gl->data_size && gl->data[i] != '\n')
3512
0
                    i++;
3513
0
                if (i < gl->data_size)
3514
0
                    i++;
3515
3516
0
                if (fp->is_bgzf) {
3517
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3518
0
                        goto err;
3519
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3520
0
                        goto err;
3521
0
                } else {
3522
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3523
0
                        goto err;
3524
0
                }
3525
3526
0
                bam1_t *b = &gb->bams[count++];
3527
0
                if (fp->format.compression == bgzf) {
3528
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3529
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3530
0
                                      bgzf_tell(fp->fp.bgzf),
3531
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3532
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3533
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3534
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3535
0
                        goto err;
3536
0
                    }
3537
0
                } else {
3538
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3539
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3540
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3541
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3542
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3543
0
                        goto err;
3544
0
                    }
3545
0
                }
3546
0
            }
3547
3548
0
            assert(count == gb->nbams);
3549
3550
            // Add bam array to free-list
3551
0
            pthread_mutex_lock(&fd->lines_m);
3552
0
            gb->next = fd->bams;
3553
0
            fd->bams = gl->bams;
3554
0
            gl->bams = NULL;
3555
0
            pthread_mutex_unlock(&fd->lines_m);
3556
0
        } else {
3557
0
            if (fp->is_bgzf) {
3558
                // We keep track of how much in the current block we have
3559
                // remaining => R.  We look for the last newline in input
3560
                // [i] to [i+R], backwards => position N.
3561
                //
3562
                // If we find a newline, we write out bytes i to N.
3563
                // We know we cannot fit the next record in this bgzf block,
3564
                // so we flush what we have and copy input N to i+R into
3565
                // the start of a new block, and recompute a new R for that.
3566
                //
3567
                // If we don't find a newline (i==N) then we cannot extend
3568
                // the current block at all, so flush whatever is in it now
3569
                // if it ends on a newline.
3570
                // We still copy i(==N) to i+R to the next block and
3571
                // continue as before with a new R.
3572
                //
3573
                // The only exception on the flush is when we run out of
3574
                // data in the input.  In that case we skip it as we don't
3575
                // yet know if the next record will fit.
3576
                //
3577
                // Both conditions share the same code here:
3578
                // - Look for newline (pos N)
3579
                // - Write i to N (which maybe 0)
3580
                // - Flush if block ends on newline and not end of input
3581
                // - write N to i+R
3582
3583
0
                int i = 0;
3584
0
                BGZF *fb = fp->fp.bgzf;
3585
0
                while (i < gl->data_size) {
3586
                    // remaining space in block
3587
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3588
0
                    int eod = 0;
3589
0
                    if (R > gl->data_size-i)
3590
0
                        R = gl->data_size-i, eod = 1;
3591
3592
                    // Find last newline in input data
3593
0
                    int N = i + R;
3594
0
                    while (--N > i) {
3595
0
                        if (gl->data[N] == '\n')
3596
0
                            break;
3597
0
                    }
3598
3599
0
                    if (N != i) {
3600
                        // Found a newline
3601
0
                        N++;
3602
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3603
0
                            goto err;
3604
0
                    }
3605
3606
                    // Flush bgzf block
3607
0
                    int b_off = fb->block_offset;
3608
0
                    if (!eod && b_off &&
3609
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3610
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3611
0
                            goto err;
3612
3613
                    // Copy from N onwards into next block
3614
0
                    if (i+R > N)
3615
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3616
0
                            != i+R - N)
3617
0
                            goto err;
3618
3619
0
                    i = i+R;
3620
0
                }
3621
0
            } else {
3622
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3623
0
                    goto err;
3624
0
            }
3625
0
        }
3626
3627
0
        hts_tpool_delete_result(r, 0);
3628
3629
        // Also updated by main thread
3630
0
        pthread_mutex_lock(&fd->lines_m);
3631
0
        gl->next = fd->lines;
3632
0
        fd->lines = gl;
3633
0
        pthread_mutex_unlock(&fd->lines_m);
3634
0
    }
3635
3636
0
    sam_state_err(fd, 0); // success
3637
0
    hts_tpool_process_shutdown(fd->q);
3638
0
    return NULL;
3639
3640
0
 err:
3641
0
    sam_state_err(fd, errno ? errno : EIO);
3642
0
    return (void *)-1;
3643
0
}
3644
3645
// Run from one of the worker threads.
3646
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3647
// of text SAM records (sp_lines).
3648
0
static void *sam_format_worker(void *arg) {
3649
0
    sp_bams *gb = (sp_bams *)arg;
3650
0
    sp_lines *gl = NULL;
3651
0
    int i;
3652
0
    SAM_state *fd = gb->fd;
3653
0
    htsFile *fp = fd->fp;
3654
3655
    // Use a block of SAM strings we had earlier if available.
3656
0
    pthread_mutex_lock(&fd->lines_m);
3657
0
    if (fd->lines) {
3658
0
        gl = fd->lines;
3659
0
        fd->lines = gl->next;
3660
0
    }
3661
0
    pthread_mutex_unlock(&fd->lines_m);
3662
3663
0
    if (gl == NULL) {
3664
0
        gl = calloc(1, sizeof(*gl));
3665
0
        if (!gl) {
3666
0
            sam_state_err(fd, ENOMEM);
3667
0
            return NULL;
3668
0
        }
3669
0
        gl->alloc = gl->data_size = 0;
3670
0
        gl->data = NULL;
3671
0
    }
3672
0
    gl->serial = gb->serial;
3673
0
    gl->next = NULL;
3674
3675
0
    kstring_t ks = {0, gl->alloc, gl->data};
3676
3677
0
    for (i = 0; i < gb->nbams; i++) {
3678
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3679
0
            sam_state_err(fd, errno ? errno : EIO);
3680
0
            goto err;
3681
0
        }
3682
0
        kputc('\n', &ks);
3683
0
    }
3684
3685
0
    pthread_mutex_lock(&fd->lines_m);
3686
0
    gl->data_size = ks.l;
3687
0
    gl->alloc = ks.m;
3688
0
    gl->data = ks.s;
3689
3690
0
    if (fp->idx) {
3691
        // Keep hold of the bam array a little longer as
3692
        // sam_dispatcher_write needs to use them for building the index.
3693
0
        gl->bams = gb;
3694
0
    } else {
3695
        // Add bam array to free-list
3696
0
        gb->next = fd->bams;
3697
0
        fd->bams = gb;
3698
0
    }
3699
0
    pthread_mutex_unlock(&fd->lines_m);
3700
3701
0
    return gl;
3702
3703
0
 err:
3704
    // Possible race between this and fd->curr_bam.
3705
    // Easier to not free and leave it on the input list so it
3706
    // gets freed there instead?
3707
    // sam_free_sp_bams(gb);
3708
0
    if (gl) {
3709
0
        free(gl->data);
3710
0
        free(gl);
3711
0
    }
3712
0
    return NULL;
3713
0
}
3714
3715
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3716
0
    if (fp->state)
3717
0
        return 0;
3718
3719
0
    if (!(fp->state = sam_state_create(fp)))
3720
0
        return -1;
3721
0
    SAM_state *fd = (SAM_state *)fp->state;
3722
3723
0
    pthread_mutex_init(&fd->lines_m, NULL);
3724
0
    pthread_mutex_init(&fd->command_m, NULL);
3725
0
    pthread_cond_init(&fd->command_c, NULL);
3726
0
    fd->p = p->pool;
3727
0
    int qsize = p->qsize;
3728
0
    if (!qsize)
3729
0
        qsize = 2*hts_tpool_size(fd->p);
3730
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3731
0
    if (!fd->q) {
3732
0
        sam_state_destroy(fp);
3733
0
        return -1;
3734
0
    }
3735
3736
0
    if (fp->format.compression == bgzf)
3737
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3738
3739
0
    return 0;
3740
0
}
3741
3742
0
int sam_set_threads(htsFile *fp, int nthreads) {
3743
0
    if (nthreads <= 0)
3744
0
        return 0;
3745
3746
0
    htsThreadPool p;
3747
0
    p.pool = hts_tpool_init(nthreads);
3748
0
    p.qsize = nthreads*2;
3749
3750
0
    int ret = sam_set_thread_pool(fp, &p);
3751
0
    if (ret < 0)
3752
0
        return ret;
3753
3754
0
    SAM_state *fd = (SAM_state *)fp->state;
3755
0
    fd->own_pool = 1;
3756
3757
0
    return 0;
3758
0
}
3759
3760
0
#define UMI_TAGS 5
3761
typedef struct {
3762
    kstring_t name;
3763
    kstring_t comment; // NB: pointer into name, do not free
3764
    kstring_t seq;
3765
    kstring_t qual;
3766
    int casava;
3767
    int aux;
3768
    int rnum;
3769
    char BC[3];         // aux tag ID for barcode
3770
    char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
3771
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3772
    char nprefix;
3773
    int sra_names;
3774
    regex_t regex;
3775
} fastq_state;
3776
3777
// Initialise fastq state.
3778
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3779
648
static fastq_state *fastq_state_init(int name_char) {
3780
648
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3781
648
    if (!x)
3782
0
        return NULL;
3783
648
    strcpy(x->BC, "BC");
3784
648
    x->nprefix = name_char;
3785
    // Default Illumina naming convention
3786
648
    char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3787
648
    if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3788
0
        free(x);
3789
0
        return NULL;
3790
0
    }
3791
3792
648
    return x;
3793
648
}
3794
3795
864
void fastq_state_destroy(htsFile *fp) {
3796
864
    if (fp->state) {
3797
648
        fastq_state *x = (fastq_state *)fp->state;
3798
648
        if (x->tags)
3799
0
            kh_destroy(tag, x->tags);
3800
648
        ks_free(&x->name);
3801
648
        ks_free(&x->seq);
3802
648
        ks_free(&x->qual);
3803
648
        regfree(&x->regex);
3804
648
        free(fp->state);
3805
648
    }
3806
864
}
3807
3808
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3809
0
    va_list args;
3810
3811
0
    if (!fp)
3812
0
        return -1;
3813
0
    if (!fp->state)
3814
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3815
0
                                           ? '@' : '>')))
3816
0
            return -1;
3817
3818
0
    fastq_state *x = (fastq_state *)fp->state;
3819
3820
0
    switch (opt) {
3821
0
    case FASTQ_OPT_CASAVA:
3822
0
        x->casava = 1;
3823
0
        break;
3824
3825
0
    case FASTQ_OPT_NAME2:
3826
0
        x->sra_names = 1;
3827
0
        break;
3828
3829
0
    case FASTQ_OPT_AUX: {
3830
0
        va_start(args, opt);
3831
0
        x->aux = 1;
3832
0
        char *tag = va_arg(args, char *);
3833
0
        va_end(args);
3834
0
        if (tag && strcmp(tag, "1") != 0) {
3835
0
            if (!x->tags)
3836
0
                if (!(x->tags = kh_init(tag)))
3837
0
                    return -1;
3838
3839
0
            size_t i, tlen = strlen(tag);
3840
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
3841
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
3842
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3843
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3844
0
                    break;
3845
0
                }
3846
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
3847
0
                kh_put(tag, x->tags, tcode, &ret);
3848
0
                if (ret < 0)
3849
0
                    return -1;
3850
0
            }
3851
0
        }
3852
0
        break;
3853
0
    }
3854
3855
0
    case FASTQ_OPT_BARCODE: {
3856
0
        va_start(args, opt);
3857
0
        char *bc = va_arg(args, char *);
3858
0
        va_end(args);
3859
0
        strncpy(x->BC, bc, 2);
3860
0
        x->BC[2] = 0;
3861
0
        break;
3862
0
    }
3863
3864
0
    case FASTQ_OPT_UMI: {
3865
        // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3866
0
        va_start(args, opt);
3867
0
        char *bc = va_arg(args, char *), *bc_orig = bc;
3868
0
        va_end(args);
3869
0
        if (!bc || strcmp(bc, "1") == 0)
3870
0
            bc = "RX";
3871
0
        int ntags = 0, err = 0;
3872
0
        for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3873
0
            if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3874
0
                err = 1;
3875
0
                break;
3876
0
            }
3877
3878
0
            strncpy(x->UMI[ntags], bc, 3);
3879
0
            bc += 2;
3880
0
            if (*bc && *bc != ',') {
3881
0
                err = 1;
3882
0
                break;
3883
0
            }
3884
0
            bc+=(*bc==',');
3885
0
            x->UMI[ntags][2] = 0;
3886
0
        }
3887
0
        for (; ntags < UMI_TAGS; ntags++)
3888
0
            x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3889
3890
3891
0
        if (err)
3892
0
            hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3893
3894
0
        break;
3895
0
    }
3896
3897
0
    case FASTQ_OPT_UMI_REGEX: {
3898
0
        va_start(args, opt);
3899
0
        char *re = va_arg(args, char *);
3900
0
        va_end(args);
3901
3902
0
        regfree(&x->regex);
3903
0
        if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3904
0
            hts_log_error("Regular expression '%s' is not supported", re);
3905
0
            return -1;
3906
0
        }
3907
0
        break;
3908
0
    }
3909
3910
0
    case FASTQ_OPT_RNUM:
3911
0
        x->rnum = 1;
3912
0
        break;
3913
3914
0
    default:
3915
0
        break;
3916
0
    }
3917
0
    return 0;
3918
0
}
3919
3920
11.3M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
3921
11.3M
    fastq_state *x = (fastq_state *)fp->state;
3922
11.3M
    size_t i, l;
3923
11.3M
    int ret = 0;
3924
3925
11.3M
    if (fp->format.format == fasta_format && fp->line.s) {
3926
        // For FASTA we've already read the >name line; steal it
3927
        // Not the most efficient, but we don't optimise for fasta reading.
3928
11.3M
        if (fp->line.l == 0)
3929
192
            return -1; // EOF
3930
3931
11.3M
        free(x->name.s);
3932
11.3M
        x->name = fp->line;
3933
11.3M
        fp->line.l = fp->line.m = 0;
3934
11.3M
        fp->line.s = NULL;
3935
11.3M
    } else {
3936
        // Read a FASTQ format entry.
3937
1.13k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3938
1.13k
        if (ret == -1)
3939
3
            return -1;  // EOF
3940
1.13k
        else if (ret < -1)
3941
0
            return ret; // ERR
3942
1.13k
    }
3943
3944
    // Name
3945
11.3M
    if (*x->name.s != x->nprefix)
3946
9
        return -2;
3947
3948
    // Reverse the SRA strangeness of putting the run_name.number before
3949
    // the read name.
3950
11.3M
    i = 0;
3951
11.3M
    char *name = x->name.s+1;
3952
11.3M
    if (x->sra_names) {
3953
0
        char *cp = strpbrk(x->name.s, " \t");
3954
0
        if (cp) {
3955
0
            while (*cp == ' ' || *cp == '\t')
3956
0
                cp++;
3957
0
            *--cp = '@';
3958
0
            i = cp - x->name.s;
3959
0
            name = cp+1;
3960
0
        }
3961
0
    }
3962
3963
11.3M
    l = x->name.l;
3964
11.3M
    char *s = x->name.s;
3965
31.2M
    while (i < l && !isspace_c(s[i]))
3966
19.9M
        i++;
3967
11.3M
    if (i < l) {
3968
90.3k
        s[i] = 0;
3969
90.3k
        x->name.l = i++;
3970
90.3k
    }
3971
3972
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
3973
11.7M
    while (i < l && isspace_c(s[i]))
3974
445k
        i++;
3975
11.3M
    x->comment.s = s+i;
3976
11.3M
    x->comment.l = l - i;
3977
3978
    // Seq
3979
11.3M
    x->seq.l = 0;
3980
42.7M
    for (;;) {
3981
42.7M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3982
588
            if (fp->format.format == fastq_format || ret < -1)
3983
387
                return -2;
3984
42.7M
        if (ret == -1 ||
3985
42.7M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
3986
11.3M
            break;
3987
31.4M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3988
0
            return -2;
3989
31.4M
    }
3990
3991
    // Qual
3992
11.3M
    if (fp->format.format == fastq_format) {
3993
495
        size_t remainder = x->seq.l;
3994
495
        x->qual.l = 0;
3995
1.77k
        do {
3996
1.77k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
3997
0
                return -2;
3998
1.77k
            if (fp->line.l > remainder)
3999
9
                return -2;
4000
1.77k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4001
0
                return -2;
4002
1.77k
            remainder -= fp->line.l;
4003
1.77k
        } while (remainder > 0);
4004
4005
        // Decr qual
4006
170k
        for (i = 0; i < x->qual.l; i++)
4007
169k
            x->qual.s[i] -= '!';
4008
486
    }
4009
4010
11.3M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4011
11.3M
    if (x->name.l > 2 &&
4012
864k
        x->name.s[x->name.l-2] == '/' &&
4013
14.0k
        isdigit_c(x->name.s[x->name.l-1])) {
4014
13.5k
        switch(x->name.s[x->name.l-1]) {
4015
2.67k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4016
423
        case '2': flag |= BAM_FREAD2 | pflag; break;
4017
10.4k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4018
13.5k
        }
4019
13.5k
        x->name.s[x->name.l-=2] = 0;
4020
13.5k
    }
4021
4022
    // Strip Illumina formatted UMI off read-name
4023
11.3M
    char UMI_seq[256]; // maximum length in spec
4024
11.3M
    size_t UMI_len = 0;
4025
11.3M
    if (x->UMI[0][0]) {
4026
0
        regmatch_t match[3];
4027
0
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
4028
0
            && match[0].rm_so >= 0     // whole regex
4029
0
            && match[1].rm_so >= 0) {  // bracketted UMI component
4030
0
            UMI_len = match[1].rm_eo - match[1].rm_so;
4031
0
            if (UMI_len > 255) {
4032
0
                hts_log_error("SAM read name is too long");
4033
0
                return -2;
4034
0
            }
4035
4036
            // The SAMTags spec recommends (but not requires) separating
4037
            // barcodes with hyphen ('-').
4038
0
            size_t i;
4039
0
            for (i = 0; i < UMI_len; i++)
4040
0
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
4041
0
                    ? x->name.s[i+match[1].rm_so]
4042
0
                    : '-';
4043
4044
            // Move any trailing #num earlier in the name
4045
0
            if (UMI_len) {
4046
0
                UMI_seq[UMI_len++] = 0;
4047
4048
0
                x->name.l = match[1].rm_so;
4049
0
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
4050
0
                    x->name.l--; // remove colon too
4051
0
                char *cp = x->name.s + match[1].rm_eo;
4052
0
                while (*cp)
4053
0
                    x->name.s[x->name.l++] = *cp++;
4054
0
                x->name.s[x->name.l] = 0;
4055
0
            }
4056
0
        }
4057
0
    }
4058
4059
    // Convert to BAM
4060
11.3M
    ret = bam_set1(b,
4061
11.3M
                   x->name.s + x->name.l - name, name,
4062
11.3M
                   flag,
4063
11.3M
                   -1, -1, 0, // ref '*', pos, mapq,
4064
11.3M
                   0, NULL,     // no cigar,
4065
11.3M
                   -1, -1, 0,    // mate
4066
11.3M
                   x->seq.l, x->seq.s, x->qual.s,
4067
11.3M
                   0);
4068
11.3M
    if (ret < 0) return -2;
4069
4070
    // Add UMI tag if removed from read-name above
4071
11.3M
    if (UMI_len) {
4072
0
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4073
0
            ret = -2;
4074
0
    }
4075
4076
    // Identify Illumina CASAVA strings.
4077
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4078
11.3M
    char *barcode = NULL;
4079
11.3M
    int barcode_len = 0;
4080
11.3M
    kstring_t *kc = &x->comment;
4081
11.3M
    char *endptr;
4082
11.3M
    if (x->casava &&
4083
        // \d:[YN]:\d+:[ACGTN]+
4084
0
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4085
0
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4086
0
        && *endptr == ':') {
4087
4088
        // read num
4089
0
        switch(kc->s[0]) {
4090
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4091
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4092
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4093
0
        }
4094
4095
0
        if (kc->s[2] == 'Y')
4096
0
            b->core.flag |= BAM_FQCFAIL;
4097
4098
        // Barcode, maybe numeric in which case we skip it
4099
0
        if (!isdigit_c(endptr[1])) {
4100
0
            barcode = endptr+1;
4101
0
            for (i = barcode - kc->s; i < kc->l; i++)
4102
0
                if (isspace_c(kc->s[i]))
4103
0
                    break;
4104
4105
0
            kc->s[i] = 0;
4106
0
            barcode_len = i+1-(barcode - kc->s);
4107
0
        }
4108
0
    }
4109
4110
11.3M
    if (ret >= 0 && barcode_len)
4111
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4112
0
            ret = -2;
4113
4114
11.3M
    if (!x->aux)
4115
11.3M
        return ret;
4116
4117
    // Identify any SAM style aux tags in comments too.
4118
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4119
0
        ret = -2;
4120
4121
0
    return ret;
4122
11.3M
}
4123
4124
// Internal component of sam_read1 below
4125
209
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4126
209
    int ret = bam_read1(fp->fp.bgzf, b);
4127
209
    if (h && ret >= 0) {
4128
187
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4129
185
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4130
2
            errno = ERANGE;
4131
2
            return -3;
4132
2
        }
4133
187
    }
4134
207
    return ret;
4135
209
}
4136
4137
// Internal component of sam_read1 below
4138
530
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4139
530
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4140
530
    if (ret < 0)
4141
530
        return cram_eof(fp->fp.cram) ? -1 : -2;
4142
4143
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4144
0
        return -2;
4145
4146
0
    return ret;
4147
0
}
4148
4149
// Internal component of sam_read1 below
4150
92.8k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4151
92.8k
    int ret;
4152
4153
    // Consume 1st line after header parsing as it wasn't using peek
4154
92.8k
    if (fp->line.l != 0) {
4155
0
        ret = sam_parse1(&fp->line, h, b);
4156
0
        fp->line.l = 0;
4157
0
        return ret;
4158
0
    }
4159
4160
92.8k
    if (fp->state) {
4161
0
        SAM_state *fd = (SAM_state *)fp->state;
4162
4163
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4164
            // We don't support multi-threaded SAM parsing with seeks yet.
4165
0
            int ret;
4166
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4167
0
                errno = -ret;
4168
0
                return -2;
4169
0
            }
4170
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4171
0
                return -2;
4172
0
            fp->fp.bgzf->seeked = 0;
4173
0
            goto err_recover;
4174
0
        }
4175
4176
0
        if (!fd->h) {
4177
0
            fd->h = h;
4178
0
            fd->h->ref_count++;
4179
            // Ensure hrecs is initialised now as we don't want multiple
4180
            // threads trying to do this simultaneously.
4181
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4182
0
                return -2;
4183
4184
            // We can only do this once we've got a header
4185
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4186
0
                               fp) != 0)
4187
0
                return -2;
4188
0
            fd->dispatcher_set = 1;
4189
0
        }
4190
4191
0
        if (fd->h != h) {
4192
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4193
0
            return -2;
4194
0
        }
4195
4196
0
        sp_bams *gb = fd->curr_bam;
4197
0
        if (!gb) {
4198
0
            if (fd->errcode) {
4199
                // In case reader failed
4200
0
                errno = fd->errcode;
4201
0
                return -2;
4202
0
            }
4203
4204
0
            pthread_mutex_lock(&fd->command_m);
4205
0
            int cmd = fd->command;
4206
0
            pthread_mutex_unlock(&fd->command_m);
4207
0
            if (cmd == SAM_AT_EOF)
4208
0
                return -1;
4209
4210
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4211
0
            if (!r)
4212
0
                return -2;
4213
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4214
0
            hts_tpool_delete_result(r, 0);
4215
0
        }
4216
0
        if (!gb) {
4217
0
            pthread_mutex_lock(&fd->command_m);
4218
0
            fd->command = SAM_AT_EOF;
4219
0
            pthread_mutex_unlock(&fd->command_m);
4220
0
            return fd->errcode ? -2 : -1;
4221
0
        }
4222
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4223
0
        if (fd->curr_idx < gb->nbams)
4224
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4225
0
                return -2;
4226
0
        if (fd->curr_idx == gb->nbams) {
4227
0
            pthread_mutex_lock(&fd->lines_m);
4228
0
            gb->next = fd->bams;
4229
0
            fd->bams = gb;
4230
0
            pthread_mutex_unlock(&fd->lines_m);
4231
4232
0
            fd->curr_bam = NULL;
4233
0
            fd->curr_idx = 0;
4234
        // Consider prefetching next record?  I.e.
4235
        // } else {
4236
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4237
0
        }
4238
4239
0
        ret = 0;
4240
4241
92.8k
    } else  {
4242
92.8k
    err_recover:
4243
92.8k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4244
92.8k
        if (ret < 0) return ret;
4245
4246
92.3k
        ret = sam_parse1(&fp->line, h, b);
4247
92.3k
        fp->line.l = 0;
4248
92.3k
        if (ret < 0) {
4249
700
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4250
700
            if (h && h->ignore_sam_err) goto err_recover;
4251
700
        }
4252
92.3k
    }
4253
4254
92.3k
    return ret;
4255
92.8k
}
4256
4257
// Returns 0 on success,
4258
//        -1 on EOF,
4259
//       <-1 on error
4260
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4261
11.3M
{
4262
11.3M
    int ret, pass_filter;
4263
4264
11.3M
    do {
4265
11.3M
        switch (fp->format.format) {
4266
209
        case bam:
4267
209
            ret = sam_read1_bam(fp, h, b);
4268
209
            break;
4269
4270
530
        case cram:
4271
530
            ret = sam_read1_cram(fp, h, &b);
4272
530
            break;
4273
4274
92.8k
        case sam:
4275
92.8k
            ret = sam_read1_sam(fp, h, b);
4276
92.8k
            break;
4277
4278
11.3M
        case fasta_format:
4279
11.3M
        case fastq_format: {
4280
11.3M
            fastq_state *x = (fastq_state *)fp->state;
4281
11.3M
            if (!x) {
4282
648
                if (!(fp->state = fastq_state_init(fp->format.format
4283
648
                                                   == fastq_format ? '@' : '>')))
4284
0
                    return -2;
4285
648
            }
4286
4287
11.3M
            return fastq_parse1(fp, b);
4288
11.3M
        }
4289
4290
0
        case empty_format:
4291
0
            errno = EPIPE;
4292
0
            return -3;
4293
4294
0
        default:
4295
0
            errno = EFTYPE;
4296
0
            return -3;
4297
11.3M
        }
4298
4299
93.5k
        pass_filter = (ret >= 0 && fp->filter)
4300
93.5k
            ? sam_passes_filter(h, b, fp->filter)
4301
93.5k
            : 1;
4302
93.5k
    } while (pass_filter == 0);
4303
4304
93.5k
    return pass_filter < 0 ? -2 : ret;
4305
11.3M
}
4306
4307
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4308
// this code isn't vectorised and runs far slower than is necessary (even
4309
// with the restrict keyword being used).
4310
static inline void HTS_OPT3
4311
265
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4312
265
    uint32_t i;
4313
57.1k
    for (i = 0; i < len; i++)
4314
56.8k
        a[i] = b[i]+33;
4315
265
}
4316
4317
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4318
3.79M
{
4319
3.79M
    int i, r = 0;
4320
3.79M
    uint8_t *s, *end;
4321
3.79M
    const bam1_core_t *c = &b->core;
4322
4323
3.79M
    if (c->l_qname == 0)
4324
0
        return -1;
4325
3.79M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4326
3.79M
    r |= kputc_('\t', str); // query name
4327
3.79M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4328
3.79M
    if (c->tid >= 0) { // chr
4329
21.0k
        r |= kputs(h->target_name[c->tid] , str);
4330
21.0k
        r |= kputc_('\t', str);
4331
3.77M
    } else r |= kputsn_("*\t", 2, str);
4332
3.79M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4333
3.79M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4334
3.79M
    if (c->n_cigar) { // cigar
4335
25.7k
        uint32_t *cigar = bam_get_cigar(b);
4336
683k
        for (i = 0; i < c->n_cigar; ++i) {
4337
657k
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4338
657k
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4339
657k
        }
4340
3.77M
    } else r |= kputc_('*', str);
4341
3.79M
    r |= kputc_('\t', str);
4342
3.79M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4343
520
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4344
447
    else {
4345
447
        r |= kputs(h->target_name[c->mtid], str);
4346
447
        r |= kputc_('\t', str);
4347
447
    }
4348
3.79M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4349
3.79M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4350
3.79M
    if (c->l_qseq) { // seq and qual
4351
113k
        uint8_t *s = bam_get_seq(b);
4352
113k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4353
113k
        char *cp = str->s + str->l;
4354
4355
        // Sequence, 2 bases at a time
4356
113k
        nibble2base(s, cp, c->l_qseq);
4357
113k
        cp[c->l_qseq] = '\t';
4358
113k
        cp += c->l_qseq+1;
4359
4360
        // Quality
4361
113k
        s = bam_get_qual(b);
4362
113k
        i = 0;
4363
113k
        if (s[0] == 0xff) {
4364
113k
            cp[i++] = '*';
4365
113k
        } else {
4366
265
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4367
265
            i = c->l_qseq;
4368
265
        }
4369
113k
        cp[i] = 0;
4370
113k
        cp += i;
4371
113k
        str->l = cp - str->s;
4372
3.68M
    } else r |= kputsn_("*\t*", 3, str);
4373
4374
3.79M
    s = bam_get_aux(b); // aux
4375
3.79M
    end = b->data + b->l_data;
4376
4377
4.31M
    while (end - s >= 4) {
4378
517k
        r |= kputc_('\t', str);
4379
517k
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4380
7
            goto bad_aux;
4381
517k
    }
4382
3.79M
    r |= kputsn("", 0, str); // nul terminate
4383
3.79M
    if (r < 0) goto mem_err;
4384
4385
3.79M
    return str->l;
4386
4387
7
 bad_aux:
4388
7
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4389
7
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4390
7
    errno = EINVAL;
4391
7
    return -1;
4392
4393
0
 mem_err:
4394
0
    hts_log_error("Out of memory");
4395
0
    errno = ENOMEM;
4396
0
    return -1;
4397
3.79M
}
4398
4399
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4400
3.79M
{
4401
3.79M
    str->l = 0;
4402
3.79M
    return sam_format1_append(h, b, str);
4403
3.79M
}
4404
4405
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4406
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4407
0
{
4408
0
    unsigned flag = b->core.flag;
4409
0
    int i, e = 0, len = b->core.l_qseq;
4410
0
    uint8_t *seq, *qual;
4411
4412
0
    str->l = 0;
4413
4414
    // Name
4415
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4416
0
        return -1;
4417
4418
    // UMI tag
4419
0
    if (x && *x->UMI[0]) {
4420
        // Temporary copy of '#num' if present
4421
0
        char plex[256];
4422
0
        size_t len = str->l;
4423
0
        while (len && str->s[len] != ':' && str->s[len] != '#')
4424
0
            len--;
4425
4426
0
        if (str->s[len] == '#' && str->l - len < 255) {
4427
0
            memcpy(plex, &str->s[len], str->l - len);
4428
0
            plex[str->l - len] = 0;
4429
0
            str->l = len;
4430
0
        } else {
4431
0
            *plex = 0;
4432
0
        }
4433
4434
0
        uint8_t *bc = NULL;
4435
0
        int n;
4436
0
        for (n = 0; !bc && n < UMI_TAGS; n++)
4437
0
            bc = bam_aux_get(b, x->UMI[n]);
4438
0
        if (bc && *bc == 'Z') {
4439
0
            int err = kputc(':', str) < 0;
4440
            // Replace any non-alpha with '+'
4441
0
            while (*++bc)
4442
0
                err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4443
0
            if (err)
4444
0
                return -1;
4445
0
        }
4446
4447
0
        if (*plex && kputs(plex, str) < 0)
4448
0
            return -1;
4449
0
    }
4450
4451
    // /1 or /2 suffix
4452
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4453
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4454
0
        if (r12 == BAM_FREAD1) {
4455
0
            if (kputs("/1", str) == EOF)
4456
0
                return -1;
4457
0
        } else if (r12 == BAM_FREAD2) {
4458
0
            if (kputs("/2", str) == EOF)
4459
0
                return -1;
4460
0
        }
4461
0
    }
4462
4463
    // Illumina CASAVA tag.
4464
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4465
0
    if (x && x->casava) {
4466
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4467
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4468
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4469
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4470
0
                     bc ? (char *)bc+1 : "0") < 0)
4471
0
            return -1;
4472
4473
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4474
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4475
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4476
0
            str->s[str->l-1] = '0';
4477
0
            str->s[str->l] = 0;
4478
0
            bc = NULL;
4479
0
        }
4480
4481
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4482
0
        if (bc) {
4483
0
            int l = strlen((char *)bc+1);
4484
0
            char *c = (char *)str->s + str->l - l;
4485
0
            for (i = 0; i < l; i++) {
4486
0
                if (!isalpha_c(c[i]))
4487
0
                    c[i] = '+';
4488
0
                else if (islower_c(c[i]))
4489
0
                    c[i] = toupper_c(c[i]);
4490
0
            }
4491
0
        }
4492
0
    }
4493
4494
    // Aux tags
4495
0
    if (x && x->aux) {
4496
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4497
0
        while (s && end - s >= 4) {
4498
0
            int tt = s[0]*256 + s[1];
4499
0
            if (x->tags == NULL ||
4500
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4501
0
                e |= kputc_('\t', str) < 0;
4502
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4503
0
                    return -1;
4504
0
            } else {
4505
0
                s = skip_aux(s+2, end);
4506
0
            }
4507
0
        }
4508
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4509
0
    }
4510
4511
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4512
0
    e |= kputc_('\n', str) < 0;
4513
4514
    // Seq line
4515
0
    seq = bam_get_seq(b);
4516
0
    if (flag & BAM_FREVERSE)
4517
0
        for (i = len-1; i >= 0; i--)
4518
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4519
0
    else
4520
0
        for (i = 0; i < len; i++)
4521
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4522
4523
4524
    // Qual line
4525
0
    if (x->nprefix == '@') {
4526
0
        kputsn("\n+\n", 3, str);
4527
0
        qual = bam_get_qual(b);
4528
0
        if (qual[0] == 0xff)
4529
0
            for (i = 0; i < len; i++)
4530
0
                e |= kputc_('B', str) < 0;
4531
0
        else if (flag & BAM_FREVERSE)
4532
0
            for (i = len-1; i >= 0; i--)
4533
0
                e |= kputc_(33 + qual[i], str) < 0;
4534
0
        else
4535
0
            for (i = 0; i < len; i++)
4536
0
                e |= kputc_(33 + qual[i], str) < 0;
4537
4538
0
    }
4539
0
    e |= kputc('\n', str) < 0;
4540
4541
0
    return e ? -1 : str->l;
4542
0
}
4543
4544
// Sadly we need to be able to modify the bam_hdr here so we can
4545
// reference count the structure.
4546
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4547
11.3M
{
4548
11.3M
    switch (fp->format.format) {
4549
0
    case binary_format:
4550
0
        fp->format.category = sequence_data;
4551
0
        fp->format.format = bam;
4552
        /* fall-through */
4553
3.79M
    case bam:
4554
3.79M
        return bam_write_idx1(fp, h, b);
4555
4556
3.79M
    case cram:
4557
3.79M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4558
4559
0
    case text_format:
4560
0
        fp->format.category = sequence_data;
4561
0
        fp->format.format = sam;
4562
        /* fall-through */
4563
3.79M
    case sam:
4564
3.79M
        if (fp->state) {
4565
0
            SAM_state *fd = (SAM_state *)fp->state;
4566
4567
            // Threaded output
4568
0
            if (!fd->h) {
4569
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4570
                // just data pointed to by it (which is a bit weasely still),
4571
                // but out cached pointer must be non-const as we want to
4572
                // destroy it later on and sam_hdr_destroy takes non-const.
4573
                //
4574
                // We do this because some tools do sam_hdr_destroy; sam_close
4575
                // while others do sam_close; sam_hdr_destroy.  The former is
4576
                // an issue as we need the header still when flushing.
4577
0
                fd->h = (sam_hdr_t *)h;
4578
0
                fd->h->ref_count++;
4579
4580
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4581
0
                                   fp) != 0)
4582
0
                    return -2;
4583
0
                fd->dispatcher_set = 1;
4584
0
            }
4585
4586
0
            if (fd->h != h) {
4587
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4588
0
                return -2;
4589
0
            }
4590
4591
            // Find a suitable BAM array to copy to
4592
0
            sp_bams *gb = fd->curr_bam;
4593
0
            if (!gb) {
4594
0
                pthread_mutex_lock(&fd->lines_m);
4595
0
                if (fd->bams) {
4596
0
                    fd->curr_bam = gb = fd->bams;
4597
0
                    fd->bams = gb->next;
4598
0
                    gb->next = NULL;
4599
0
                    gb->nbams = 0;
4600
0
                    gb->bam_mem = 0;
4601
0
                    pthread_mutex_unlock(&fd->lines_m);
4602
0
                } else {
4603
0
                    pthread_mutex_unlock(&fd->lines_m);
4604
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4605
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4606
0
                        free(gb);
4607
0
                        return -1;
4608
0
                    }
4609
0
                    gb->nbams = 0;
4610
0
                    gb->abams = SAM_NBAM;
4611
0
                    gb->bam_mem = 0;
4612
0
                    gb->fd = fd;
4613
0
                    fd->curr_idx = 0;
4614
0
                    fd->curr_bam = gb;
4615
0
                }
4616
0
            }
4617
4618
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4619
0
                return -2;
4620
0
            gb->bam_mem += b->l_data + sizeof(*b);
4621
4622
            // Dispatch if full
4623
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4624
0
                gb->serial = fd->serial++;
4625
0
                pthread_mutex_lock(&fd->command_m);
4626
0
                if (fd->errcode != 0) {
4627
0
                    pthread_mutex_unlock(&fd->command_m);
4628
0
                    return -fd->errcode;
4629
0
                }
4630
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4631
0
                                        cleanup_sp_bams,
4632
0
                                        cleanup_sp_lines, 0) < 0) {
4633
0
                    pthread_mutex_unlock(&fd->command_m);
4634
0
                    return -1;
4635
0
                }
4636
0
                pthread_mutex_unlock(&fd->command_m);
4637
0
                fd->curr_bam = NULL;
4638
0
            }
4639
4640
            // Dummy value as we don't know how long it really is.
4641
            // We could track file sizes via a SAM_state field, but I don't think
4642
            // it is necessary.
4643
0
            return 1;
4644
3.79M
        } else {
4645
3.79M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4646
3.79M
            kputc('\n', &fp->line);
4647
3.79M
            if (fp->is_bgzf) {
4648
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4649
0
                    return -1;
4650
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4651
3.79M
            } else {
4652
3.79M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4653
3.79M
            }
4654
4655
3.79M
            if (fp->idx) {
4656
0
                if (fp->format.compression == bgzf) {
4657
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4658
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4659
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4660
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4661
0
                        return -1;
4662
0
                    }
4663
0
                } else {
4664
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4665
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4666
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4667
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4668
0
                        return -1;
4669
0
                    }
4670
0
                }
4671
0
            }
4672
4673
3.79M
            return fp->line.l;
4674
3.79M
        }
4675
4676
4677
0
    case fasta_format:
4678
0
    case fastq_format: {
4679
0
        fastq_state *x = (fastq_state *)fp->state;
4680
0
        if (!x) {
4681
0
            if (!(fp->state = fastq_state_init(fp->format.format
4682
0
                                               == fastq_format ? '@' : '>')))
4683
0
                return -2;
4684
0
        }
4685
4686
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4687
0
            return -1;
4688
0
        if (fp->is_bgzf) {
4689
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4690
0
                return -1;
4691
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4692
0
                return -1;
4693
0
        } else {
4694
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4695
0
                return -1;
4696
0
        }
4697
0
        return fp->line.l;
4698
0
    }
4699
4700
0
    default:
4701
0
        errno = EBADF;
4702
0
        return -1;
4703
11.3M
    }
4704
11.3M
}
4705
4706
/************************
4707
 *** Auxiliary fields ***
4708
 ************************/
4709
#ifndef HTS_LITTLE_ENDIAN
4710
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4711
    int tsz = aux_type2size(type);
4712
4713
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4714
4715
    switch (tsz) {
4716
        case 'H': case 'Z': case 1:  // Trivial
4717
            memcpy(out, in, len);
4718
            break;
4719
4720
#define aux_val_to_le(type_t, store_le) do {                            \
4721
        type_t v;                                                       \
4722
        size_t i;                                                       \
4723
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4724
            memcpy(&v, in + i, sizeof(type_t));                         \
4725
            store_le(v, out);                                           \
4726
        }                                                               \
4727
    } while (0)
4728
4729
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4730
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4731
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4732
4733
#undef aux_val_to_le
4734
4735
        case 'B': { // Recurse!
4736
            uint32_t n;
4737
            if (len < 5) return -1;
4738
            memcpy(&n, in + 1, 4);
4739
            out[0] = in[0];
4740
            u32_to_le(n, out + 1);
4741
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4742
        }
4743
4744
        default: // Unknown type code
4745
            return -1;
4746
    }
4747
4748
4749
4750
    return 0;
4751
}
4752
#endif
4753
4754
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4755
0
{
4756
0
    uint32_t new_len;
4757
4758
0
    assert(b->l_data >= 0);
4759
0
    new_len = b->l_data + 3 + len;
4760
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4761
4762
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4763
4764
0
    b->data[b->l_data] = tag[0];
4765
0
    b->data[b->l_data + 1] = tag[1];
4766
0
    b->data[b->l_data + 2] = type;
4767
4768
0
#ifdef HTS_LITTLE_ENDIAN
4769
0
    memcpy(b->data + b->l_data + 3, data, len);
4770
#else
4771
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4772
        errno = EINVAL;
4773
        return -1;
4774
    }
4775
#endif
4776
4777
0
    b->l_data = new_len;
4778
4779
0
    return 0;
4780
4781
0
 nomem:
4782
0
    errno = ENOMEM;
4783
0
    return -1;
4784
0
}
4785
4786
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4787
1.40M
{
4788
1.40M
    int size;
4789
1.40M
    uint32_t n;
4790
1.40M
    if (s >= end) return end;
4791
1.40M
    size = aux_type2size(*s); ++s; // skip type
4792
1.40M
    switch (size) {
4793
241k
    case 'Z':
4794
249k
    case 'H':
4795
249k
        s = memchr(s, 0, end-s);
4796
249k
        return s ? s+1 : end;
4797
41.2k
    case 'B':
4798
41.2k
        if (end - s < 5) return NULL;
4799
41.2k
        size = aux_type2size(*s); ++s;
4800
41.2k
        n = le_to_u32(s);
4801
41.2k
        s += 4;
4802
41.2k
        if (size == 0 || end - s < size * n) return NULL;
4803
41.2k
        return s + size * n;
4804
47
    case 0:
4805
47
        return NULL;
4806
1.11M
    default:
4807
1.11M
        if (end - s < size) return NULL;
4808
1.11M
        return s + size;
4809
1.40M
    }
4810
1.40M
}
4811
4812
uint8_t *bam_aux_first(const bam1_t *b)
4813
3.84M
{
4814
3.84M
    uint8_t *s = bam_get_aux(b);
4815
3.84M
    uint8_t *end = b->data + b->l_data;
4816
3.84M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4817
68.9k
    return s+2;
4818
3.84M
}
4819
4820
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4821
1.39M
{
4822
1.39M
    uint8_t *end = b->data + b->l_data;
4823
1.39M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4824
1.39M
    if (next == NULL) goto bad_aux;
4825
1.39M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4826
1.33M
    return next+2;
4827
4828
47
 bad_aux:
4829
47
    hts_log_error("Corrupted aux data for read %s flag %d",
4830
47
                  bam_get_qname(b), b->core.flag);
4831
47
    errno = EINVAL;
4832
47
    return NULL;
4833
1.39M
}
4834
4835
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4836
3.84M
{
4837
3.84M
    uint8_t *s;
4838
5.24M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4839
1.40M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4840
            // Check the tag value is valid and complete
4841
14.5k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4842
14.5k
            if (e == NULL) goto bad_aux;
4843
14.5k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4844
4845
14.5k
            return s;
4846
14.5k
        }
4847
4848
    // errno now as set by bam_aux_first()/bam_aux_next()
4849
3.83M
    return NULL;
4850
4851
0
 bad_aux:
4852
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4853
0
                  bam_get_qname(b), b->core.flag);
4854
0
    errno = EINVAL;
4855
0
    return NULL;
4856
3.84M
}
4857
4858
int bam_aux_del(bam1_t *b, uint8_t *s)
4859
0
{
4860
0
    s = bam_aux_remove(b, s);
4861
0
    return (s || errno == ENOENT)? 0 : -1;
4862
0
}
4863
4864
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4865
0
{
4866
0
    uint8_t *end = b->data + b->l_data;
4867
0
    uint8_t *next = skip_aux(s, end);
4868
0
    if (next == NULL) goto bad_aux;
4869
4870
0
    b->l_data -= next - (s-2);
4871
0
    if (next >= end) { errno = ENOENT; return NULL; }
4872
4873
0
    memmove(s-2, next, end - next);
4874
0
    return s;
4875
4876
0
 bad_aux:
4877
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4878
0
                  bam_get_qname(b), b->core.flag);
4879
0
    errno = EINVAL;
4880
0
    return NULL;
4881
0
}
4882
4883
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4884
0
{
4885
    // FIXME: This is not at all efficient!
4886
0
    size_t ln = len >= 0 ? len : strlen(data) + 1;
4887
0
    size_t old_ln = 0;
4888
0
    int need_nul = ln == 0 || data[ln - 1] != '\0';
4889
0
    int save_errno = errno;
4890
0
    int new_tag = 0;
4891
0
    uint8_t *s = bam_aux_get(b,tag), *e;
4892
4893
0
    if (s) {  // Replacing existing tag
4894
0
        char type = *s;
4895
0
        if (type != 'Z') {
4896
0
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4897
0
            errno = EINVAL;
4898
0
            return -1;
4899
0
        }
4900
0
        s++;
4901
0
        e = memchr(s, '\0', b->data + b->l_data - s);
4902
0
        old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4903
0
        s -= 3;
4904
0
    } else {
4905
0
        if (errno != ENOENT) { // Invalid aux data, give up
4906
0
            return -1;
4907
0
        } else { // Tag doesn't exist - put it on the end
4908
0
            errno = save_errno;
4909
0
            s = b->data + b->l_data;
4910
0
            new_tag = 3;
4911
0
        }
4912
0
    }
4913
4914
0
    if (old_ln < ln + need_nul + new_tag) {
4915
0
        ptrdiff_t s_offset = s - b->data;
4916
0
        if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4917
0
            return -1;
4918
0
        s = b->data + s_offset;
4919
0
    }
4920
0
    if (!new_tag) {
4921
0
        memmove(s + 3 + ln + need_nul,
4922
0
                s + 3 + old_ln,
4923
0
                b->l_data - (s + 3 - b->data) - old_ln);
4924
0
    }
4925
0
    b->l_data += new_tag + ln + need_nul - old_ln;
4926
4927
0
    s[0] = tag[0];
4928
0
    s[1] = tag[1];
4929
0
    s[2] = 'Z';
4930
0
    memmove(s+3,data,ln);
4931
0
    if (need_nul) s[3 + ln] = '\0';
4932
0
    return 0;
4933
0
}
4934
4935
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4936
0
{
4937
0
    uint32_t sz, old_sz = 0, new = 0;
4938
0
    uint8_t *s, type;
4939
4940
0
    if (val < INT32_MIN || val > UINT32_MAX) {
4941
0
        errno = EOVERFLOW;
4942
0
        return -1;
4943
0
    }
4944
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4945
0
    else if (val < INT8_MIN)   { type = 's'; sz = 2; }
4946
0
    else if (val < 0)          { type = 'c'; sz = 1; }
4947
0
    else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
4948
0
    else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4949
0
    else                       { type = 'I'; sz = 4; }
4950
4951
0
    s = bam_aux_get(b, tag);
4952
0
    if (s) {  // Tag present - how big was the old one?
4953
0
        switch (*s) {
4954
0
            case 'c': case 'C': old_sz = 1; break;
4955
0
            case 's': case 'S': old_sz = 2; break;
4956
0
            case 'i': case 'I': old_sz = 4; break;
4957
0
            default: errno = EINVAL; return -1;  // Not an integer
4958
0
        }
4959
0
    } else {
4960
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4961
0
            s = b->data + b->l_data;
4962
0
            new = 1;
4963
0
        }  else { // Invalid aux data, give up.
4964
0
            return -1;
4965
0
        }
4966
0
    }
4967
4968
0
    if (new || old_sz < sz) {
4969
        // Make room for new tag
4970
0
        ptrdiff_t s_offset = s - b->data;
4971
0
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4972
0
            return -1;
4973
0
        s =  b->data + s_offset;
4974
0
        if (new) { // Add tag id
4975
0
            *s++ = tag[0];
4976
0
            *s++ = tag[1];
4977
0
        } else {   // Shift following data so we have space
4978
0
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4979
0
        }
4980
0
    } else {
4981
        // Reuse old space.  Data value may be bigger than necessary but
4982
        // we avoid having to move everything else
4983
0
        sz = old_sz;
4984
0
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4985
0
        assert(type > 0);
4986
0
    }
4987
0
    *s++ = type;
4988
0
#ifdef HTS_LITTLE_ENDIAN
4989
0
    memcpy(s, &val, sz);
4990
#else
4991
    switch (sz) {
4992
        case 4:  u32_to_le(val, s); break;
4993
        case 2:  u16_to_le(val, s); break;
4994
        default: *s = val; break;
4995
    }
4996
#endif
4997
0
    b->l_data += (new ? 3 : 0) + sz - old_sz;
4998
0
    return 0;
4999
0
}
5000
5001
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
5002
0
{
5003
0
    uint8_t *s = bam_aux_get(b, tag);
5004
0
    int shrink = 0, new = 0;
5005
5006
0
    if (s) { // Tag present - what was it?
5007
0
        switch (*s) {
5008
0
            case 'f': break;
5009
0
            case 'd': shrink = 1; break;
5010
0
            default: errno = EINVAL; return -1;  // Not a float
5011
0
        }
5012
0
    } else {
5013
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5014
0
            new = 1;
5015
0
        }  else { // Invalid aux data, give up.
5016
0
            return -1;
5017
0
        }
5018
0
    }
5019
5020
0
    if (new) { // Ensure there's room
5021
0
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
5022
0
            return -1;
5023
0
        s = b->data + b->l_data;
5024
0
        *s++ = tag[0];
5025
0
        *s++ = tag[1];
5026
0
    } else if (shrink) { // Convert non-standard double tag to float
5027
0
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
5028
0
        b->l_data -= 4;
5029
0
    }
5030
0
    *s++ = 'f';
5031
0
    float_to_le(val, s);
5032
0
    if (new) b->l_data += 7;
5033
5034
0
    return 0;
5035
0
}
5036
5037
int bam_aux_update_array(bam1_t *b, const char tag[2],
5038
                         uint8_t type, uint32_t items, void *data)
5039
0
{
5040
0
    uint8_t *s = bam_aux_get(b, tag);
5041
0
    size_t old_sz = 0, new_sz;
5042
0
    int new = 0;
5043
5044
0
    if (s) { // Tag present
5045
0
        if (*s != 'B') { errno = EINVAL; return -1; }
5046
0
        old_sz = aux_type2size(s[1]);
5047
0
        if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
5048
0
        old_sz *= le_to_u32(s + 2);
5049
0
    } else {
5050
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5051
0
            s = b->data + b->l_data;
5052
0
            new = 1;
5053
0
        }  else { // Invalid aux data, give up.
5054
0
            return -1;
5055
0
        }
5056
0
    }
5057
5058
0
    new_sz = aux_type2size(type);
5059
0
    if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
5060
0
    if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
5061
0
    new_sz *= items;
5062
5063
0
    if (new || old_sz < new_sz) {
5064
        // Make room for new tag
5065
0
        ptrdiff_t s_offset = s - b->data;
5066
0
        if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
5067
0
            return -1;
5068
0
        s =  b->data + s_offset;
5069
0
    }
5070
0
    if (new) { // Add tag id and type
5071
0
        *s++ = tag[0];
5072
0
        *s++ = tag[1];
5073
0
        *s = 'B';
5074
0
        b->l_data += 8 + new_sz;
5075
0
    } else if (old_sz != new_sz) { // shift following data if necessary
5076
0
        memmove(s + 6 + new_sz, s + 6 + old_sz,
5077
0
                b->l_data - ((s + 6 + old_sz) - b->data));
5078
0
        b->l_data -= old_sz;
5079
0
        b->l_data += new_sz;
5080
0
    }
5081
5082
0
    s[1] = type;
5083
0
    u32_to_le(items, s + 2);
5084
0
    if (new_sz > 0) {
5085
0
#ifdef HTS_LITTLE_ENDIAN
5086
0
        memcpy(s + 6, data, new_sz);
5087
#else
5088
        return aux_to_le(type, s + 6, data, new_sz);
5089
#endif
5090
0
    }
5091
0
    return 0;
5092
0
}
5093
5094
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
5095
                                      uint32_t idx)
5096
0
{
5097
0
    switch (type) {
5098
0
        case 'c': return le_to_i8(s + idx);
5099
0
        case 'C': return s[idx];
5100
0
        case 's': return le_to_i16(s + 2 * idx);
5101
0
        case 'S': return le_to_u16(s + 2 * idx);
5102
0
        case 'i': return le_to_i32(s + 4 * idx);
5103
0
        case 'I': return le_to_u32(s + 4 * idx);
5104
0
        default:
5105
0
            errno = EINVAL;
5106
0
            return 0;
5107
0
    }
5108
0
}
5109
5110
int64_t bam_aux2i(const uint8_t *s)
5111
0
{
5112
0
    int type;
5113
0
    type = *s++;
5114
0
    return get_int_aux_val(type, s, 0);
5115
0
}
5116
5117
double bam_aux2f(const uint8_t *s)
5118
0
{
5119
0
    int type;
5120
0
    type = *s++;
5121
0
    if (type == 'd') return le_to_double(s);
5122
0
    else if (type == 'f') return le_to_float(s);
5123
0
    else return get_int_aux_val(type, s, 0);
5124
0
}
5125
5126
char bam_aux2A(const uint8_t *s)
5127
0
{
5128
0
    int type;
5129
0
    type = *s++;
5130
0
    if (type == 'A') return *(char*)s;
5131
0
    errno = EINVAL;
5132
0
    return 0;
5133
0
}
5134
5135
char *bam_aux2Z(const uint8_t *s)
5136
0
{
5137
0
    int type;
5138
0
    type = *s++;
5139
0
    if (type == 'Z' || type == 'H') return (char*)s;
5140
0
    errno = EINVAL;
5141
0
    return 0;
5142
0
}
5143
5144
uint32_t bam_auxB_len(const uint8_t *s)
5145
0
{
5146
0
    if (s[0] != 'B') {
5147
0
        errno = EINVAL;
5148
0
        return 0;
5149
0
    }
5150
0
    return le_to_u32(s + 2);
5151
0
}
5152
5153
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
5154
0
{
5155
0
    uint32_t len = bam_auxB_len(s);
5156
0
    if (idx >= len) {
5157
0
        errno = ERANGE;
5158
0
        return 0;
5159
0
    }
5160
0
    return get_int_aux_val(s[1], s + 6, idx);
5161
0
}
5162
5163
double bam_auxB2f(const uint8_t *s, uint32_t idx)
5164
0
{
5165
0
    uint32_t len = bam_auxB_len(s);
5166
0
    if (idx >= len) {
5167
0
        errno = ERANGE;
5168
0
        return 0.0;
5169
0
    }
5170
0
    if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
5171
0
    else return get_int_aux_val(s[1], s + 6, idx);
5172
0
}
5173
5174
int sam_open_mode(char *mode, const char *fn, const char *format)
5175
0
{
5176
    // TODO Parse "bam5" etc for compression level
5177
0
    if (format == NULL) {
5178
        // Try to pick a format based on the filename extension
5179
0
        char extension[HTS_MAX_EXT_LEN];
5180
0
        if (find_file_extension(fn, extension) < 0) return -1;
5181
0
        return sam_open_mode(mode, fn, extension);
5182
0
    }
5183
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5184
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5185
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5186
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5187
0
    else if (strcasecmp(format, "fastq") == 0 ||
5188
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5189
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5190
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5191
0
    else if (strcasecmp(format, "fasta") == 0 ||
5192
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5193
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5194
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5195
0
    else return -1;
5196
5197
0
    return 0;
5198
0
}
5199
5200
// A version of sam_open_mode that can handle ,key=value options.
5201
// The format string is allocated and returned, to be freed by the caller.
5202
// Prefix should be "r" or "w",
5203
char *sam_open_mode_opts(const char *fn,
5204
                         const char *mode,
5205
                         const char *format)
5206
0
{
5207
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5208
0
                             (mode   ? strlen(mode)   : 1) + 12);
5209
0
    char *opts, *cp;
5210
0
    int format_len;
5211
5212
0
    if (!mode_opts)
5213
0
        return NULL;
5214
5215
0
    strcpy(mode_opts, mode ? mode : "r");
5216
0
    cp = mode_opts + strlen(mode_opts);
5217
5218
0
    if (format == NULL) {
5219
        // Try to pick a format based on the filename extension
5220
0
        char extension[HTS_MAX_EXT_LEN];
5221
0
        if (find_file_extension(fn, extension) < 0) {
5222
0
            free(mode_opts);
5223
0
            return NULL;
5224
0
        }
5225
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5226
0
            return mode_opts;
5227
0
        } else {
5228
0
            free(mode_opts);
5229
0
            return NULL;
5230
0
        }
5231
0
    }
5232
5233
0
    if ((opts = strchr(format, ','))) {
5234
0
        format_len = opts-format;
5235
0
    } else {
5236
0
        opts="";
5237
0
        format_len = strlen(format);
5238
0
    }
5239
5240
0
    if (strncmp(format, "bam", format_len) == 0) {
5241
0
        *cp++ = 'b';
5242
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5243
0
        *cp++ = 'c';
5244
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5245
0
        *cp++ = 'c';
5246
0
        strcpy(cp, ",VERSION=2.1");
5247
0
        cp += 12;
5248
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5249
0
        *cp++ = 'c';
5250
0
        strcpy(cp, ",VERSION=3.0");
5251
0
        cp += 12;
5252
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5253
0
        ; // format mode=""
5254
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5255
0
        *cp++ = 'z';
5256
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5257
0
               strncmp(format, "fq", format_len) == 0) {
5258
0
        *cp++ = 'f';
5259
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5260
0
               strncmp(format, "fq.gz", format_len) == 0) {
5261
0
        *cp++ = 'f';
5262
0
        *cp++ = 'z';
5263
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5264
0
               strncmp(format, "fa", format_len) == 0) {
5265
0
        *cp++ = 'F';
5266
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5267
0
               strncmp(format, "fa", format_len) == 0) {
5268
0
        *cp++ = 'F';
5269
0
        *cp++ = 'z';
5270
0
    } else {
5271
0
        free(mode_opts);
5272
0
        return NULL;
5273
0
    }
5274
5275
0
    strcpy(cp, opts);
5276
5277
0
    return mode_opts;
5278
0
}
5279
5280
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5281
int bam_str2flag(const char *str)
5282
0
{
5283
0
    char *end, *beg = (char*) str;
5284
0
    long int flag = strtol(str, &end, 0);
5285
0
    if ( end!=str ) return flag;    // the conversion was successful
5286
0
    flag = 0;
5287
0
    while ( *str )
5288
0
    {
5289
0
        end = beg;
5290
0
        while ( *end && *end!=',' ) end++;
5291
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5292
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5293
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5294
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5295
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5296
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5297
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5298
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5299
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5300
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5301
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5302
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5303
0
        else return -1;
5304
0
        if ( !*end ) break;
5305
0
        beg = end + 1;
5306
0
    }
5307
0
    return flag;
5308
0
}
5309
5310
char *bam_flag2str(int flag)
5311
0
{
5312
0
    kstring_t str = {0,0,0};
5313
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5314
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5315
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5316
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5317
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5318
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5319
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5320
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5321
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5322
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5323
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5324
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5325
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5326
0
    return str.s;
5327
0
}
5328
5329
5330
/**************************
5331
 *** Pileup and Mpileup ***
5332
 **************************/
5333
5334
#if !defined(BAM_NO_PILEUP)
5335
5336
#include <assert.h>
5337
5338
/*******************
5339
 *** Memory pool ***
5340
 *******************/
5341
5342
typedef struct {
5343
    int k, y;
5344
    hts_pos_t x, end;
5345
} cstate_t;
5346
5347
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5348
5349
typedef struct __linkbuf_t {
5350
    bam1_t b;
5351
    hts_pos_t beg, end;
5352
    cstate_t s;
5353
    struct __linkbuf_t *next;
5354
    bam_pileup_cd cd;
5355
} lbnode_t;
5356
5357
typedef struct {
5358
    int cnt, n, max;
5359
    lbnode_t **buf;
5360
} mempool_t;
5361
5362
static mempool_t *mp_init(void)
5363
0
{
5364
0
    mempool_t *mp;
5365
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5366
0
    return mp;
5367
0
}
5368
static void mp_destroy(mempool_t *mp)
5369
0
{
5370
0
    int k;
5371
0
    for (k = 0; k < mp->n; ++k) {
5372
0
        free(mp->buf[k]->b.data);
5373
0
        free(mp->buf[k]);
5374
0
    }
5375
0
    free(mp->buf);
5376
0
    free(mp);
5377
0
}
5378
static inline lbnode_t *mp_alloc(mempool_t *mp)
5379
0
{
5380
0
    ++mp->cnt;
5381
0
    if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5382
0
    else return mp->buf[--mp->n];
5383
0
}
5384
static inline void mp_free(mempool_t *mp, lbnode_t *p)
5385
0
{
5386
0
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
5387
0
    if (mp->n == mp->max) {
5388
0
        mp->max = mp->max? mp->max<<1 : 256;
5389
0
        mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5390
0
    }
5391
0
    mp->buf[mp->n++] = p;
5392
0
}
5393
5394
/**********************
5395
 *** CIGAR resolver ***
5396
 **********************/
5397
5398
/* s->k: the index of the CIGAR operator that has just been processed.
5399
   s->x: the reference coordinate of the start of s->k
5400
   s->y: the query coordinate of the start of s->k
5401
 */
5402
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5403
0
{
5404
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5405
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5406
5407
0
    bam1_t *b = p->b;
5408
0
    bam1_core_t *c = &b->core;
5409
0
    uint32_t *cigar = bam_get_cigar(b);
5410
0
    int k;
5411
    // determine the current CIGAR operation
5412
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5413
0
    if (s->k == -1) { // never processed
5414
0
        p->qpos = 0;
5415
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5416
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5417
0
        } else { // find the first match or deletion
5418
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5419
0
                int op = _cop(cigar[k]);
5420
0
                int l = _cln(cigar[k]);
5421
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5422
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5423
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5424
0
            }
5425
0
            assert(k < c->n_cigar);
5426
0
            s->k = k;
5427
0
        }
5428
0
    } else { // the read has been processed before
5429
0
        int op, l = _cln(cigar[s->k]);
5430
0
        if (pos - s->x >= l) { // jump to the next operation
5431
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5432
0
            op = _cop(cigar[s->k+1]);
5433
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5434
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5435
0
                s->x += l;
5436
0
                ++s->k;
5437
0
            } else { // find the next M/D/N/=/X
5438
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5439
0
                s->x += l;
5440
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5441
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5442
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5443
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5444
0
                }
5445
0
                s->k = k;
5446
0
            }
5447
0
            assert(s->k < c->n_cigar); // otherwise a bug
5448
0
        } // else, do nothing
5449
0
    }
5450
0
    { // collect pileup information
5451
0
        int op, l;
5452
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5453
0
        p->is_del = p->indel = p->is_refskip = 0;
5454
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5455
0
            int op2 = _cop(cigar[s->k+1]);
5456
0
            int l2 = _cln(cigar[s->k+1]);
5457
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5458
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5459
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5460
                // and rely on is_del=1 as we would for 3D.
5461
0
                p->indel = -(int)l2;
5462
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5463
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5464
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5465
0
                    else break;
5466
0
                }
5467
0
            } else if (op2 == BAM_CINS) {
5468
0
                p->indel = l2;
5469
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5470
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5471
0
                    if (op2 == BAM_CINS) p->indel += l2;
5472
0
                    else if (op2 != BAM_CPAD) break;
5473
0
                }
5474
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5475
0
                int l3 = 0;
5476
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5477
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5478
0
                    if (op2 == BAM_CINS) l3 += l2;
5479
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5480
0
                }
5481
0
                if (l3 > 0) p->indel = l3;
5482
0
            }
5483
0
        }
5484
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5485
0
            p->qpos = s->y + (pos - s->x);
5486
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5487
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5488
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5489
0
        } // cannot be other operations; otherwise a bug
5490
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5491
0
    }
5492
0
    p->cigar_ind = s->k;
5493
0
    return 1;
5494
0
}
5495
5496
/*******************************
5497
 *** Expansion of insertions ***
5498
 *******************************/
5499
5500
/*
5501
 * Fills out the kstring with the padded insertion sequence for the current
5502
 * location in 'p'.  If this is not an insertion site, the string is blank.
5503
 *
5504
 * This variant handles base modifications, but only when "m" is non-NULL.
5505
 *
5506
 * Returns the number of inserted base on success, with string length being
5507
 *        accessable via ins->l;
5508
 *        -1 on failure.
5509
 */
5510
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5511
                          hts_base_mod_state *m,
5512
0
                          kstring_t *ins, int *del_len) {
5513
0
    int j, k, indel, nb = 0;
5514
0
    uint32_t *cigar;
5515
5516
0
    if (p->indel <= 0) {
5517
0
        if (ks_resize(ins, 1) < 0)
5518
0
            return -1;
5519
0
        ins->l = 0;
5520
0
        ins->s[0] = '\0';
5521
0
        return 0;
5522
0
    }
5523
5524
0
    if (del_len)
5525
0
        *del_len = 0;
5526
5527
    // Measure indel length including pads
5528
0
    indel = 0;
5529
0
    k = p->cigar_ind+1;
5530
0
    cigar = bam_get_cigar(p->b);
5531
0
    while (k < p->b->core.n_cigar) {
5532
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5533
0
        case BAM_CPAD:
5534
0
        case BAM_CINS:
5535
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5536
0
            break;
5537
0
        default:
5538
0
            k = p->b->core.n_cigar;
5539
0
            break;
5540
0
        }
5541
0
        k++;
5542
0
    }
5543
0
    nb = ins->l = indel;
5544
5545
    // Produce sequence
5546
0
    if (ks_resize(ins, indel+1) < 0)
5547
0
        return -1;
5548
0
    indel = 0;
5549
0
    k = p->cigar_ind+1;
5550
0
    j = 1;
5551
0
    while (k < p->b->core.n_cigar) {
5552
0
        int l, c;
5553
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5554
0
        case BAM_CPAD:
5555
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5556
0
                ins->s[indel++] = '*';
5557
0
            break;
5558
0
        case BAM_CINS:
5559
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5560
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5561
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5562
0
                                            p->qpos + j - p->is_del)]
5563
0
                    : 'N';
5564
0
                ins->s[indel++] = c;
5565
0
                int nm;
5566
0
                hts_base_mod mod[256];
5567
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5568
0
                                                m, mod, 256)) > 0) {
5569
0
                    int o_indel = indel;
5570
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5571
0
                        return -1;
5572
0
                    ins->s[indel++] = '[';
5573
0
                    int j;
5574
0
                    for (j = 0; j < nm; j++) {
5575
0
                        char qual[20];
5576
0
                        if (mod[j].qual >= 0)
5577
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5578
0
                        else
5579
0
                            *qual=0;
5580
0
                        if (mod[j].modified_base < 0)
5581
                            // ChEBI
5582
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5583
0
                                              "%c(%d)%s",
5584
0
                                              "+-"[mod[j].strand],
5585
0
                                              -mod[j].modified_base,
5586
0
                                              qual);
5587
0
                        else
5588
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5589
0
                                              "%c%c%s",
5590
0
                                              "+-"[mod[j].strand],
5591
0
                                              mod[j].modified_base,
5592
0
                                              qual);
5593
0
                    }
5594
0
                    ins->s[indel++] = ']';
5595
0
                    ins->l += indel - o_indel; // grow by amount we used
5596
0
                }
5597
0
            }
5598
0
            break;
5599
0
        case BAM_CDEL:
5600
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5601
0
            if (del_len)
5602
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5603
            // fall through
5604
0
        default:
5605
0
            k = p->b->core.n_cigar;
5606
0
            break;
5607
0
        }
5608
0
        k++;
5609
0
    }
5610
0
    ins->s[indel] = '\0';
5611
0
    ins->l = indel; // string length
5612
5613
0
    return nb;      // base length
5614
0
}
5615
5616
/*
5617
 * Fills out the kstring with the padded insertion sequence for the current
5618
 * location in 'p'.  If this is not an insertion site, the string is blank.
5619
 *
5620
 * This is the original interface with no capability for reporting base
5621
 * modifications.
5622
 *
5623
 * Returns the length of insertion string on success;
5624
 *        -1 on failure.
5625
 */
5626
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5627
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5628
0
}
5629
5630
/***********************
5631
 *** Pileup iterator ***
5632
 ***********************/
5633
5634
// Dictionary of overlapping reads
5635
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5636
typedef khash_t(olap_hash) olap_hash_t;
5637
5638
struct bam_plp_s {
5639
    mempool_t *mp;
5640
    lbnode_t *head, *tail;
5641
    int32_t tid, max_tid;
5642
    hts_pos_t pos, max_pos;
5643
    int is_eof, max_plp, error, maxcnt;
5644
    uint64_t id;
5645
    bam_pileup1_t *plp;
5646
    // for the "auto" interface only
5647
    bam1_t *b;
5648
    bam_plp_auto_f func;
5649
    void *data;
5650
    olap_hash_t *overlaps;
5651
5652
    // For notification of creation and destruction events
5653
    // and associated client-owned pointer.
5654
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5655
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5656
};
5657
5658
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5659
0
{
5660
0
    bam_plp_t iter;
5661
0
    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5662
0
    iter->mp = mp_init();
5663
0
    iter->head = iter->tail = mp_alloc(iter->mp);
5664
0
    iter->max_tid = iter->max_pos = -1;
5665
0
    iter->maxcnt = 8000;
5666
0
    if (func) {
5667
0
        iter->func = func;
5668
0
        iter->data = data;
5669
0
        iter->b = bam_init1();
5670
0
    }
5671
0
    return iter;
5672
0
}
5673
5674
int bam_plp_init_overlaps(bam_plp_t iter)
5675
0
{
5676
0
    iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5677
0
    return iter->overlaps ? 0 : -1;
5678
0
}
5679
5680
void bam_plp_destroy(bam_plp_t iter)
5681
0
{
5682
0
    lbnode_t *p, *pnext;
5683
0
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5684
0
    for (p = iter->head; p != NULL; p = pnext) {
5685
0
        if (iter->plp_destruct && p != iter->tail)
5686
0
            iter->plp_destruct(iter->data, &p->b, &p->cd);
5687
0
        pnext = p->next;
5688
0
        mp_free(iter->mp, p);
5689
0
    }
5690
0
    mp_destroy(iter->mp);
5691
0
    if (iter->b) bam_destroy1(iter->b);
5692
0
    free(iter->plp);
5693
0
    free(iter);
5694
0
}
5695
5696
void bam_plp_constructor(bam_plp_t plp,
5697
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5698
0
    plp->plp_construct = func;
5699
0
}
5700
5701
void bam_plp_destructor(bam_plp_t plp,
5702
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5703
0
    plp->plp_destruct = func;
5704
0
}
5705
5706
//---------------------------------
5707
//---  Tweak overlapping reads
5708
//---------------------------------
5709
5710
/**
5711
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5712
 *  cigar_iref2iseq_next() - get the next CMATCH base
5713
 *  @cigar:       pointer to current cigar block (rw)
5714
 *  @cigar_max:   pointer just beyond the last cigar block
5715
 *  @icig:        position within the current cigar block (rw)
5716
 *  @iseq:        position in the sequence (rw)
5717
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5718
 *
5719
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5720
 *  or -2 on error.
5721
 */
5722
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5723
                                      const uint32_t *cigar_max,
5724
                                      hts_pos_t *icig,
5725
                                      hts_pos_t *iseq,
5726
                                      hts_pos_t *iref)
5727
0
{
5728
0
    hts_pos_t pos = *iref;
5729
0
    if ( pos < 0 ) return -1;
5730
0
    *icig = 0;
5731
0
    *iseq = 0;
5732
0
    *iref = 0;
5733
0
    while ( *cigar<cigar_max )
5734
0
    {
5735
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5736
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5737
5738
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5739
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5740
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5741
0
        {
5742
0
            pos -= ncig;
5743
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5744
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5745
0
            continue;
5746
0
        }
5747
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5748
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5749
0
        {
5750
0
            pos -= ncig;
5751
0
            if ( pos<0 ) pos = 0;
5752
0
            (*cigar)++; *icig = 0; *iref += ncig;
5753
0
            continue;
5754
0
        }
5755
0
        hts_log_error("Unexpected cigar %d", cig);
5756
0
        return -2;
5757
0
    }
5758
0
    *iseq = -1;
5759
0
    return -1;
5760
0
}
5761
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5762
                                       const uint32_t *cigar_max,
5763
                                       hts_pos_t *icig,
5764
                                       hts_pos_t *iseq,
5765
                                       hts_pos_t *iref)
5766
0
{
5767
0
    while ( *cigar < cigar_max )
5768
0
    {
5769
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5770
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5771
5772
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5773
0
        {
5774
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5775
0
            (*iseq)++; (*icig)++; (*iref)++;
5776
0
            return BAM_CMATCH;
5777
0
        }
5778
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5779
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5780
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5781
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5782
0
        hts_log_error("Unexpected cigar %d", cig);
5783
0
        return -2;
5784
0
    }
5785
0
    *iseq = -1;
5786
0
    *iref = -1;
5787
0
    return -1;
5788
0
}
5789
5790
// Given overlapping read 'a' (left) and 'b' (right) on the same
5791
// template, adjust quality values to zero for either a or b.
5792
// Note versions 1.12 and earlier always removed quality from 'b' for
5793
// matching bases.  Now we select a or b semi-randomly based on name hash.
5794
// Returns 0 on success,
5795
//        -1 on failure
5796
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5797
0
{
5798
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5799
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5800
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5801
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5802
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5803
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5804
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5805
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5806
5807
0
    hts_pos_t iref   = b->core.pos;
5808
0
    hts_pos_t a_iref = iref - a->core.pos;
5809
0
    hts_pos_t b_iref = iref - b->core.pos;
5810
5811
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5812
0
                                    &a_icig, &a_iseq, &a_iref);
5813
0
    if ( a_ret<0 )
5814
        // no overlap or error
5815
0
        return a_ret<-1 ? -1:0;
5816
5817
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5818
0
                                    &b_icig, &b_iseq, &b_iref);
5819
0
    if ( b_ret<0 )
5820
        // no overlap or error
5821
0
        return b_ret<-1 ? -1:0;
5822
5823
    // Determine which seq is the one getting modified qualities.
5824
0
    uint8_t amul, bmul;
5825
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5826
0
        amul = 1;
5827
0
        bmul = 0;
5828
0
    } else {
5829
0
        amul = 0;
5830
0
        bmul = 1;
5831
0
    }
5832
5833
    // Loop over the overlapping region nulling qualities in either
5834
    // seq a or b.
5835
0
    int err = 0;
5836
0
    while ( 1 ) {
5837
        // Step to next matching reference position in a and b
5838
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5839
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5840
0
                                         &a_icig, &a_iseq, &a_iref);
5841
0
        if ( a_ret<0 ) { // done
5842
0
            err = a_ret<-1?-1:0;
5843
0
            break;
5844
0
        }
5845
5846
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5847
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5848
0
                                         &b_iseq, &b_iref);
5849
0
        if ( b_ret<0 ) { // done
5850
0
            err = b_ret<-1?-1:0;
5851
0
            break;
5852
0
        }
5853
5854
0
        if ( iref < a_iref + a->core.pos )
5855
0
            iref = a_iref + a->core.pos;
5856
5857
0
        if ( iref < b_iref + b->core.pos )
5858
0
            iref = b_iref + b->core.pos;
5859
5860
0
        iref++;
5861
5862
        // If A or B has a deletion then we catch up the other to this point.
5863
        // We also amend quality values using the same rules for mismatch.
5864
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5865
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5866
0
                && b_cigar > bam_get_cigar(b)
5867
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5868
                // Del in B means it's moved on further than A
5869
0
                do {
5870
0
                    a_qual[a_iseq] = amul
5871
0
                        ? a_qual[a_iseq]*0.8
5872
0
                        : 0;
5873
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5874
0
                                                 &a_icig, &a_iseq, &a_iref);
5875
0
                    if (a_ret < 0)
5876
0
                        return -(a_ret<-1); // 0 or -1
5877
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5878
0
            } else if (a_cigar > bam_get_cigar(a)
5879
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5880
                // Del in A means it's moved on further than B
5881
0
                do {
5882
0
                    b_qual[b_iseq] = bmul
5883
0
                        ? b_qual[b_iseq]*0.8
5884
0
                        : 0;
5885
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5886
0
                                                 &b_icig, &b_iseq, &b_iref);
5887
0
                    if (b_ret < 0)
5888
0
                        return -(b_ret<-1); // 0 or -1
5889
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5890
0
            } else {
5891
                // Anything else, eg ref-skip, we don't support here
5892
0
                continue;
5893
0
            }
5894
0
        }
5895
5896
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5897
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5898
        //         a_cigar-bam_get_cigar(a), a_icig,
5899
        //         b_cigar-bam_get_cigar(b), b_icig,
5900
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5901
        //         a_iseq, b_iseq);
5902
5903
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5904
            // Fell off end of sequence, bad CIGAR?
5905
0
            return -1;
5906
5907
        // We're finally at the same ref base in both a and b.
5908
        // Check if the bases match (confident) or mismatch
5909
        // (not so confident).
5910
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5911
            // We are very confident about this base.  Use sum of quals
5912
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5913
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5914
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5915
0
        } else {
5916
            // Not so confident about anymore given the mismatch.
5917
            // Reduce qual for lowest quality base.
5918
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5919
                // A highest qual base; keep
5920
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5921
0
                b_qual[b_iseq] = 0;
5922
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5923
                // B highest qual base; keep
5924
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5925
0
                a_qual[a_iseq] = 0;
5926
0
            } else {
5927
                // Both equal, so pick randomly
5928
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5929
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5930
0
            }
5931
0
        }
5932
0
    }
5933
5934
0
    return err;
5935
0
}
5936
5937
// Fix overlapping reads. Simple soft-clipping did not give good results.
5938
// Lowering qualities of unwanted bases is more selective and works better.
5939
//
5940
// Returns 0 on success, -1 on failure
5941
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5942
0
{
5943
0
    if ( !iter->overlaps ) return 0;
5944
5945
    // mapped mates and paired reads only
5946
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5947
5948
    // no overlap possible, unless some wild cigar
5949
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5950
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5951
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5952
0
       ) return 0;
5953
5954
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5955
0
    if ( kitr==kh_end(iter->overlaps) )
5956
0
    {
5957
        // Only add reads where the mate is still to arrive
5958
0
        if (node->b.core.mpos >= node->b.core.pos ||
5959
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5960
0
            int ret;
5961
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5962
0
            if (ret < 0) return -1;
5963
0
            kh_value(iter->overlaps, kitr) = node;
5964
0
        }
5965
0
    }
5966
0
    else
5967
0
    {
5968
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5969
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5970
0
        kh_del(olap_hash, iter->overlaps, kitr);
5971
0
        assert(a->end-1 == a->s.end);
5972
0
        return err;
5973
0
    }
5974
0
    return 0;
5975
0
}
5976
5977
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5978
0
{
5979
0
    if ( !iter->overlaps ) return;
5980
5981
0
    khiter_t kitr;
5982
0
    if ( b )
5983
0
    {
5984
0
        kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5985
0
        if ( kitr!=kh_end(iter->overlaps) )
5986
0
            kh_del(olap_hash, iter->overlaps, kitr);
5987
0
    }
5988
0
    else
5989
0
    {
5990
        // remove all
5991
0
        for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
5992
0
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
5993
0
    }
5994
0
}
5995
5996
5997
5998
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5999
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
6000
// buffer yet (the current position is still the maximum position across all buffered reads).
6001
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6002
0
{
6003
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6004
0
    *_n_plp = 0;
6005
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6006
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6007
0
        int n_plp = 0;
6008
        // write iter->plp at iter->pos
6009
0
        lbnode_t **pptr = &iter->head;
6010
0
        while (*pptr != iter->tail) {
6011
0
            lbnode_t *p = *pptr;
6012
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6013
0
                overlap_remove(iter, &p->b);
6014
0
                if (iter->plp_destruct)
6015
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6016
0
                *pptr = p->next; mp_free(iter->mp, p);
6017
0
            }
6018
0
            else {
6019
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6020
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6021
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6022
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6023
0
                    }
6024
0
                    iter->plp[n_plp].b = &p->b;
6025
0
                    iter->plp[n_plp].cd = p->cd;
6026
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6027
0
                }
6028
0
                pptr = &(*pptr)->next;
6029
0
            }
6030
0
        }
6031
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6032
        // update iter->tid and iter->pos
6033
0
        if (iter->head != iter->tail) {
6034
0
            if (iter->tid > iter->head->b.core.tid) {
6035
0
                hts_log_error("Unsorted input. Pileup aborts");
6036
0
                iter->error = 1;
6037
0
                *_n_plp = -1;
6038
0
                return NULL;
6039
0
            }
6040
0
        }
6041
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6042
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6043
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6044
0
            iter->pos = iter->head->beg; // jump to the next position
6045
0
        } else ++iter->pos; // scan contiguously
6046
        // return
6047
0
        if (n_plp) return iter->plp;
6048
0
        if (iter->is_eof && iter->head == iter->tail) break;
6049
0
    }
6050
0
    return NULL;
6051
0
}
6052
6053
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6054
0
{
6055
0
    hts_pos_t pos64 = 0;
6056
0
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
6057
0
    if (pos64 < INT_MAX) {
6058
0
        *_pos = pos64;
6059
0
    } else {
6060
0
        hts_log_error("Position %"PRId64" too large", pos64);
6061
0
        *_pos = INT_MAX;
6062
0
        iter->error = 1;
6063
0
        *_n_plp = -1;
6064
0
        return NULL;
6065
0
    }
6066
0
    return p;
6067
0
}
6068
6069
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
6070
0
{
6071
0
    if (iter->error) return -1;
6072
0
    if (b) {
6073
0
        if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
6074
        // Skip only unmapped reads here, any additional filtering must be done in iter->func
6075
0
        if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
6076
0
        if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
6077
0
        {
6078
0
            overlap_remove(iter, b);
6079
0
            return 0;
6080
0
        }
6081
0
        if (bam_copy1(&iter->tail->b, b) == NULL)
6082
0
            return -1;
6083
0
        iter->tail->b.id = iter->id++;
6084
0
        iter->tail->beg = b->core.pos;
6085
        // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
6086
0
        iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
6087
0
        iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
6088
0
        if (b->core.tid < iter->max_tid) {
6089
0
            hts_log_error("The input is not sorted (chromosomes out of order)");
6090
0
            iter->error = 1;
6091
0
            return -1;
6092
0
        }
6093
0
        if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
6094
0
            hts_log_error("The input is not sorted (reads out of order)");
6095
0
            iter->error = 1;
6096
0
            return -1;
6097
0
        }
6098
0
        iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
6099
0
        if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
6100
0
            lbnode_t *next = mp_alloc(iter->mp);
6101
0
            if (!next) {
6102
0
                iter->error = 1;
6103
0
                return -1;
6104
0
            }
6105
0
            if (iter->plp_construct) {
6106
0
                if (iter->plp_construct(iter->data, &iter->tail->b,
6107
0
                                        &iter->tail->cd) < 0) {
6108
0
                    mp_free(iter->mp, next);
6109
0
                    iter->error = 1;
6110
0
                    return -1;
6111
0
                }
6112
0
            }
6113
0
            if (overlap_push(iter, iter->tail) < 0) {
6114
0
                mp_free(iter->mp, next);
6115
0
                iter->error = 1;
6116
0
                return -1;
6117
0
            }
6118
0
            iter->tail->next = next;
6119
0
            iter->tail = iter->tail->next;
6120
0
        }
6121
0
    } else iter->is_eof = 1;
6122
0
    return 0;
6123
0
}
6124
6125
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6126
0
{
6127
0
    const bam_pileup1_t *plp;
6128
0
    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
6129
0
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6130
0
    else { // no pileup line can be obtained; read alignments
6131
0
        *_n_plp = 0;
6132
0
        if (iter->is_eof) return 0;
6133
0
        int ret;
6134
0
        while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
6135
0
            if (bam_plp_push(iter, iter->b) < 0) {
6136
0
                *_n_plp = -1;
6137
0
                return 0;
6138
0
            }
6139
0
            if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6140
            // otherwise no pileup line can be returned; read the next alignment.
6141
0
        }
6142
0
        if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
6143
0
        if (bam_plp_push(iter, 0) < 0) {
6144
0
            *_n_plp = -1;
6145
0
            return 0;
6146
0
        }
6147
0
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6148
0
        return 0;
6149
0
    }
6150
0
}
6151
6152
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6153
0
{
6154
0
    hts_pos_t pos64 = 0;
6155
0
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
6156
0
    if (pos64 < INT_MAX) {
6157
0
        *_pos = pos64;
6158
0
    } else {
6159
0
        hts_log_error("Position %"PRId64" too large", pos64);
6160
0
        *_pos = INT_MAX;
6161
0
        iter->error = 1;
6162
0
        *_n_plp = -1;
6163
0
        return NULL;
6164
0
    }
6165
0
    return p;
6166
0
}
6167
6168
void bam_plp_reset(bam_plp_t iter)
6169
0
{
6170
0
    overlap_remove(iter, NULL);
6171
0
    iter->max_tid = iter->max_pos = -1;
6172
0
    iter->tid = iter->pos = 0;
6173
0
    iter->is_eof = 0;
6174
0
    while (iter->head != iter->tail) {
6175
0
        lbnode_t *p = iter->head;
6176
0
        iter->head = p->next;
6177
0
        mp_free(iter->mp, p);
6178
0
    }
6179
0
}
6180
6181
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
6182
0
{
6183
0
    iter->maxcnt = maxcnt;
6184
0
}
6185
6186
/************************
6187
 *** Mpileup iterator ***
6188
 ************************/
6189
6190
struct bam_mplp_s {
6191
    int n;
6192
    int32_t min_tid, *tid;
6193
    hts_pos_t min_pos, *pos;
6194
    bam_plp_t *iter;
6195
    int *n_plp;
6196
    const bam_pileup1_t **plp;
6197
};
6198
6199
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
6200
0
{
6201
0
    int i;
6202
0
    bam_mplp_t iter;
6203
0
    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
6204
0
    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
6205
0
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
6206
0
    iter->n_plp = (int*)calloc(n, sizeof(int));
6207
0
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
6208
0
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
6209
0
    iter->n = n;
6210
0
    iter->min_pos = HTS_POS_MAX;
6211
0
    iter->min_tid = (uint32_t)-1;
6212
0
    for (i = 0; i < n; ++i) {
6213
0
        iter->iter[i] = bam_plp_init(func, data[i]);
6214
0
        iter->pos[i] = iter->min_pos;
6215
0
        iter->tid[i] = iter->min_tid;
6216
0
    }
6217
0
    return iter;
6218
0
}
6219
6220
int bam_mplp_init_overlaps(bam_mplp_t iter)
6221
0
{
6222
0
    int i, r = 0;
6223
0
    for (i = 0; i < iter->n; ++i)
6224
0
        r |= bam_plp_init_overlaps(iter->iter[i]);
6225
0
    return r == 0 ? 0 : -1;
6226
0
}
6227
6228
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
6229
0
{
6230
0
    int i;
6231
0
    for (i = 0; i < iter->n; ++i)
6232
0
        iter->iter[i]->maxcnt = maxcnt;
6233
0
}
6234
6235
void bam_mplp_destroy(bam_mplp_t iter)
6236
0
{
6237
0
    int i;
6238
0
    for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
6239
0
    free(iter->iter); free(iter->pos); free(iter->tid);
6240
0
    free(iter->n_plp); free(iter->plp);
6241
0
    free(iter);
6242
0
}
6243
6244
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
6245
0
{
6246
0
    int i, ret = 0;
6247
0
    hts_pos_t new_min_pos = HTS_POS_MAX;
6248
0
    uint32_t new_min_tid = (uint32_t)-1;
6249
0
    for (i = 0; i < iter->n; ++i) {
6250
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6251
0
            int tid;
6252
0
            hts_pos_t pos;
6253
0
            iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
6254
0
            if ( iter->iter[i]->error ) return -1;
6255
0
            if (iter->plp[i]) {
6256
0
                iter->tid[i] = tid;
6257
0
                iter->pos[i] = pos;
6258
0
            } else {
6259
0
                iter->tid[i] = 0;
6260
0
                iter->pos[i] = 0;
6261
0
            }
6262
0
        }
6263
0
        if (iter->plp[i]) {
6264
0
            if (iter->tid[i] < new_min_tid) {
6265
0
                new_min_tid = iter->tid[i];
6266
0
                new_min_pos = iter->pos[i];
6267
0
            } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
6268
0
                new_min_pos = iter->pos[i];
6269
0
            }
6270
0
        }
6271
0
    }
6272
0
    iter->min_pos = new_min_pos;
6273
0
    iter->min_tid = new_min_tid;
6274
0
    if (new_min_pos == HTS_POS_MAX) return 0;
6275
0
    *_tid = new_min_tid; *_pos = new_min_pos;
6276
0
    for (i = 0; i < iter->n; ++i) {
6277
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6278
0
            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
6279
0
            ++ret;
6280
0
        } else n_plp[i] = 0, plp[i] = 0;
6281
0
    }
6282
0
    return ret;
6283
0
}
6284
6285
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
6286
0
{
6287
0
    hts_pos_t pos64 = 0;
6288
0
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
6289
0
    if (ret >= 0) {
6290
0
        if (pos64 < INT_MAX) {
6291
0
            *_pos = pos64;
6292
0
        } else {
6293
0
            hts_log_error("Position %"PRId64" too large", pos64);
6294
0
            *_pos = INT_MAX;
6295
0
            return -1;
6296
0
        }
6297
0
    }
6298
0
    return ret;
6299
0
}
6300
6301
void bam_mplp_reset(bam_mplp_t iter)
6302
0
{
6303
0
    int i;
6304
0
    iter->min_pos = HTS_POS_MAX;
6305
0
    iter->min_tid = (uint32_t)-1;
6306
0
    for (i = 0; i < iter->n; ++i) {
6307
0
        bam_plp_reset(iter->iter[i]);
6308
0
        iter->pos[i] = HTS_POS_MAX;
6309
0
        iter->tid[i] = (uint32_t)-1;
6310
0
        iter->n_plp[i] = 0;
6311
0
        iter->plp[i] = NULL;
6312
0
    }
6313
0
}
6314
6315
void bam_mplp_constructor(bam_mplp_t iter,
6316
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6317
0
    int i;
6318
0
    for (i = 0; i < iter->n; ++i)
6319
0
        bam_plp_constructor(iter->iter[i], func);
6320
0
}
6321
6322
void bam_mplp_destructor(bam_mplp_t iter,
6323
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6324
0
    int i;
6325
0
    for (i = 0; i < iter->n; ++i)
6326
0
        bam_plp_destructor(iter->iter[i], func);
6327
0
}
6328
6329
#endif // ~!defined(BAM_NO_PILEUP)