Coverage Report

Created: 2026-06-08 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/sam.c
Line
Count
Source
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
#include <regex.h>
40
41
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
42
#include "fuzz_settings.h"
43
#endif
44
45
// Suppress deprecation message for cigar_tab, which we initialise
46
#include "htslib/hts_defs.h"
47
#undef HTS_DEPRECATED
48
#define HTS_DEPRECATED(message)
49
50
#include "htslib/sam.h"
51
#include "htslib/bgzf.h"
52
#include "cram/cram.h"
53
#include "hts_internal.h"
54
#include "sam_internal.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_alloc.h"
57
#include "htslib/hts_endian.h"
58
#include "htslib/hts_expr.h"
59
#include "header.h"
60
#include "bgzf_internal.h"
61
62
#include "htslib/khash.h"
63
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
64
KHASH_SET_INIT_INT(tag)
65
66
#ifndef EFTYPE
67
0
#define EFTYPE ENOEXEC
68
#endif
69
#ifndef EOVERFLOW
70
#define EOVERFLOW ERANGE
71
#endif
72
73
/**********************
74
 *** BAM header I/O ***
75
 **********************/
76
77
HTSLIB_EXPORT
78
const int8_t bam_cigar_table[256] = {
79
    // 0 .. 47
80
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
81
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
82
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
83
84
    // 48 .. 63  (including =)
85
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
86
87
    // 64 .. 79  (including MIDNHB)
88
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
89
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
90
91
    // 80 .. 95  (including SPX)
92
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
93
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
94
95
    // 96 .. 127
96
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
99
    // 128 .. 255
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
105
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
106
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
107
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
108
};
109
110
sam_hdr_t *sam_hdr_init(void)
111
55.8k
{
112
55.8k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
113
55.8k
    if (bh == NULL) return NULL;
114
115
55.8k
    bh->cigar_tab = bam_cigar_table;
116
55.8k
    return bh;
117
55.8k
}
118
119
void sam_hdr_destroy(sam_hdr_t *bh)
120
133k
{
121
133k
    int32_t i;
122
123
133k
    if (bh == NULL) return;
124
125
67.5k
    if (bh->ref_count > 0) {
126
11.7k
        --bh->ref_count;
127
11.7k
        return;
128
11.7k
    }
129
130
55.8k
    if (bh->target_name) {
131
7.40k
        for (i = 0; i < bh->n_targets; ++i)
132
4.75k
            free(bh->target_name[i]);
133
2.64k
        free(bh->target_name);
134
2.64k
        free(bh->target_len);
135
2.64k
    }
136
55.8k
    free(bh->text);
137
55.8k
    if (bh->hrecs)
138
38.6k
        sam_hrecs_free(bh->hrecs);
139
55.8k
    if (bh->sdict)
140
260
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
141
55.8k
    free(bh);
142
55.8k
}
143
144
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
145
// references before sam_hdr_t::hrecs is populated
146
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
147
0
{
148
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
149
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
150
0
    int i;
151
0
    if (!dest_long_refs) return -1;
152
153
0
    for (i = 0; i < h->n_targets; i++) {
154
0
        int ret;
155
0
        khiter_t ksrc, kdest;
156
0
        if (h->target_len[i] < UINT32_MAX) continue;
157
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
158
0
        if (ksrc == kh_end(src_long_refs)) continue;
159
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
160
0
        if (ret < 0) {
161
0
            kh_destroy(s2i, dest_long_refs);
162
0
            return -1;
163
0
        }
164
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
165
0
    }
166
167
0
    h->sdict = dest_long_refs;
168
0
    return 0;
169
0
}
170
171
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
172
27.4k
{
173
27.4k
    if (h0 == NULL) return NULL;
174
27.4k
    sam_hdr_t *h;
175
27.4k
    if ((h = sam_hdr_init()) == NULL) return NULL;
176
    // copy the simple data
177
27.4k
    h->n_targets = 0;
178
27.4k
    h->ignore_sam_err = h0->ignore_sam_err;
179
27.4k
    h->l_text = 0;
180
181
    // Then the pointery stuff
182
183
27.4k
    if (!h0->hrecs) {
184
177
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
185
177
        if (!h->target_len) goto fail;
186
177
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
187
177
        if (!h->target_name) goto fail;
188
189
177
        int i;
190
372
        for (i = 0; i < h0->n_targets; ++i) {
191
195
            h->target_len[i] = h0->target_len[i];
192
195
            h->target_name[i] = strdup(h0->target_name[i]);
193
195
            if (!h->target_name[i]) break;
194
195
        }
195
177
        h->n_targets = i;
196
177
        if (i < h0->n_targets) goto fail;
197
198
177
        if (h0->sdict) {
199
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
200
0
        }
201
177
    }
202
203
27.4k
    if (h0->hrecs) {
204
27.2k
        kstring_t tmp = { 0, 0, NULL };
205
27.2k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
206
0
            free(ks_release(&tmp));
207
0
            goto fail;
208
0
        }
209
210
27.2k
        h->l_text = tmp.l;
211
27.2k
        h->text   = ks_release(&tmp);
212
213
27.2k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
214
0
            goto fail;
215
27.2k
    } else {
216
177
        h->l_text = h0->text ? h0->l_text : 0;
217
177
        h->text = hts_malloc_ps(sizeof(*h->text), h->l_text, 1);
218
177
        if (!h->text) goto fail;
219
177
        if (h0->text)
220
177
            memcpy(h->text, h0->text, h->l_text);
221
177
        h->text[h->l_text] = '\0';
222
177
    }
223
224
27.4k
    return h;
225
226
0
 fail:
227
0
    sam_hdr_destroy(h);
228
0
    return NULL;
229
27.4k
}
230
231
sam_hdr_t *bam_hdr_read(BGZF *fp)
232
2.13k
{
233
2.13k
    sam_hdr_t *h;
234
2.13k
    uint8_t buf[4];
235
2.13k
    int magic_len, has_EOF;
236
2.13k
    int32_t i, name_len, num_names = 0;
237
2.13k
    size_t bufsize;
238
2.13k
    ssize_t bytes;
239
    // check EOF
240
2.13k
    has_EOF = bgzf_check_EOF(fp);
241
2.13k
    if (has_EOF < 0) {
242
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
243
2.13k
    } else if (has_EOF == 0) {
244
2.13k
        hts_log_warning("EOF marker is absent. The input is probably truncated");
245
2.13k
    }
246
    // read "BAM1"
247
2.13k
    magic_len = bgzf_read(fp, buf, 4);
248
2.13k
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
249
3
        hts_log_error("Invalid BAM binary header");
250
3
        return 0;
251
3
    }
252
2.13k
    h = sam_hdr_init();
253
2.13k
    if (!h) goto nomem;
254
255
    // read plain text and the number of reference sequences
256
2.13k
    bytes = bgzf_read(fp, buf, 4);
257
2.13k
    if (bytes != 4) goto read_err;
258
2.12k
    h->l_text = le_to_u32(buf);
259
260
2.12k
    bufsize = h->l_text + 1;
261
2.12k
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
262
2.12k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
263
2.12k
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
264
2.12k
#endif
265
2.12k
    h->text = (char*)malloc(bufsize);
266
2.12k
    if (!h->text) goto nomem;
267
2.12k
    h->text[h->l_text] = 0; // make sure it is NULL terminated
268
2.12k
    bytes = bgzf_read(fp, h->text, h->l_text);
269
2.12k
    if (bytes != h->l_text) goto read_err;
270
271
1.95k
    bytes = bgzf_read(fp, &h->n_targets, 4);
272
1.95k
    if (bytes != 4) goto read_err;
273
1.94k
    if (fp->is_be) ed_swap_4p(&h->n_targets);
274
275
1.94k
    if (h->n_targets < 0) goto invalid;
276
277
    // read reference sequence names and lengths
278
1.89k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
279
1.89k
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
280
12
        goto nomem;
281
1.88k
#endif
282
1.88k
    if (h->n_targets > 0) {
283
840
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
284
840
        if (!h->target_name) goto nomem;
285
840
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
286
840
        if (!h->target_len) goto nomem;
287
840
    }
288
1.04k
    else {
289
1.04k
        h->target_name = NULL;
290
1.04k
        h->target_len = NULL;
291
1.04k
    }
292
293
3.26k
    for (i = 0; i != h->n_targets; ++i) {
294
1.56k
        bytes = bgzf_read(fp, &name_len, 4);
295
1.56k
        if (bytes != 4) goto read_err;
296
1.50k
        if (fp->is_be) ed_swap_4p(&name_len);
297
1.50k
        if (name_len <= 0) goto invalid;
298
299
1.46k
        h->target_name[i] = (char*)malloc(name_len);
300
1.46k
        if (!h->target_name[i]) goto nomem;
301
1.46k
        num_names++;
302
303
1.46k
        bytes = bgzf_read(fp, h->target_name[i], name_len);
304
1.46k
        if (bytes != name_len) goto read_err;
305
306
1.39k
        if (h->target_name[i][name_len - 1] != '\0') {
307
            /* Fix missing NUL-termination.  Is this being too nice?
308
               We could alternatively bail out with an error. */
309
996
            char *new_name;
310
996
            if (name_len == INT32_MAX) goto invalid;
311
996
            new_name = realloc(h->target_name[i], name_len + 1);
312
996
            if (new_name == NULL) goto nomem;
313
996
            h->target_name[i] = new_name;
314
996
            h->target_name[i][name_len] = '\0';
315
996
        }
316
317
1.39k
        bytes = bgzf_read(fp, &h->target_len[i], 4);
318
1.39k
        if (bytes != 4) goto read_err;
319
1.37k
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
320
1.37k
    }
321
1.69k
    return h;
322
323
18
 nomem:
324
18
    hts_log_error("Out of memory");
325
18
    goto clean;
326
327
327
 read_err:
328
327
    if (bytes < 0) {
329
21
        hts_log_error("Error reading BGZF stream");
330
306
    } else {
331
306
        hts_log_error("Truncated BAM header");
332
306
    }
333
327
    goto clean;
334
335
93
 invalid:
336
93
    hts_log_error("Invalid BAM binary header");
337
338
438
 clean:
339
438
    if (h != NULL) {
340
438
        h->n_targets = num_names; // ensure we free only allocated target_names
341
438
        sam_hdr_destroy(h);
342
438
    }
343
438
    return NULL;
344
93
}
345
346
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
347
5.52k
{
348
5.52k
    int32_t i, name_len, x;
349
5.52k
    kstring_t hdr_ks = { 0, 0, NULL };
350
5.52k
    char *text;
351
5.52k
    uint32_t l_text;
352
353
5.52k
    if (!h) return -1;
354
355
5.52k
    if (h->hrecs) {
356
5.46k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
357
5.46k
        if (hdr_ks.l > UINT32_MAX) {
358
0
            hts_log_error("Header too long for BAM format");
359
0
            free(hdr_ks.s);
360
0
            return -1;
361
5.46k
        } else if (hdr_ks.l > INT32_MAX) {
362
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
363
0
            hts_log_warning("Output file may not be portable");
364
0
        }
365
5.46k
        text = hdr_ks.s;
366
5.46k
        l_text = hdr_ks.l;
367
5.46k
    } else {
368
59
        if (h->l_text > UINT32_MAX) {
369
0
            hts_log_error("Header too long for BAM format");
370
0
            return -1;
371
59
        } else if (h->l_text > INT32_MAX) {
372
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
373
0
            hts_log_warning("Output file may not be portable");
374
0
        }
375
59
        text = h->text;
376
59
        l_text = h->l_text;
377
59
    }
378
    // write "BAM1"
379
5.52k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
380
    // write plain text and the number of reference sequences
381
5.52k
    if (fp->is_be) {
382
0
        x = ed_swap_4(l_text);
383
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
384
0
        if (l_text) {
385
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
386
0
        }
387
0
        x = ed_swap_4(h->n_targets);
388
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
389
5.52k
    } else {
390
5.52k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
391
5.52k
        if (l_text) {
392
1.76k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
393
1.76k
        }
394
5.52k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
395
5.52k
    }
396
5.52k
    free(hdr_ks.s);
397
    // write sequence names and lengths
398
6.08k
    for (i = 0; i != h->n_targets; ++i) {
399
565
        char *p = h->target_name[i];
400
565
        name_len = strlen(p) + 1;
401
565
        if (fp->is_be) {
402
0
            x = ed_swap_4(name_len);
403
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
404
565
        } else {
405
565
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
406
565
        }
407
565
        if (bgzf_write(fp, p, name_len) < 0) return -1;
408
565
        if (fp->is_be) {
409
0
            x = ed_swap_4(h->target_len[i]);
410
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
411
565
        } else {
412
565
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
413
565
        }
414
565
    }
415
5.52k
    if (bgzf_flush(fp) < 0) return -1;
416
5.52k
    return 0;
417
5.52k
}
418
419
// Wrap around bam_name2id() to get the right signature for hts_name2id_f
420
0
static int bam_name2id_wrapper(void *vhdr, const char *ref) {
421
0
    return bam_name2id((sam_hdr_t *) vhdr, ref);
422
0
}
423
424
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
425
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
426
0
    return hts_parse_region(s, tid, beg, end, bam_name2id_wrapper, h, flags);
427
0
}
428
429
/*************************
430
 *** BAM alignment I/O ***
431
 *************************/
432
433
bam1_t *bam_init1(void)
434
16.4k
{
435
16.4k
    return (bam1_t*)calloc(1, sizeof(bam1_t));
436
16.4k
}
437
438
int sam_realloc_bam_data(bam1_t *b, size_t desired)
439
1.12M
{
440
1.12M
    uint32_t new_m_data;
441
1.12M
    uint8_t *new_data;
442
1.12M
    new_m_data = desired;
443
1.12M
    kroundup32(new_m_data); // next power of 2
444
1.12M
    new_m_data += 32; // reduces malloc arena migrations?
445
1.12M
    if (new_m_data < desired) {
446
0
        errno = ENOMEM; // Not strictly true but we can't store the size
447
0
        return -1;
448
0
    }
449
1.12M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
450
1.12M
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
451
39
        errno = ENOMEM;
452
39
        return -1;
453
39
    }
454
1.12M
#endif
455
1.12M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
456
1.12M
        new_data = realloc(b->data, new_m_data);
457
1.12M
    } else {
458
0
        if ((new_data = malloc(new_m_data)) != NULL) {
459
0
            if (b->l_data > 0)
460
0
                memcpy(new_data, b->data,
461
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
462
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
463
0
        }
464
0
    }
465
1.12M
    if (!new_data) return -1;
466
1.12M
    b->data = new_data;
467
1.12M
    b->m_data = new_m_data;
468
1.12M
    return 0;
469
1.12M
}
470
471
void bam_destroy1(bam1_t *b)
472
26.2M
{
473
26.2M
    if (b == 0) return;
474
26.2M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
475
26.2M
        free(b->data);
476
26.2M
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
477
            // In case of reuse
478
26.2M
            b->data = NULL;
479
26.2M
            b->m_data = 0;
480
26.2M
            b->l_data = 0;
481
26.2M
        }
482
26.2M
    }
483
484
26.2M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
485
16.4k
        free(b);
486
26.2M
}
487
488
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
489
5.50M
{
490
5.50M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
491
5.50M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
492
5.50M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
493
5.50M
    bdst->l_data = bsrc->l_data;
494
5.50M
    bdst->id = bsrc->id;
495
5.50M
    return bdst;
496
5.50M
}
497
498
bam1_t *bam_dup1(const bam1_t *bsrc)
499
0
{
500
0
    if (bsrc == NULL) return NULL;
501
0
    bam1_t *bdst = bam_init1();
502
0
    if (bdst == NULL) return NULL;
503
0
    if (bam_copy1(bdst, bsrc) == NULL) {
504
0
        bam_destroy1(bdst);
505
0
        return NULL;
506
0
    }
507
0
    return bdst;
508
0
}
509
510
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
511
                             hts_pos_t *rlen, hts_pos_t *qlen)
512
1.51k
{
513
1.51k
    int k;
514
1.51k
    *rlen = *qlen = 0;
515
45.4k
    for (k = 0; k < n_cigar; ++k) {
516
43.8k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
517
43.8k
        int len = bam_cigar_oplen(cigar[k]);
518
43.8k
        if (type & 1) *qlen += len;
519
43.8k
        if (type & 2) *rlen += len;
520
43.8k
    }
521
1.51k
}
522
523
static int subtract_check_underflow(size_t length, size_t *limit)
524
82.5M
{
525
82.5M
    if (length <= *limit) {
526
82.5M
        *limit -= length;
527
82.5M
        return 0;
528
82.5M
    }
529
530
0
    return -1;
531
82.5M
}
532
533
int bam_set1(bam1_t *bam,
534
             size_t l_qname, const char *qname,
535
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
536
             size_t n_cigar, const uint32_t *cigar,
537
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
538
             size_t l_seq, const char *seq, const char *qual,
539
             size_t l_aux)
540
16.5M
{
541
    // use a default qname "*" if none is provided
542
16.5M
    if (l_qname == 0) {
543
14.9M
        l_qname = 1;
544
14.9M
        qname = "*";
545
14.9M
    }
546
547
    // note: the qname is stored nul terminated and padded as described in the
548
    // documentation for the bam1_t struct.
549
16.5M
    size_t qname_nuls = 4 - l_qname % 4;
550
551
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
552
    // can't use bam_endpos() directly as some fields not yet set up.
553
16.5M
    hts_pos_t rlen = 0, qlen = 0;
554
16.5M
    if (!(flag & BAM_FUNMAP)) {
555
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
556
0
    }
557
16.5M
    if (rlen == 0) {
558
16.5M
        rlen = 1;
559
16.5M
    }
560
561
    // validate parameters
562
16.5M
    if (l_qname > 254) {
563
114
        hts_log_error("Query name too long");
564
114
        errno = EINVAL;
565
114
        return -1;
566
114
    }
567
16.5M
    if (HTS_POS_MAX - rlen <= pos) {
568
0
        hts_log_error("Read ends beyond highest supported position");
569
0
        errno = EINVAL;
570
0
        return -1;
571
0
    }
572
16.5M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
573
0
        hts_log_error("Mapped query must have a CIGAR");
574
0
        errno = EINVAL;
575
0
        return -1;
576
0
    }
577
16.5M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
578
0
        hts_log_error("CIGAR and query sequence are of different length");
579
0
        errno = EINVAL;
580
0
        return -1;
581
0
    }
582
583
16.5M
    size_t limit = INT32_MAX;
584
16.5M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
585
16.5M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
586
16.5M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
587
16.5M
    u    += subtract_check_underflow(l_seq, &limit);
588
16.5M
    u    += subtract_check_underflow(l_aux, &limit);
589
16.5M
    if (u != 0) {
590
0
        hts_log_error("Size overflow");
591
0
        errno = EINVAL;
592
0
        return -1;
593
0
    }
594
595
    // re-allocate the data buffer as needed.
596
16.5M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
597
16.5M
    if (realloc_bam_data(bam, hts_add_sat2(data_len, l_aux)) < 0) {
598
0
        return -1;
599
0
    }
600
601
16.5M
    bam->l_data = (int)data_len;
602
16.5M
    bam->core.pos = pos;
603
16.5M
    bam->core.tid = tid;
604
16.5M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
605
16.5M
    bam->core.qual = mapq;
606
16.5M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
607
16.5M
    bam->core.flag = flag;
608
16.5M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
609
16.5M
    bam->core.n_cigar = (uint32_t)n_cigar;
610
16.5M
    bam->core.l_qseq = (int32_t)l_seq;
611
16.5M
    bam->core.mtid = mtid;
612
16.5M
    bam->core.mpos = mpos;
613
16.5M
    bam->core.isize = isize;
614
615
16.5M
    uint8_t *cp = bam->data;
616
16.5M
    strncpy((char *)cp, qname, l_qname);
617
16.5M
    int i;
618
65.5M
    for (i = 0; i < qname_nuls; i++) {
619
49.0M
        cp[l_qname + i] = '\0';
620
49.0M
    }
621
16.5M
    cp += l_qname + qname_nuls;
622
623
16.5M
    if (n_cigar > 0) {
624
0
        memcpy(cp, cigar, n_cigar * 4);
625
0
    }
626
16.5M
    cp += n_cigar * 4;
627
628
641M
#define NN 16
629
16.5M
    const uint8_t *useq = (uint8_t *)seq;
630
68.6M
    for (i = 0; i + NN < l_seq; i += NN) {
631
52.1M
        int j;
632
52.1M
        const uint8_t *u2 = useq+i;
633
469M
        for (j = 0; j < NN/2; j++)
634
416M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
635
52.1M
        cp += NN/2;
636
52.1M
    }
637
17.9M
    for (; i + 1 < l_seq; i += 2) {
638
1.43M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
639
1.43M
    }
640
641
16.7M
    for (; i < l_seq; i++) {
642
235k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
643
235k
    }
644
645
16.5M
    if (qual) {
646
513
        memcpy(cp, qual, l_seq);
647
513
    }
648
16.5M
    else {
649
16.5M
        memset(cp, '\xff', l_seq);
650
16.5M
    }
651
652
16.5M
    return (int)data_len;
653
16.5M
}
654
655
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
656
5.35M
{
657
5.35M
    int k;
658
5.35M
    hts_pos_t l;
659
5.36M
    for (k = l = 0; k < n_cigar; ++k)
660
9.29k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
661
4.66k
            l += bam_cigar_oplen(cigar[k]);
662
5.35M
    return l;
663
5.35M
}
664
665
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
666
756
{
667
756
    int k;
668
756
    hts_pos_t l;
669
4.34k
    for (k = l = 0; k < n_cigar; ++k)
670
3.59k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
671
2.77k
            l += bam_cigar_oplen(cigar[k]);
672
756
    return l;
673
756
}
674
675
hts_pos_t bam_endpos(const bam1_t *b)
676
6
{
677
6
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
678
6
    if (rlen == 0) rlen = 1;
679
6
    return b->core.pos + rlen;
680
6
}
681
682
// return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
683
int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning)
684
8.04k
{
685
8.04k
    bam1_core_t *c = &b->core;
686
687
    // Bail out as fast as possible for the easy case
688
8.04k
    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
689
8.04k
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
690
6.48k
        return 0;
691
692
    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
693
    // but this is much less likely so do as a secondary check.
694
1.55k
    if (c->tid < 0 || c->pos < 0)
695
1.43k
        return 0;
696
697
    // Do we have a CG tag?
698
122
    uint8_t *CG = bam_aux_get(b, "CG");
699
122
    int saved_errno = errno;
700
122
    if (!CG) {
701
103
        if (errno != ENOENT) return -1;  // Bad aux data
702
103
        errno = saved_errno; // restore errno on expected no-CG-tag case
703
78
        return 0;
704
103
    }
705
706
    // Now we start with the serious work migrating CG to CIGAR
707
19
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
708
19
        *cigar0, CG_len, fake_bytes;
709
19
    cigar0 = bam_get_cigar(b);
710
19
    fake_bytes = c->n_cigar * 4;
711
19
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
712
13
        return 0; // not of type B,I
713
6
    CG_len = le_to_u32(CG + 2);
714
    // don't move if the real CIGAR length is shorter than the fake cigar length
715
6
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
716
717
    // move from the CG tag to the right position
718
6
    cigar_st = (uint8_t*)cigar0 - b->data;
719
6
    c->n_cigar = CG_len;
720
6
    n_cigar4 = c->n_cigar * 4;
721
6
    CG_st = CG - b->data - 2;
722
6
    CG_en = CG_st + 8 + n_cigar4;
723
6
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
724
    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
725
6
    b->l_data = b->l_data - fake_bytes + n_cigar4;
726
    // insert c->n_cigar-fake_bytes empty space to make room
727
6
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
728
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
729
6
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
730
6
    if (ori_len > CG_en) // move data after the CG tag
731
0
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
732
6
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
733
6
    if (recal_bin)
734
6
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
735
6
    if (give_warning)
736
6
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
737
6
    return 1;
738
6
}
739
740
static inline int aux_type2size(uint8_t type)
741
655k
{
742
655k
    switch (type) {
743
427k
    case 'A': case 'c': case 'C':
744
427k
        return 1;
745
90.0k
    case 's': case 'S':
746
90.0k
        return 2;
747
112k
    case 'i': case 'I': case 'f':
748
112k
        return 4;
749
13.6k
    case 'd':
750
13.6k
        return 8;
751
11.7k
    case 'Z': case 'H': case 'B':
752
11.7k
        return type;
753
476
    default:
754
476
        return 0;
755
655k
    }
756
655k
}
757
758
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
759
0
{
760
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
761
0
    uint32_t i;
762
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
763
0
}
764
765
// Fix bad records where qname is not terminated correctly.
766
1.30k
static int fixup_missing_qname_nul(bam1_t *b) {
767
1.30k
    bam1_core_t *c = &b->core;
768
769
    // Note this is called before c->l_extranul is added to c->l_qname
770
1.30k
    if (c->l_extranul > 0) {
771
864
        b->data[c->l_qname++] = '\0';
772
864
        c->l_extranul--;
773
864
    } else {
774
438
        if (b->l_data > INT_MAX - 4) return -1;
775
438
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
776
438
        b->l_data += 4;
777
438
        b->data[c->l_qname++] = '\0';
778
438
        c->l_extranul = 3;
779
438
    }
780
1.30k
    return 0;
781
1.30k
}
782
783
/*
784
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
785
 * in multi-threaded handling.  This may be worth considering for htslib2.
786
 */
787
int bam_read1(BGZF *fp, bam1_t *b)
788
3.05k
{
789
3.05k
    bam1_core_t *c = &b->core;
790
3.05k
    int32_t block_len, ret, i;
791
3.05k
    uint32_t new_l_data;
792
3.05k
    uint8_t tmp[32], *x;
793
794
3.05k
    b->l_data = 0;
795
796
3.05k
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
797
202
        if (ret == 0) return -1; // normal end-of-file
798
90
        else return -2; // truncated
799
202
    }
800
2.85k
    if (fp->is_be)
801
0
        ed_swap_4p(&block_len);
802
2.85k
    if (block_len < 32) return -4;  // block_len includes core data
803
2.63k
    if (fp->block_length - fp->block_offset > 32) {
804
        // Avoid bgzf_read and a temporary copy to a local buffer
805
2.30k
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
806
2.30k
        fp->block_offset += 32;
807
2.30k
    } else {
808
326
        x = tmp;
809
326
        if (bgzf_read(fp, x, 32) != 32) return -3;
810
326
    }
811
812
2.36k
    c->tid        = le_to_u32(x);
813
2.36k
    c->pos        = le_to_i32(x+4);
814
2.36k
    uint32_t x2   = le_to_u32(x+8);
815
2.36k
    c->bin        = x2>>16;
816
2.36k
    c->qual       = x2>>8&0xff;
817
2.36k
    c->l_qname    = x2&0xff;
818
2.36k
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
819
2.36k
    uint32_t x3   = le_to_u32(x+12);
820
2.36k
    c->flag       = x3>>16;
821
2.36k
    c->n_cigar    = x3&0xffff;
822
2.36k
    c->l_qseq     = le_to_u32(x+16);
823
2.36k
    c->mtid       = le_to_u32(x+20);
824
2.36k
    c->mpos       = le_to_i32(x+24);
825
2.36k
    c->isize      = le_to_i32(x+28);
826
827
2.36k
    new_l_data = block_len - 32 + c->l_extranul;
828
2.36k
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
829
2.28k
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
830
2.28k
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
831
122
        return -4;
832
2.16k
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
833
2.12k
    b->l_data = new_l_data;
834
835
2.12k
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
836
2.07k
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
837
1.30k
        if (fixup_missing_qname_nul(b) < 0) return -4;
838
1.30k
    }
839
5.14k
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
840
2.07k
    c->l_qname += c->l_extranul;
841
2.07k
    if (b->l_data < c->l_qname ||
842
2.07k
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
843
147
        return -4;
844
1.93k
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
845
1.93k
    if (bam_tag2cigar(b, 0, 0) < 0)
846
25
        return -4;
847
848
    // TODO: consider making this conditional
849
1.90k
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
850
1.51k
        hts_pos_t rlen, qlen;
851
1.51k
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
852
1.51k
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
853
1.51k
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
854
        // Sanity check for broken CIGAR alignments
855
1.51k
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
856
48
            hts_log_error("CIGAR and query sequence lengths differ for %s",
857
48
                    bam_get_qname(b));
858
48
            return -4;
859
48
        }
860
1.51k
    }
861
862
1.85k
    return 4 + block_len;
863
1.90k
}
864
865
int bam_write1(BGZF *fp, const bam1_t *b)
866
5.50M
{
867
5.50M
    const bam1_core_t *c = &b->core;
868
5.50M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
869
5.50M
    int i, ok;
870
5.50M
    if (c->l_qname - c->l_extranul > 255) {
871
2
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
872
2
        errno = EOVERFLOW;
873
2
        return -1;
874
2
    }
875
5.50M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
876
5.50M
    if (c->pos > INT_MAX ||
877
5.50M
        c->mpos > INT_MAX ||
878
5.50M
        c->isize < INT_MIN || c->isize > INT_MAX) {
879
101
        hts_log_error("Positional data is too large for BAM format");
880
101
        return -1;
881
101
    }
882
5.50M
    x[0] = c->tid;
883
5.50M
    x[1] = c->pos;
884
5.50M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
885
5.50M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
886
5.50M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
887
5.50M
    x[4] = c->l_qseq;
888
5.50M
    x[5] = c->mtid;
889
5.50M
    x[6] = c->mpos;
890
5.50M
    x[7] = c->isize;
891
5.50M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
892
5.50M
    if (fp->is_be) {
893
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
894
0
        y = block_len;
895
0
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
896
0
        swap_data(c, b->l_data, b->data, 1);
897
5.50M
    } else {
898
5.50M
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
899
5.50M
    }
900
5.50M
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
901
5.50M
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
902
5.50M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
903
5.50M
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
904
5.50M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
905
0
        uint8_t buf[8];
906
0
        uint32_t cigar_st, cigar_en, cigar[2];
907
0
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
908
0
        if (cigreflen >= (1<<28)) {
909
            // Length of reference covered is greater than the biggest
910
            // CIGAR operation currently allowed.
911
0
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
912
0
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
913
0
                          bam_get_qname(b), c->n_cigar, cigreflen);
914
0
            return -1;
915
0
        }
916
0
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
917
0
        cigar_en = cigar_st + c->n_cigar * 4;
918
0
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
919
0
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
920
0
        u32_to_le(cigar[0], buf);
921
0
        u32_to_le(cigar[1], buf + 4);
922
0
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
923
0
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
924
0
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
925
0
        u32_to_le(c->n_cigar, buf);
926
0
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
927
0
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
928
0
    }
929
5.50M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
930
5.50M
    return ok? 4 + block_len : -1;
931
5.50M
}
932
933
/*
934
 * Write a BAM file and append to the in-memory index simultaneously.
935
 */
936
5.50M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
937
5.50M
    BGZF *bfp = fp->fp.bgzf;
938
939
5.50M
    if (!fp->idx)
940
5.50M
        return bam_write1(bfp, b);
941
942
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
943
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
944
0
        return -1;
945
0
    if (!bfp->mt)
946
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
947
948
0
    int ret = bam_write1(bfp, b);
949
0
    if (ret < 0)
950
0
        return -1;
951
952
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
953
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
954
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
955
0
        ret = -1;
956
0
    }
957
958
0
    return ret;
959
0
}
960
961
/*
962
 * Set the qname in a BAM record
963
 */
964
int bam_set_qname(bam1_t *rec, const char *qname)
965
0
{
966
0
    if (!rec) return -1;
967
0
    if (!qname || !*qname) return -1;
968
969
0
    size_t old_len = rec->core.l_qname;
970
0
    size_t new_len = strlen(qname) + 1;
971
0
    if (new_len < 1 || new_len > 255) return -1;
972
973
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
974
975
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
976
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
977
978
    // Make room
979
0
    if (new_len + extranul != rec->core.l_qname)
980
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
981
    // Copy in new name and pad if needed
982
0
    memcpy(rec->data, qname, new_len);
983
0
    int n;
984
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
985
986
0
    rec->l_data = new_data_len;
987
0
    rec->core.l_qname = new_len + extranul;
988
0
    rec->core.l_extranul = extranul;
989
990
0
    return 0;
991
0
}
992
993
/********************
994
 *** BAM indexing ***
995
 ********************/
996
997
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
998
0
{
999
0
    int n_lvls, i, fmt, ret;
1000
0
    bam1_t *b;
1001
0
    hts_idx_t *idx;
1002
0
    sam_hdr_t *h;
1003
0
    h = sam_hdr_read(fp);
1004
0
    if (h == NULL) return NULL;
1005
0
    if (min_shift > 0) {
1006
0
        hts_pos_t max_len = 0;
1007
0
        for (i = 0; i < h->n_targets; ++i) {
1008
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
1009
0
            if (max_len < len) max_len = len;
1010
0
        }
1011
0
        n_lvls = 0;
1012
0
        hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1013
0
        fmt = HTS_FMT_CSI;
1014
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1015
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1016
0
    b = bam_init1();
1017
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
1018
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
1019
0
        if (ret < 0) { // unsorted or doesn't fit
1020
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
1021
0
            goto err;
1022
0
        }
1023
0
    }
1024
0
    if (ret < -1) goto err; // corrupted BAM file
1025
1026
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
1027
0
    sam_hdr_destroy(h);
1028
0
    bam_destroy1(b);
1029
0
    return idx;
1030
1031
0
err:
1032
0
    bam_destroy1(b);
1033
0
    hts_idx_destroy(idx);
1034
0
    return NULL;
1035
0
}
1036
1037
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1038
0
{
1039
0
    hts_idx_t *idx;
1040
0
    htsFile *fp;
1041
0
    int ret = 0;
1042
1043
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1044
0
    if (nthreads)
1045
0
        hts_set_threads(fp, nthreads);
1046
1047
0
    switch (fp->format.format) {
1048
0
    case cram:
1049
1050
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1051
0
        break;
1052
1053
0
    case bam:
1054
0
    case sam:
1055
0
        if (fp->format.compression != bgzf) {
1056
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1057
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1058
0
            ret = -1;
1059
0
            break;
1060
0
        }
1061
0
        idx = sam_index(fp, min_shift);
1062
0
        if (idx) {
1063
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1064
0
            if (ret < 0) ret = -4;
1065
0
            hts_idx_destroy(idx);
1066
0
        }
1067
0
        else ret = -1;
1068
0
        break;
1069
1070
0
    default:
1071
0
        ret = -3;
1072
0
        break;
1073
0
    }
1074
0
    hts_close(fp);
1075
1076
0
    return ret;
1077
0
}
1078
1079
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1080
0
{
1081
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1082
0
}
1083
1084
int sam_index_build(const char *fn, int min_shift)
1085
0
{
1086
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1087
0
}
1088
1089
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1090
#undef bam_index_build
1091
int bam_index_build(const char *fn, int min_shift)
1092
0
{
1093
0
    return sam_index_build2(fn, NULL, min_shift);
1094
0
}
1095
1096
// Initialise fp->idx for the current format type.
1097
// This must be called after the header has been written but no other data.
1098
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1099
0
    fp->fnidx = fnidx;
1100
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1101
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1102
0
        int n_lvls, fmt = HTS_FMT_CSI;
1103
0
        if (min_shift > 0) {
1104
0
            int64_t max_len = 0;
1105
0
            int i;
1106
0
            for (i = 0; i < h->n_targets; ++i)
1107
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1108
0
            n_lvls = 0;
1109
0
            hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1110
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1111
1112
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1113
0
        return fp->idx ? 0 : -1;
1114
0
    }
1115
1116
0
    if (fp->format.format == cram) {
1117
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1118
0
        return fp->fp.cram->idxfp ? 0 : -1;
1119
0
    }
1120
1121
0
    return -1;
1122
0
}
1123
1124
// Finishes an index. Call after the last record has been written.
1125
// Returns 0 on success, <0 on failure.
1126
0
int sam_idx_save(htsFile *fp) {
1127
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1128
0
        fp->format.format == vcf || fp->format.format == sam) {
1129
0
        int ret;
1130
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1131
0
            errno = -ret;
1132
0
            return -1;
1133
0
        }
1134
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1135
0
            return -1;
1136
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1137
1138
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1139
0
            return -1;
1140
1141
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1142
1143
0
    } else if (fp->format.format == cram) {
1144
        // flushed and closed by cram_close
1145
0
    }
1146
1147
0
    return 0;
1148
0
}
1149
1150
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1151
0
{
1152
0
    htsFile *fp = (htsFile *)fpv;
1153
0
    bam1_t *b = bv;
1154
0
    fp->line.l = 0;
1155
0
    int ret = sam_read1(fp, fp->bam_header, b);
1156
0
    if (ret >= 0) {
1157
0
        *tid = b->core.tid;
1158
0
        *beg = b->core.pos;
1159
0
        *end = bam_endpos(b);
1160
0
    }
1161
0
    return ret;
1162
0
}
1163
1164
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1165
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1166
0
{
1167
0
    htsFile *fp = (htsFile *)fpv;
1168
0
    bam1_t *b = bv;
1169
0
    fp->line.l = 0;
1170
0
    int ret = sam_read1(fp, fp->bam_header, b);
1171
0
    return ret;
1172
0
}
1173
1174
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1175
// samtools/bam.c.
1176
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1177
0
{
1178
0
    const char *rg;
1179
0
    kstring_t lib = { 0, 0, NULL };
1180
0
    rg = (char *)bam_aux_get(b, "RG");
1181
1182
0
    if (!rg)
1183
0
        return NULL;
1184
0
    else
1185
0
        rg++;
1186
1187
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1188
0
        return NULL;
1189
1190
0
    static char LB_text[1024];
1191
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1192
1193
0
    memcpy(LB_text, lib.s, len);
1194
0
    LB_text[len] = 0;
1195
1196
0
    free(lib.s);
1197
1198
0
    return LB_text;
1199
0
}
1200
1201
1202
// Bam record pointer and SAM header combined
1203
typedef struct {
1204
    const sam_hdr_t *h;
1205
    const bam1_t *b;
1206
} hb_pair;
1207
1208
// Looks up variable names in str and replaces them with their value.
1209
// Also supports aux tags.
1210
//
1211
// Note the expression parser deliberately overallocates str size so it
1212
// is safe to use memcmp over strcmp.
1213
static int bam_sym_lookup(void *data, char *str, char **end,
1214
0
                          hts_expr_val_t *res) {
1215
0
    hb_pair *hb = (hb_pair *)data;
1216
0
    const bam1_t *b = hb->b;
1217
1218
0
    res->is_str = 0;
1219
0
    switch(*str) {
1220
0
    case 'c':
1221
0
        if (memcmp(str, "cigar", 5) == 0) {
1222
0
            *end = str+5;
1223
0
            res->is_str = 1;
1224
0
            ks_clear(&res->s);
1225
0
            uint32_t *cigar = bam_get_cigar(b);
1226
0
            int i, n = b->core.n_cigar, r = 0;
1227
0
            if (n) {
1228
0
                for (i = 0; i < n; i++) {
1229
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1230
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1231
0
                }
1232
0
                r |= kputs("", &res->s) < 0;
1233
0
            } else {
1234
0
                r |= kputs("*", &res->s) < 0;
1235
0
            }
1236
0
            return r ? -1 : 0;
1237
0
        }
1238
0
        break;
1239
1240
0
    case 'e':
1241
0
        if (memcmp(str, "endpos", 6) == 0) {
1242
0
            *end = str+6;
1243
0
            res->d = bam_endpos(b);
1244
0
            return 0;
1245
0
        }
1246
0
        break;
1247
1248
0
    case 'f':
1249
0
        if (memcmp(str, "flag", 4) == 0) {
1250
0
            str = *end = str+4;
1251
0
            if (*str != '.') {
1252
0
                res->d = b->core.flag;
1253
0
                return 0;
1254
0
            } else {
1255
0
                str++;
1256
0
                if (!memcmp(str, "paired", 6)) {
1257
0
                    *end = str+6;
1258
0
                    res->d = b->core.flag & BAM_FPAIRED;
1259
0
                    return 0;
1260
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1261
0
                    *end = str+11;
1262
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1263
0
                    return 0;
1264
0
                } else if (!memcmp(str, "unmap", 5)) {
1265
0
                    *end = str+5;
1266
0
                    res->d = b->core.flag & BAM_FUNMAP;
1267
0
                    return 0;
1268
0
                } else if (!memcmp(str, "munmap", 6)) {
1269
0
                    *end = str+6;
1270
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1271
0
                    return 0;
1272
0
                } else if (!memcmp(str, "reverse", 7)) {
1273
0
                    *end = str+7;
1274
0
                    res->d = b->core.flag & BAM_FREVERSE;
1275
0
                    return 0;
1276
0
                } else if (!memcmp(str, "mreverse", 8)) {
1277
0
                    *end = str+8;
1278
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1279
0
                    return 0;
1280
0
                } else if (!memcmp(str, "read1", 5)) {
1281
0
                    *end = str+5;
1282
0
                    res->d = b->core.flag & BAM_FREAD1;
1283
0
                    return 0;
1284
0
                } else if (!memcmp(str, "read2", 5)) {
1285
0
                    *end = str+5;
1286
0
                    res->d = b->core.flag & BAM_FREAD2;
1287
0
                    return 0;
1288
0
                } else if (!memcmp(str, "secondary", 9)) {
1289
0
                    *end = str+9;
1290
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1291
0
                    return 0;
1292
0
                } else if (!memcmp(str, "qcfail", 6)) {
1293
0
                    *end = str+6;
1294
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1295
0
                    return 0;
1296
0
                } else if (!memcmp(str, "dup", 3)) {
1297
0
                    *end = str+3;
1298
0
                    res->d = b->core.flag & BAM_FDUP;
1299
0
                    return 0;
1300
0
                } else if (!memcmp(str, "supplementary", 13)) {
1301
0
                    *end = str+13;
1302
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1303
0
                    return 0;
1304
0
                } else {
1305
0
                    hts_log_error("Unrecognised flag string");
1306
0
                    return -1;
1307
0
                }
1308
0
            }
1309
0
        }
1310
0
        break;
1311
1312
0
    case 'h':
1313
0
        if (memcmp(str, "hclen", 5) == 0) {
1314
0
            int hclen = 0;
1315
0
            uint32_t *cigar = bam_get_cigar(b);
1316
0
            uint32_t ncigar = b->core.n_cigar;
1317
1318
            // left
1319
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1320
0
                hclen = bam_cigar_oplen(cigar[0]);
1321
1322
            // right
1323
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1324
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1325
1326
0
            *end = str+5;
1327
0
            res->d = hclen;
1328
0
            return 0;
1329
0
        }
1330
0
        break;
1331
1332
0
    case 'l':
1333
0
        if (memcmp(str, "library", 7) == 0) {
1334
0
            *end = str+7;
1335
0
            res->is_str = 1;
1336
0
            const char *lib = bam_get_library(hb->h, b);
1337
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1338
0
            return 0;
1339
0
        }
1340
0
        break;
1341
1342
0
    case 'm':
1343
0
        if (memcmp(str, "mapq", 4) == 0) {
1344
0
            *end = str+4;
1345
0
            res->d = b->core.qual;
1346
0
            return 0;
1347
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1348
0
            *end = str+4;
1349
0
            res->d = b->core.mpos+1;
1350
0
            return 0;
1351
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1352
0
            *end = str+6;
1353
0
            res->is_str = 1;
1354
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1355
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1356
0
            return 0;
1357
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1358
0
            *end = str+6;
1359
0
            res->d = b->core.mtid;
1360
0
            return 0;
1361
0
        }
1362
0
        break;
1363
1364
0
    case 'n':
1365
0
        if (memcmp(str, "ncigar", 6) == 0) {
1366
0
            *end = str+6;
1367
0
            res->d = b->core.n_cigar;
1368
0
            return 0;
1369
0
        }
1370
0
        break;
1371
1372
0
    case 'p':
1373
0
        if (memcmp(str, "pos", 3) == 0) {
1374
0
            *end = str+3;
1375
0
            res->d = b->core.pos+1;
1376
0
            return 0;
1377
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1378
0
            *end = str+5;
1379
0
            res->d = b->core.mpos+1;
1380
0
            return 0;
1381
0
        }
1382
0
        break;
1383
1384
0
    case 'q':
1385
0
        if (memcmp(str, "qlen", 4) == 0) {
1386
0
            *end = str+4;
1387
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1388
0
            return 0;
1389
0
        } else if (memcmp(str, "qname", 5) == 0) {
1390
0
            *end = str+5;
1391
0
            res->is_str = 1;
1392
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1393
0
            return 0;
1394
0
        } else if (memcmp(str, "qual", 4) == 0) {
1395
0
            *end = str+4;
1396
0
            ks_clear(&res->s);
1397
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1398
0
                return -1;
1399
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1400
0
            res->s.l = b->core.l_qseq;
1401
0
            res->is_str = 1;
1402
0
            return 0;
1403
0
        }
1404
0
        break;
1405
1406
0
    case 'r':
1407
0
        if (memcmp(str, "rlen", 4) == 0) {
1408
0
            *end = str+4;
1409
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1410
0
            return 0;
1411
0
        } else if (memcmp(str, "rname", 5) == 0) {
1412
0
            *end = str+5;
1413
0
            res->is_str = 1;
1414
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1415
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1416
0
            return 0;
1417
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1418
0
            *end = str+5;
1419
0
            res->is_str = 1;
1420
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1421
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1422
0
            return 0;
1423
0
        } else if (memcmp(str, "refid", 5) == 0) {
1424
0
            *end = str+5;
1425
0
            res->d = b->core.tid;
1426
0
            return 0;
1427
0
        }
1428
0
        break;
1429
1430
0
    case 's':
1431
0
        if (memcmp(str, "seq", 3) == 0) {
1432
0
            *end = str+3;
1433
0
            ks_clear(&res->s);
1434
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1435
0
                return -1;
1436
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1437
0
            res->s.s[b->core.l_qseq] = 0;
1438
0
            res->s.l = b->core.l_qseq;
1439
0
            res->is_str = 1;
1440
0
            return 0;
1441
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1442
0
            int sclen = 0;
1443
0
            uint32_t *cigar = bam_get_cigar(b);
1444
0
            int ncigar = b->core.n_cigar;
1445
0
            int left = 0;
1446
1447
            // left
1448
0
            if (ncigar > 0
1449
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1450
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1451
0
            else if (ncigar > 1
1452
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1453
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1454
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1455
1456
            // right
1457
0
            if (ncigar-1 > left
1458
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1459
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1460
0
            else if (ncigar-2 > left
1461
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1462
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1463
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1464
1465
0
            *end = str+5;
1466
0
            res->d = sclen;
1467
0
            return 0;
1468
0
        }
1469
0
        break;
1470
1471
0
    case 't':
1472
0
        if (memcmp(str, "tlen", 4) == 0) {
1473
0
            *end = str+4;
1474
0
            res->d = b->core.isize;
1475
0
            return 0;
1476
0
        }
1477
0
        break;
1478
1479
0
    case '[':
1480
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1481
            /* aux tags */
1482
0
            *end = str+4;
1483
1484
0
            uint8_t *aux = bam_aux_get(b, str+1);
1485
0
            if (aux) {
1486
                // we define the truth of a tag to be its presence, even if 0.
1487
0
                res->is_true = 1;
1488
0
                switch (*aux) {
1489
0
                case 'Z':
1490
0
                case 'H':
1491
0
                    res->is_str = 1;
1492
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1493
0
                    break;
1494
1495
0
                case 'A':
1496
0
                    res->is_str = 1;
1497
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1498
0
                    break;
1499
1500
0
                case 'i': case 'I':
1501
0
                case 's': case 'S':
1502
0
                case 'c': case 'C':
1503
0
                    res->is_str = 0;
1504
0
                    res->d = bam_aux2i(aux);
1505
0
                    break;
1506
1507
0
                case 'f':
1508
0
                case 'd':
1509
0
                    res->is_str = 0;
1510
0
                    res->d = bam_aux2f(aux);
1511
0
                    break;
1512
1513
0
                default:
1514
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1515
0
                                  *aux);
1516
0
                    return -1;
1517
0
                }
1518
0
                return 0;
1519
1520
0
            } else {
1521
                // hence absent tags are always false (and strings)
1522
0
                res->is_str = 1;
1523
0
                res->s.l = 0;
1524
0
                res->d = 0;
1525
0
                res->is_true = 0;
1526
0
                return 0;
1527
0
            }
1528
0
        }
1529
0
        break;
1530
0
    }
1531
1532
    // All successful matches in switch should return 0.
1533
    // So if we didn't match, it's a parse error.
1534
0
    return -1;
1535
0
}
1536
1537
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1538
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1539
0
{
1540
0
    hb_pair hb = {h, b};
1541
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1542
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1543
0
        hts_log_error("Couldn't process filter expression");
1544
0
        hts_expr_val_free(&res);
1545
0
        return -1;
1546
0
    }
1547
1548
0
    int t = res.is_true;
1549
0
    hts_expr_val_free(&res);
1550
1551
0
    return t;
1552
0
}
1553
1554
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1555
0
{
1556
0
    htsFile *fp = fpv;
1557
0
    bam1_t *b = bv;
1558
0
    int pass_filter, ret;
1559
1560
0
    do {
1561
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1562
0
        if (ret < 0)
1563
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1564
1565
0
        *tid = b->core.tid;
1566
0
        *beg = b->core.pos;
1567
0
        *end = bam_endpos(b);
1568
1569
0
        if (fp->filter) {
1570
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1571
0
            if (pass_filter < 0)
1572
0
                return -2;
1573
0
        } else {
1574
0
            pass_filter = 1;
1575
0
        }
1576
0
    } while (pass_filter == 0);
1577
1578
0
    return ret;
1579
0
}
1580
1581
static int cram_pseek(void *fp, int64_t offset, int whence)
1582
0
{
1583
0
    cram_fd *fd =  (cram_fd *)fp;
1584
1585
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1586
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1587
0
        return -1;
1588
1589
0
    fd->curr_position = offset;
1590
1591
0
    if (fd->ctr) {
1592
0
        cram_free_container(fd->ctr);
1593
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1594
0
            cram_free_container(fd->ctr_mt);
1595
1596
0
        fd->ctr = NULL;
1597
0
        fd->ctr_mt = NULL;
1598
0
        fd->ooc = 0;
1599
0
    }
1600
1601
0
    return 0;
1602
0
}
1603
1604
/*
1605
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1606
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1607
 *   container previously fetched. It was designed like this to integrate with the functionality
1608
 *   of the iterator stepping logic.
1609
 */
1610
1611
static int64_t cram_ptell(void *fp)
1612
0
{
1613
0
    cram_fd *fd = (cram_fd *)fp;
1614
0
    cram_container *c;
1615
0
    cram_slice *s;
1616
0
    int64_t ret = -1L;
1617
1618
0
    if (fd) {
1619
0
        if ((c = fd->ctr) != NULL) {
1620
0
            if ((s = c->slice) != NULL && s->max_rec) {
1621
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1622
0
                    fd->curr_position += c->offset + c->length;
1623
0
            }
1624
0
        }
1625
0
        ret = fd->curr_position;
1626
0
    }
1627
1628
0
    return ret;
1629
0
}
1630
1631
1632
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1633
0
{
1634
0
    switch (fp->format.format) {
1635
0
    case bam:
1636
0
    case sam:
1637
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1638
1639
0
    case cram: {
1640
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1641
1642
        // Cons up a fake "index" just pointing at the associated cram_fd:
1643
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1644
0
        if (idx == NULL) return NULL;
1645
0
        idx->fmt = HTS_FMT_CRAI;
1646
0
        idx->cram = fp->fp.cram;
1647
0
        return (hts_idx_t *) idx;
1648
0
        }
1649
1650
0
    default:
1651
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1652
0
    }
1653
0
}
1654
1655
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1656
0
{
1657
0
    return index_load(fp, fn, fnidx, flags);
1658
0
}
1659
1660
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1661
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1662
0
}
1663
1664
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1665
0
{
1666
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1667
0
}
1668
1669
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1670
0
{
1671
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1672
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1673
0
    if (iter == NULL) return NULL;
1674
1675
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1676
    // the readrec function:
1677
0
    iter->is_cram = 1;
1678
0
    iter->read_rest = 1;
1679
0
    iter->off = NULL;
1680
0
    iter->bins.a = NULL;
1681
0
    iter->readrec = readrec;
1682
1683
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1684
0
        cram_range r = { tid, beg+1, end };
1685
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1686
1687
0
        iter->curr_off = 0;
1688
        // The following fields are not required by hts_itr_next(), but are
1689
        // filled in in case user code wants to look at them.
1690
0
        iter->tid = tid;
1691
0
        iter->beg = beg;
1692
0
        iter->end = end;
1693
1694
0
        switch (ret) {
1695
0
        case 0:
1696
0
            break;
1697
1698
0
        case -2:
1699
            // No data vs this ref, so mark iterator as completed.
1700
            // Same as HTS_IDX_NONE.
1701
0
            iter->finished = 1;
1702
0
            break;
1703
1704
0
        default:
1705
0
            free(iter);
1706
0
            return NULL;
1707
0
        }
1708
0
    }
1709
0
    else switch (tid) {
1710
0
    case HTS_IDX_REST:
1711
0
        iter->curr_off = 0;
1712
0
        break;
1713
0
    case HTS_IDX_NONE:
1714
0
        iter->curr_off = 0;
1715
0
        iter->finished = 1;
1716
0
        break;
1717
0
    default:
1718
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1719
0
        abort();
1720
0
        break;
1721
0
    }
1722
1723
0
    return iter;
1724
0
}
1725
1726
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1727
0
{
1728
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1729
0
    if (idx == NULL)
1730
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1731
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1732
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1733
0
    else
1734
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1735
0
}
1736
1737
static int cram_name2id(void *fdv, const char *ref)
1738
0
{
1739
0
    cram_fd *fd = (cram_fd *) fdv;
1740
0
    return sam_hdr_name2tid(fd->header, ref);
1741
0
}
1742
1743
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1744
0
{
1745
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1746
0
    return hts_itr_querys(idx, region, bam_name2id_wrapper, hdr,
1747
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1748
0
                          sam_readrec);
1749
0
}
1750
1751
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1752
0
{
1753
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1754
0
    hts_reglist_t *r_list = NULL;
1755
0
    int r_count = 0;
1756
1757
0
    if (!cidx || !hdr)
1758
0
        return NULL;
1759
1760
0
    hts_itr_t *itr = NULL;
1761
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1762
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1763
0
        if (!r_list)
1764
0
            return NULL;
1765
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1766
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1767
0
    } else {
1768
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, bam_name2id_wrapper);
1769
0
        if (!r_list)
1770
0
            return NULL;
1771
0
        itr = hts_itr_regions(idx, r_list, r_count, bam_name2id_wrapper, hdr,
1772
0
                   hts_itr_multi_bam, sam_readrec, bgzf_pseek, bgzf_ptell);
1773
0
    }
1774
1775
0
    if (!itr)
1776
0
        hts_reglist_free(r_list, r_count);
1777
1778
0
    return itr;
1779
0
}
1780
1781
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1782
0
{
1783
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1784
1785
0
    if(!cidx || !hdr || !reglist)
1786
0
        return NULL;
1787
1788
0
    if (cidx->fmt == HTS_FMT_CRAI)
1789
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1790
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1791
0
    else
1792
0
        return hts_itr_regions(idx, reglist, regcount, bam_name2id_wrapper, hdr,
1793
0
                   hts_itr_multi_bam, sam_readrec, bgzf_pseek, bgzf_ptell);
1794
0
}
1795
1796
/**********************
1797
 *** SAM header I/O ***
1798
 **********************/
1799
1800
#include "htslib/kseq.h"
1801
#include "htslib/kstring.h"
1802
1803
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1804
0
{
1805
0
    sam_hdr_t *bh = sam_hdr_init();
1806
0
    if (!bh) return NULL;
1807
1808
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1809
0
        sam_hdr_destroy(bh);
1810
0
        return NULL;
1811
0
    }
1812
1813
0
    return bh;
1814
0
}
1815
1816
// Minimal sanitisation of a header to ensure.
1817
// - null terminated string.
1818
// - all lines start with @ (also implies no blank lines).
1819
//
1820
// Much more could be done, but currently is not, including:
1821
// - checking header types are known (HD, SQ, etc).
1822
// - syntax (eg checking tab separated fields).
1823
// - validating n_targets matches @SQ records.
1824
// - validating target lengths against @SQ records.
1825
12.1k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1826
12.1k
    if (!h)
1827
441
        return NULL;
1828
1829
    // Special case for empty headers.
1830
11.7k
    if (h->l_text == 0)
1831
6.84k
        return h;
1832
1833
4.87k
    size_t i;
1834
4.87k
    unsigned int lnum = 0;
1835
4.87k
    char *cp = h->text, last = '\n';
1836
424k
    for (i = 0; i < h->l_text; i++) {
1837
        // NB: l_text excludes terminating nul.  This finds early ones.
1838
420k
        if (cp[i] == 0)
1839
1.01k
            break;
1840
1841
        // Error on \n[^@], including duplicate newlines
1842
419k
        if (last == '\n') {
1843
8.31k
            lnum++;
1844
8.31k
            if (cp[i] != '@') {
1845
3
                hts_log_error("Malformed SAM header at line %u", lnum);
1846
3
                sam_hdr_destroy(h);
1847
3
                return NULL;
1848
3
            }
1849
8.31k
        }
1850
1851
419k
        last = cp[i];
1852
419k
    }
1853
1854
4.86k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1855
1.01k
        size_t j = i;
1856
4.46k
        while (j < h->l_text && cp[j] == '\0') j++;
1857
1.01k
        if (j < h->l_text)
1858
864
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1859
1.01k
    }
1860
1861
    // Add trailing newline and/or trailing nul if required.
1862
4.86k
    if (last != '\n') {
1863
906
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1864
1865
906
        if (h->l_text < 2 || i >= h->l_text - 2) {
1866
450
            if (h->l_text >= SIZE_MAX - 2) {
1867
0
                hts_log_error("No room for extra newline");
1868
0
                sam_hdr_destroy(h);
1869
0
                return NULL;
1870
0
            }
1871
1872
450
            cp = realloc(h->text, (size_t) h->l_text+2);
1873
450
            if (!cp) {
1874
0
                sam_hdr_destroy(h);
1875
0
                return NULL;
1876
0
            }
1877
450
            h->text = cp;
1878
450
        }
1879
906
        cp[i++] = '\n';
1880
1881
        // l_text may be larger already due to multiple nul padding
1882
906
        if (h->l_text < i)
1883
39
            h->l_text = i;
1884
906
        cp[h->l_text] = '\0';
1885
906
    }
1886
1887
4.86k
    return h;
1888
4.86k
}
1889
1890
13.8k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1891
13.8k
    sam_hdr_t* h = sam_hdr_init();
1892
13.8k
    if (!h)
1893
0
        return NULL;
1894
1895
13.8k
    if (sam_hdr_build_from_sam_file(h, fp) != 0) {
1896
9.30k
        sam_hdr_destroy(h);
1897
9.30k
        return NULL;
1898
9.30k
    }
1899
1900
4.57k
    if (fp->bam_header)
1901
0
        sam_hdr_destroy(fp->bam_header);
1902
4.57k
    fp->bam_header = sam_hdr_sanitise(h);
1903
4.57k
    fp->bam_header->ref_count = 1;
1904
1905
4.57k
    return fp->bam_header;
1906
13.8k
}
1907
1908
sam_hdr_t *sam_hdr_read(htsFile *fp)
1909
26.3k
{
1910
26.3k
    sam_hdr_t *h = NULL;
1911
26.3k
    if (!fp) {
1912
0
        errno = EINVAL;
1913
0
        return NULL;
1914
0
    }
1915
1916
26.3k
    switch (fp->format.format) {
1917
2.13k
    case bam:
1918
2.13k
        h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1919
2.13k
        break;
1920
1921
5.44k
    case cram:
1922
5.44k
        h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1923
5.44k
        break;
1924
1925
13.8k
    case sam:
1926
13.8k
        h = sam_hdr_create(fp);
1927
13.8k
        break;
1928
1929
243
    case fastq_format:
1930
4.84k
    case fasta_format:
1931
4.84k
        return sam_hdr_init();
1932
1933
0
    case empty_format:
1934
0
        errno = EPIPE;
1935
0
        return NULL;
1936
1937
0
    default:
1938
0
        errno = EFTYPE;
1939
0
        return NULL;
1940
26.3k
    }
1941
    //only sam,bam and cram reaches here
1942
21.4k
    if (h && !fp->bam_header) { //set except for sam which already has it
1943
        //for cram, it is the o/p header as for rest and not the internal header
1944
7.14k
        fp->bam_header = h;
1945
7.14k
        sam_hdr_incr_ref(fp->bam_header);
1946
7.14k
    }
1947
21.4k
    return h;
1948
26.3k
}
1949
1950
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1951
16.5k
{
1952
16.5k
    if (!fp || !h) {
1953
0
        errno = EINVAL;
1954
0
        return -1;
1955
0
    }
1956
1957
16.5k
    switch (fp->format.format) {
1958
5.52k
    case binary_format:
1959
5.52k
        fp->format.category = sequence_data;
1960
5.52k
        fp->format.format = bam;
1961
        /* fall-through */
1962
5.52k
    case bam:
1963
5.52k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1964
5.52k
        break;
1965
1966
5.52k
    case cram: {
1967
5.52k
        cram_fd *fd = fp->fp.cram;
1968
5.52k
        if (cram_set_header2(fd, h) < 0) return -1;
1969
5.44k
        if (fp->fn_aux)
1970
0
            cram_load_reference(fd, fp->fn_aux);
1971
5.44k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1972
5.44k
        }
1973
5.44k
        break;
1974
1975
5.52k
    case text_format:
1976
5.52k
        fp->format.category = sequence_data;
1977
5.52k
        fp->format.format = sam;
1978
        /* fall-through */
1979
5.52k
    case sam: {
1980
5.52k
        if (!h->hrecs && !h->text)
1981
0
            return 0;
1982
5.52k
        char *text;
1983
5.52k
        kstring_t hdr_ks = { 0, 0, NULL };
1984
5.52k
        size_t l_text;
1985
5.52k
        ssize_t bytes;
1986
5.52k
        int r = 0, no_sq = 0;
1987
1988
5.52k
        if (h->hrecs) {
1989
5.46k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
1990
0
                return -1;
1991
5.46k
            text = hdr_ks.s;
1992
5.46k
            l_text = hdr_ks.l;
1993
5.46k
        } else {
1994
59
            const char *p = NULL;
1995
80
            do {
1996
80
                const char *q = p == NULL ? h->text : p + 4;
1997
80
                p = strstr(q, "@SQ\t");
1998
80
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
1999
59
            no_sq = p == NULL;
2000
59
            text = h->text;
2001
59
            l_text = h->l_text;
2002
59
        }
2003
2004
5.52k
        if (fp->is_bgzf) {
2005
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2006
5.52k
        } else {
2007
5.52k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2008
5.52k
        }
2009
5.52k
        free(hdr_ks.s);
2010
5.52k
        if (bytes != l_text)
2011
0
            return -1;
2012
2013
5.52k
        if (no_sq) {
2014
52
            int i;
2015
117
            for (i = 0; i < h->n_targets; ++i) {
2016
65
                fp->line.l = 0;
2017
65
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2018
65
                r |= kputs(h->target_name[i], &fp->line) < 0;
2019
65
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2020
65
                r |= kputw(h->target_len[i], &fp->line) < 0;
2021
65
                r |= kputc('\n', &fp->line) < 0;
2022
65
                if (r != 0)
2023
0
                    return -1;
2024
2025
65
                if (fp->is_bgzf) {
2026
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2027
65
                } else {
2028
65
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2029
65
                }
2030
65
                if (bytes != fp->line.l)
2031
0
                    return -1;
2032
65
            }
2033
52
        }
2034
5.52k
        if (fp->is_bgzf) {
2035
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2036
5.52k
        } else {
2037
5.52k
            if (hflush(fp->fp.hfile) != 0) return -1;
2038
5.52k
        }
2039
5.52k
        }
2040
5.52k
        break;
2041
2042
5.52k
    case fastq_format:
2043
0
    case fasta_format:
2044
        // Nothing to output; FASTQ has no file headers.
2045
0
        return 0;
2046
0
        break;
2047
2048
0
    default:
2049
0
        errno = EBADF;
2050
0
        return -1;
2051
16.5k
    }
2052
    //only sam,bam and cram reaches here
2053
16.4k
    if (h) {    //the new header
2054
16.4k
        sam_hdr_t *tmp = fp->bam_header;
2055
16.4k
        fp->bam_header = sam_hdr_dup(h);
2056
16.4k
        sam_hdr_destroy(tmp);
2057
16.4k
        if (!fp->bam_header && h)
2058
0
            return -1;  //failed to duplicate
2059
16.4k
    }
2060
16.4k
    return 0;
2061
16.4k
}
2062
2063
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2064
0
{
2065
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2066
0
    size_t new_l_text;
2067
0
    if (!h || !key)
2068
0
        return -1;
2069
2070
0
    if (h->l_text > 3) {
2071
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2072
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2073
0
            *p = '\0'; // for strstr call
2074
2075
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2076
2077
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2078
0
                *p = '\n'; // change back
2079
2080
                // mark the key:val
2081
0
                beg = q;
2082
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2083
0
                end = q;
2084
2085
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2086
0
                    && strlen(val) == end - beg - 4)
2087
0
                     return 0; // val is the same, no need to change
2088
2089
0
            } else {
2090
0
                beg = end = p;
2091
0
                *p = '\n';
2092
0
            }
2093
0
        }
2094
0
    }
2095
0
    if (beg == NULL) { // no @HD
2096
0
        new_l_text = h->l_text;
2097
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2098
0
            return -1;
2099
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2100
0
        if (val) {
2101
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2102
0
                return -1;
2103
0
            new_l_text += strlen(val) + 4;
2104
0
        }
2105
0
        newtext = (char*)malloc(new_l_text + 1);
2106
0
        if (!newtext) return -1;
2107
2108
0
        if (val)
2109
0
            snprintf(newtext, new_l_text + 1,
2110
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2111
0
        else
2112
0
            snprintf(newtext, new_l_text + 1,
2113
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2114
0
    } else { // has @HD but different or no key
2115
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2116
0
        if (val) {
2117
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2118
0
                return -1;
2119
0
            new_l_text += strlen(val) + 4;
2120
0
        }
2121
0
        newtext = (char*)malloc(new_l_text + 1);
2122
0
        if (!newtext) return -1;
2123
2124
0
        if (val) {
2125
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2126
0
                    (int) (beg - h->text), h->text, key, val, end);
2127
0
        } else { //delete key
2128
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2129
0
                    (int) (beg - h->text), h->text, end);
2130
0
        }
2131
0
    }
2132
0
    free(h->text);
2133
0
    h->text = newtext;
2134
0
    h->l_text = new_l_text;
2135
0
    return 0;
2136
0
}
2137
2138
2139
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2140
0
{
2141
0
    if (!h || !key)
2142
0
        return -1;
2143
2144
0
    if (!h->hrecs)
2145
0
        return old_sam_hdr_change_HD(h, key, val);
2146
2147
0
    if (val) {
2148
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2149
0
            return -1;
2150
0
    } else {
2151
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2152
0
            return -1;
2153
0
    }
2154
0
    return sam_hdr_rebuild(h);
2155
0
}
2156
2157
/* releases existing header and sets new one; increments ref count if not
2158
duplicating */
2159
int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate)
2160
0
{
2161
0
    if (!fp)
2162
0
        return -1;
2163
2164
0
    if (duplicate) {
2165
0
        sam_hdr_t *tmp = fp->bam_header;
2166
0
        fp->bam_header = sam_hdr_dup(h);
2167
0
        sam_hdr_destroy(tmp);
2168
0
        if (!fp->bam_header && h)
2169
0
            return -1;  //duplicate failed
2170
0
    } else {
2171
0
        if (fp->bam_header != h) {  //if not the same
2172
0
            sam_hdr_destroy(fp->bam_header);
2173
0
            fp->bam_header = h;
2174
0
            sam_hdr_incr_ref(fp->bam_header);
2175
0
        }
2176
0
    }
2177
2178
0
    return 0;
2179
0
}
2180
2181
//return the bam_header, user has to use sam_hdr_incr_ref where ever required
2182
sam_hdr_t* sam_hdr_get(samFile* fp)
2183
0
{
2184
0
    if (!fp)
2185
0
        return NULL;
2186
0
    return fp->bam_header;
2187
0
}
2188
2189
/**********************
2190
 *** SAM record I/O ***
2191
 **********************/
2192
2193
// The speed of this code can vary considerably depending on minor code
2194
// changes elsewhere as some of the tight loops are particularly prone to
2195
// speed changes when the instruction blocks are split over a 32-byte
2196
// boundary.  To protect against this, we explicitly specify an alignment
2197
// for this function.  If this is insufficient, we may also wish to
2198
// consider alignment of blocks within this function via
2199
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2200
// However it's not very portable.
2201
// Instead we break into separate functions so we can explicitly specify
2202
// use __attribute__((aligned(32))) instead and force consistent loop
2203
// alignment.
2204
7.38k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2205
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2206
7.38k
    if (*n > INT32_MAX*0.666) {
2207
0
        errno = ENOMEM;
2208
0
        return -1;
2209
0
    }
2210
2211
7.38k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2212
7.38k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2213
0
        hts_log_error("Out of memory");
2214
0
        return -1;
2215
0
    }
2216
2217
7.38k
    (*n)+=*n>>1;
2218
7.38k
    return 0;
2219
7.38k
}
2220
2221
2222
// This ensures that q always ends up at the next comma after
2223
// reading a number even if it's followed by junk.  It
2224
// prevents the possibility of trying to read more than n items.
2225
1.20M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2226
2227
HTS_ALIGN32
2228
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2229
1.95k
                               uint32_t *nalloc, int *overflow) {
2230
109k
    while (*q == ',') {
2231
107k
        if ((*nused)++ >= (*nalloc)) {
2232
1.14k
            if (grow_B_array(b, nalloc, 1) < 0)
2233
0
                return NULL;
2234
1.14k
        }
2235
107k
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2236
107k
        b->l_data++;
2237
107k
    }
2238
1.95k
    return q;
2239
1.95k
}
2240
2241
HTS_ALIGN32
2242
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2243
2.90k
                               uint32_t *nalloc, int *overflow) {
2244
986k
    while (*q == ',') {
2245
983k
        if ((*nused)++ >= (*nalloc)) {
2246
3.97k
            if (grow_B_array(b, nalloc, 1) < 0)
2247
0
                return NULL;
2248
3.97k
        }
2249
983k
        if (q[1] != '-') {
2250
956k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2251
956k
            b->l_data++;
2252
956k
        } else {
2253
27.0k
            *overflow = 1;
2254
27.0k
            q++;
2255
27.0k
            skip_to_comma_(q);
2256
27.0k
        }
2257
983k
    }
2258
2.90k
    return q;
2259
2.90k
}
2260
2261
HTS_ALIGN32
2262
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2263
9.80k
                               uint32_t *nalloc, int *overflow) {
2264
30.2k
    while (*q == ',') {
2265
20.4k
        if ((*nused)++ >= (*nalloc)) {
2266
1.42k
            if (grow_B_array(b, nalloc, 2) < 0)
2267
0
                return NULL;
2268
1.42k
        }
2269
20.4k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2270
20.4k
                  b->data + b->l_data);
2271
20.4k
        b->l_data += 2;
2272
20.4k
    }
2273
9.80k
    return q;
2274
9.80k
}
2275
2276
HTS_ALIGN32
2277
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2278
4.70k
                               uint32_t *nalloc, int *overflow) {
2279
17.6k
    while (*q == ',') {
2280
12.9k
        if ((*nused)++ >= (*nalloc)) {
2281
21
            if (grow_B_array(b, nalloc, 2) < 0)
2282
0
                return NULL;
2283
21
        }
2284
12.9k
        if (q[1] != '-') {
2285
12.7k
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2286
12.7k
                      b->data + b->l_data);
2287
12.7k
            b->l_data += 2;
2288
12.7k
        } else {
2289
192
            *overflow = 1;
2290
192
            q++;
2291
192
            skip_to_comma_(q);
2292
192
        }
2293
12.9k
    }
2294
4.70k
    return q;
2295
4.70k
}
2296
2297
HTS_ALIGN32
2298
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2299
1.21k
                               uint32_t *nalloc, int *overflow) {
2300
544k
    while (*q == ',') {
2301
543k
        if ((*nused)++ >= (*nalloc)) {
2302
417
            if (grow_B_array(b, nalloc, 4) < 0)
2303
0
                return NULL;
2304
417
        }
2305
543k
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2306
543k
                  b->data + b->l_data);
2307
543k
        b->l_data += 4;
2308
543k
    }
2309
1.21k
    return q;
2310
1.21k
}
2311
2312
HTS_ALIGN32
2313
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2314
3.73k
                               uint32_t *nalloc, int *overflow) {
2315
9.42k
    while (*q == ',') {
2316
5.68k
        if ((*nused)++ >= (*nalloc)) {
2317
60
            if (grow_B_array(b, nalloc, 4) < 0)
2318
0
                return NULL;
2319
60
        }
2320
5.68k
        if (q[1] != '-') {
2321
5.58k
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2322
5.58k
                      b->data + b->l_data);
2323
5.58k
            b->l_data += 4;
2324
5.58k
        } else {
2325
105
            *overflow = 1;
2326
105
            q++;
2327
105
            skip_to_comma_(q);
2328
105
        }
2329
5.68k
    }
2330
3.73k
    return q;
2331
3.73k
}
2332
2333
HTS_ALIGN32
2334
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2335
1.89k
                               uint32_t *nalloc, int *overflow) {
2336
9.17k
    while (*q == ',') {
2337
7.27k
        if ((*nused)++ >= (*nalloc)) {
2338
342
            if (grow_B_array(b, nalloc, 4) < 0)
2339
0
                return NULL;
2340
342
        }
2341
7.27k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2342
7.27k
        b->l_data += 4;
2343
7.27k
    }
2344
1.89k
    return q;
2345
1.89k
}
2346
2347
HTS_ALIGN32
2348
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2349
                              char **end, bam1_t *b,
2350
26.5k
                              int *ctr) {
2351
    // Protect against infinite recursion when dealing with invalid input.
2352
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2353
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2354
    //
2355
    // Loop detection is the safest solution incase there are other
2356
    // strange corner cases with malformed inputs.
2357
26.5k
    if (++(*ctr) > 2) {
2358
75
        hts_log_error("Malformed data in B:%c array", type);
2359
75
        return -1;
2360
75
    }
2361
2362
26.5k
    int orig_l = b->l_data;
2363
26.5k
    char *q = in;
2364
26.5k
    int32_t size;
2365
26.5k
    size_t bytes;
2366
26.5k
    int overflow = 0;
2367
2368
26.5k
    size = aux_type2size(type);
2369
26.5k
    if (size <= 0 || size > 4) {
2370
23
        hts_log_error("Unrecognized type B:%c", type);
2371
23
        return -1;
2372
23
    }
2373
2374
    // Ensure space for type + values.
2375
    // The first pass through here we don't know the number of entries and
2376
    // nalloc == 0.  We start with a small working set and then parse the
2377
    // data, growing as needed.
2378
    //
2379
    // If we have a second pass through we do know the number of entries
2380
    // and nalloc is already known.  We have no need to expand the bam data.
2381
26.4k
    if (!nalloc)
2382
22.7k
         nalloc=7;
2383
2384
    // Ensure allocated memory is big enough (for current nalloc estimate)
2385
26.4k
    bytes = (size_t) nalloc * (size_t) size;
2386
26.4k
    if (bytes / size != nalloc
2387
26.4k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2388
0
        hts_log_error("Out of memory");
2389
0
        return -1;
2390
0
    }
2391
2392
26.4k
    uint32_t nused = 0;
2393
2394
26.4k
    b->data[b->l_data++] = 'B';
2395
26.4k
    b->data[b->l_data++] = type;
2396
    // 32-bit B-array length is inserted later once we know it.
2397
26.4k
    int b_len_idx = b->l_data;
2398
26.4k
    b->l_data += sizeof(uint32_t);
2399
2400
26.4k
    if (type == 'c') {
2401
1.95k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2402
0
            return -1;
2403
24.5k
    } else if (type == 'C') {
2404
2.90k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2405
0
            return -1;
2406
21.6k
    } else if (type == 's') {
2407
9.80k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2408
0
            return -1;
2409
11.8k
    } else if (type == 'S') {
2410
4.70k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2411
0
            return -1;
2412
7.12k
    } else if (type == 'i') {
2413
1.21k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2414
0
            return -1;
2415
5.90k
    } else if (type == 'I') {
2416
3.73k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2417
0
            return -1;
2418
3.73k
    } else if (type == 'f') {
2419
1.89k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2420
0
            return -1;
2421
1.89k
    }
2422
26.4k
    if (*q != '\t' && *q != '\0') {
2423
        // Unknown B array type or junk in the numbers
2424
152
        hts_log_error("Malformed B:%c", type);
2425
152
        return -1;
2426
152
    }
2427
26.3k
    i32_to_le(nused, b->data + b_len_idx);
2428
2429
26.3k
    if (!overflow) {
2430
22.1k
        *end = q;
2431
22.1k
        return 0;
2432
22.1k
    } else {
2433
4.21k
        int64_t max = 0, min = 0, val;
2434
        // Given type was incorrect.  Try to rescue the situation.
2435
4.21k
        char *r = q;
2436
4.21k
        q = in;
2437
4.21k
        overflow = 0;
2438
4.21k
        b->l_data = orig_l;
2439
        // Find out what range of values is present
2440
1.02M
        while (q < r) {
2441
1.02M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2442
1.02M
            if (max < val) max = val;
2443
1.02M
            if (min > val) min = val;
2444
1.02M
            skip_to_comma_(q);
2445
1.02M
        }
2446
        // Retry with appropriate type
2447
4.21k
        if (!overflow) {
2448
4.10k
            if (min < 0) {
2449
2.20k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2450
156
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2451
2.04k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2452
792
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2453
1.25k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2454
1.05k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2455
1.05k
                }
2456
2.20k
            } else {
2457
1.90k
                if (max < UINT8_MAX) {
2458
258
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2459
1.64k
                } else if (max <= UINT16_MAX) {
2460
524
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2461
1.12k
                } else if (max <= UINT32_MAX) {
2462
1.01k
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2463
1.01k
                }
2464
1.90k
            }
2465
4.10k
        }
2466
        // If here then at least one of the values is too big to store
2467
408
        hts_log_error("Numeric value in B array out of allowed range");
2468
408
        return -1;
2469
4.21k
    }
2470
26.3k
#undef skip_to_comma_
2471
26.3k
}
2472
2473
HTS_ALIGN32
2474
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2475
22.7k
{
2476
22.7k
    int ctr = 0;
2477
22.7k
    uint32_t nalloc = 0;
2478
22.7k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2479
22.7k
}
2480
2481
8.06k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2482
8.06k
    if (*v >= '1' && *v <= '9') {
2483
253
        return hts_str2uint(v, rv, 16, overflow);
2484
253
    }
2485
7.80k
    else if (*v == '0') {
2486
        // handle single-digit "0" directly; otherwise it's hex or octal
2487
832
        if (v[1] == '\t') { *rv = v+1; return 0; }
2488
74
        else {
2489
74
            unsigned long val = strtoul(v, rv, 0);
2490
74
            if (val > 65535) { *overflow = 1; return 65535; }
2491
15
            return val;
2492
74
        }
2493
832
    }
2494
6.97k
    else {
2495
        // TODO implement symbolic flag letters
2496
6.97k
        *rv = v;
2497
6.97k
        return 0;
2498
6.97k
    }
2499
8.06k
}
2500
2501
// Parse tag line and append to bam object b.
2502
// Shared by both SAM and FASTQ parsers.
2503
//
2504
// The difference between the two is how lenient we are to recognising
2505
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2506
// non-SAM looking strings.
2507
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2508
7.04k
                            khash_t(tag) *tag_whitelist) {
2509
7.04k
    int overflow = 0;
2510
7.04k
    int checkpoint;
2511
7.04k
    char logbuf[40];
2512
7.04k
    char *q = start, *p = end;
2513
2514
7.04k
#define _parse_err(cond, ...)                   \
2515
3.75M
    do {                                        \
2516
7.58M
        if (cond) {                             \
2517
239
            if (lenient) {                      \
2518
0
                while (q < p && !isspace_c(*q))   \
2519
0
                    q++;                        \
2520
0
                while (q < p && isspace_c(*q))    \
2521
0
                    q++;                        \
2522
0
                b->l_data = checkpoint;         \
2523
0
                goto loop;                      \
2524
239
            } else {                            \
2525
239
                hts_log_error(__VA_ARGS__);     \
2526
239
                goto err_ret;                   \
2527
239
            }                                   \
2528
239
        }                                       \
2529
3.75M
    } while (0)
2530
2531
3.72M
    while (q < p) loop: {
2532
3.72M
        char type;
2533
3.72M
        checkpoint = b->l_data;
2534
3.72M
        if (p - q < 5) {
2535
36
            if (lenient) {
2536
0
                break;
2537
36
            } else {
2538
36
                hts_log_error("Incomplete aux field");
2539
36
                goto err_ret;
2540
36
            }
2541
36
        }
2542
1.86M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2543
2544
1.86M
        if (lenient && (q[2] | q[4]) != ':') {
2545
0
            while (q < p && !isspace_c(*q))
2546
0
                q++;
2547
0
            while (q < p && isspace_c(*q))
2548
0
                q++;
2549
0
            continue;
2550
0
        }
2551
2552
1.86M
        if (tag_whitelist) {
2553
0
            int tt = q[0]*256 + q[1];
2554
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2555
0
                while (q < p && *q != '\t')
2556
0
                    q++;
2557
0
                continue;
2558
0
            }
2559
0
        }
2560
2561
        // Copy over id
2562
1.86M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2563
1.86M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2564
1.86M
        q += 3; type = *q++; ++q; // q points to value
2565
1.86M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2566
1.85M
            _parse_err(*q <= '\t', "incomplete aux field");
2567
2568
        // Ensure enough space for a double + type allocated.
2569
1.86M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2570
2571
1.86M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2572
784k
            b->data[b->l_data++] = 'A';
2573
784k
            b->data[b->l_data++] = *q++;
2574
1.07M
        } else if (type == 'i' || type == 'I') {
2575
991k
            if (*q == '-') {
2576
829k
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2577
829k
                if (x >= INT8_MIN) {
2578
393k
                    b->data[b->l_data++] = 'c';
2579
393k
                    b->data[b->l_data++] = x;
2580
436k
                } else if (x >= INT16_MIN) {
2581
139k
                    b->data[b->l_data++] = 's';
2582
139k
                    i16_to_le(x, b->data + b->l_data);
2583
139k
                    b->l_data += 2;
2584
297k
                } else {
2585
297k
                    b->data[b->l_data++] = 'i';
2586
297k
                    i32_to_le(x, b->data + b->l_data);
2587
297k
                    b->l_data += 4;
2588
297k
                }
2589
829k
            } else {
2590
162k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2591
162k
                if (x <= UINT8_MAX) {
2592
86.3k
                    b->data[b->l_data++] = 'C';
2593
86.3k
                    b->data[b->l_data++] = x;
2594
86.3k
                } else if (x <= UINT16_MAX) {
2595
74.4k
                    b->data[b->l_data++] = 'S';
2596
74.4k
                    u16_to_le(x, b->data + b->l_data);
2597
74.4k
                    b->l_data += 2;
2598
74.4k
                } else {
2599
1.42k
                    b->data[b->l_data++] = 'I';
2600
1.42k
                    u32_to_le(x, b->data + b->l_data);
2601
1.42k
                    b->l_data += 4;
2602
1.42k
                }
2603
162k
            }
2604
991k
        } else if (type == 'f') {
2605
12.6k
            b->data[b->l_data++] = 'f';
2606
12.6k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2607
12.6k
            b->l_data += sizeof(float);
2608
72.9k
        } else if (type == 'd') {
2609
40.9k
            b->data[b->l_data++] = 'd';
2610
40.9k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2611
40.9k
            b->l_data += sizeof(double);
2612
40.9k
        } else if (type == 'Z' || type == 'H') {
2613
9.20k
            char *end = strchr(q, '\t');
2614
9.20k
            if (!end) end = q + strlen(q);
2615
9.20k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2616
9.20k
                       "hex field does not have an even number of digits");
2617
9.19k
            b->data[b->l_data++] = type;
2618
9.19k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2619
9.19k
            memcpy(b->data + b->l_data, q, end - q);
2620
9.19k
            b->l_data += end - q;
2621
9.19k
            b->data[b->l_data++] = '\0';
2622
9.19k
            q = end;
2623
22.8k
        } else if (type == 'B') {
2624
22.7k
            type = *q++; // q points to the first ',' following the typing byte
2625
22.7k
            _parse_err(*q && *q != ',' && *q != '\t',
2626
22.7k
                       "B aux field type not followed by ','");
2627
2628
22.7k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2629
658
                goto err_ret;
2630
22.7k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2631
2632
11.9M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2633
1.86M
        q++;
2634
1.86M
    }
2635
2636
6.18k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2637
6.11k
#undef _parse_err
2638
2639
6.11k
    return 0;
2640
2641
933
err_ret:
2642
933
    return -2;
2643
6.18k
}
2644
2645
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2646
8.20k
{
2647
35.7k
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2648
2649
8.20k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2650
2651
// Macro that operates on 64-bits at a time.
2652
8.20k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2653
8.20k
    do {                                                        \
2654
6.38k
        uint64_u *from8 = (uint64_u *)(from);                   \
2655
6.38k
        uint64_u *to8 = (uint64_u *)(to);                       \
2656
6.38k
        uint64_t uflow = 0;                                     \
2657
6.38k
        size_t l8 = (l)>>3, i;                                  \
2658
6.67k
        for (i = 0; i < l8; i++) {                              \
2659
285
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2660
285
            uflow |= to8[i];                                    \
2661
285
        }                                                       \
2662
6.87k
        for (i<<=3; i < (l); ++i) {                             \
2663
484
            to[i] = from[i] - (n);                              \
2664
484
            uflow |= to[i];                                     \
2665
484
        }                                                       \
2666
6.38k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2667
6.38k
    } while (0)
2668
2669
#else
2670
2671
// Basic version which operates a byte at a time
2672
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2673
        uint8_t uflow = 0;                                   \
2674
        for (i = 0; i < (l); ++i) {                          \
2675
            (to)[i] = (from)[i] - (n);                       \
2676
            uflow |= (uint8_t) (to)[i];                      \
2677
        }                                                    \
2678
        failed = (uflow & 0x80) > 0;                         \
2679
    } while (0)
2680
2681
#endif
2682
2683
14.1k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2684
90.7k
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2685
23.6k
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2686
2687
8.20k
    uint8_t *t;
2688
2689
8.20k
    char *p = s->s, *q;
2690
8.20k
    int i, overflow = 0;
2691
8.20k
    char logbuf[40];
2692
8.20k
    hts_pos_t cigreflen;
2693
8.20k
    bam1_core_t *c = &b->core;
2694
2695
8.20k
    b->l_data = 0;
2696
8.20k
    memset(c, 0, 32);
2697
2698
    // qname
2699
8.20k
    q = _read_token(p);
2700
2701
8.06k
    _parse_warn(p - q <= 1, "empty query name");
2702
8.06k
    _parse_err(p - q > 255, "query name too long");
2703
    // resize large enough for name + extranul
2704
8.06k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2705
8.06k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2706
2707
8.06k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2708
8.06k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2709
8.06k
    b->l_data += c->l_extranul;
2710
2711
8.06k
    c->l_qname = p - q + c->l_extranul;
2712
2713
    // flag
2714
8.06k
    c->flag = parse_sam_flag(p, &p, &overflow);
2715
8.06k
    if (*p++ != '\t') goto err_ret; // malformated flag
2716
2717
    // chr
2718
7.82k
    q = _read_token(p);
2719
7.80k
    if (strcmp(q, "*")) {
2720
2.40k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2721
2.36k
        c->tid = bam_name2id(h, q);
2722
2.36k
        _parse_err(c->tid < -1, "failed to parse header");
2723
2.36k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2724
5.39k
    } else c->tid = -1;
2725
2726
    // pos
2727
7.75k
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
2728
7.75k
    if (*p++ != '\t') goto err_ret;
2729
7.65k
    if (c->pos < 0 && c->tid >= 0) {
2730
36
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2731
36
        c->tid = -1;
2732
36
    }
2733
7.65k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2734
2735
    // mapq
2736
7.65k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2737
7.65k
    if (*p++ != '\t') goto err_ret;
2738
    // cigar
2739
7.59k
    if (*p != '*') {
2740
2.42k
        uint32_t *cigar = NULL;
2741
2.42k
        int old_l_data = b->l_data;
2742
2.42k
        int n_cigar = bam_parse_cigar(p, &p, b);
2743
2.42k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2744
2.29k
        cigar = (uint32_t *)(b->data + old_l_data);
2745
2746
        // can't use bam_endpos() directly as some fields not yet set up
2747
2.29k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2748
2.29k
        if (cigreflen == 0) cigreflen = 1;
2749
5.16k
    } else {
2750
5.16k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2751
5.16k
        c->flag |= BAM_FUNMAP;
2752
5.16k
        q = _read_token(p);
2753
5.14k
        cigreflen = 1;
2754
5.14k
    }
2755
7.44k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2756
7.44k
               "read ends beyond highest supported position");
2757
7.44k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2758
    // mate chr
2759
7.44k
    q = _read_token(p);
2760
7.40k
    if (strcmp(q, "=") == 0) {
2761
129
        c->mtid = c->tid;
2762
7.27k
    } else if (strcmp(q, "*") == 0) {
2763
21
        c->mtid = -1;
2764
7.25k
    } else {
2765
7.25k
        c->mtid = bam_name2id(h, q);
2766
7.25k
        _parse_err(c->mtid < -1, "failed to parse header");
2767
7.25k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2768
7.25k
    }
2769
    // mpos
2770
7.40k
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
2771
7.40k
    if (*p++ != '\t') goto err_ret;
2772
7.32k
    if (c->mpos < 0 && c->mtid >= 0) {
2773
758
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2774
758
        c->mtid = -1;
2775
758
    }
2776
    // tlen
2777
7.32k
    c->isize = hts_str2int(p, &p, 63, &overflow);
2778
7.32k
    if (*p++ != '\t') goto err_ret;
2779
7.17k
    _parse_err(overflow, "number outside allowed range");
2780
    // seq
2781
7.14k
    q = _read_token(p);
2782
7.12k
    if (strcmp(q, "*")) {
2783
7.01k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2784
7.01k
        c->l_qseq = p - q - 1;
2785
7.01k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2786
7.01k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2787
7.00k
        i = (c->l_qseq + 1) >> 1;
2788
7.00k
        _get_mem(uint8_t, &t, b, i);
2789
2790
7.00k
        unsigned int lqs2 = c->l_qseq&~1, i;
2791
8.57k
        for (i = 0; i < lqs2; i+=2)
2792
1.56k
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2793
7.78k
        for (; i < c->l_qseq; ++i)
2794
778
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2795
7.00k
    } else c->l_qseq = 0;
2796
    // qual
2797
14.2k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2798
14.2k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2799
692
        memset(t, 0xff, c->l_qseq);
2800
692
        p += 2;
2801
6.42k
    } else {
2802
6.42k
        int failed = 0;
2803
6.42k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2804
6.42k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2805
6.42k
                   "SEQ and QUAL are of different length");
2806
6.38k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2807
6.38k
        _parse_err(failed, "invalid QUAL character");
2808
6.35k
        p += c->l_qseq + 1;
2809
6.35k
    }
2810
2811
    // aux
2812
7.04k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2813
933
        goto err_ret;
2814
2815
6.11k
    if (bam_tag2cigar(b, 1, 1) < 0)
2816
0
        return -2;
2817
6.11k
    return 0;
2818
2819
0
#undef _parse_warn
2820
0
#undef _parse_err
2821
0
#undef _get_mem
2822
0
#undef _read_token
2823
2.09k
err_ret:
2824
2.09k
    return -2;
2825
6.11k
}
2826
2827
2.42k
static uint32_t read_ncigar(const char *q) {
2828
2.42k
    uint32_t n_cigar = 0;
2829
37.9k
    for (; *q && *q != '\t'; ++q)
2830
35.4k
        if (!isdigit_c(*q)) ++n_cigar;
2831
2.42k
    if (!n_cigar) {
2832
18
        hts_log_error("No CIGAR operations");
2833
18
        return 0;
2834
18
    }
2835
2.40k
    if (n_cigar >= 2147483647) {
2836
0
        hts_log_error("Too many CIGAR operations");
2837
0
        return 0;
2838
0
    }
2839
2840
2.40k
    return n_cigar;
2841
2.40k
}
2842
2843
/*! @function
2844
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2845
 @param  in      [in]  pointer to the source string
2846
 @param  a_cigar [out]  address of the destination uint32_t buffer
2847
 @return         number of processed input characters; 0 on error
2848
 */
2849
2.40k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2850
2.40k
    int i, overflow = 0;
2851
2.40k
    const char *p = in;
2852
8.57k
    for (i = 0; i < n_cigar; i++) {
2853
6.26k
        uint32_t len;
2854
6.26k
        int op;
2855
6.26k
        char *q;
2856
6.26k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2857
6.26k
        if (q == p) {
2858
31
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2859
31
            return 0;
2860
31
        }
2861
6.23k
        if (overflow) {
2862
32
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2863
32
            return 0;
2864
32
        }
2865
6.20k
        p = q;
2866
6.20k
        op = bam_cigar_table[(unsigned char)*p++];
2867
6.20k
        if (op < 0) {
2868
32
            hts_log_error("Unrecognized CIGAR operator");
2869
32
            return 0;
2870
32
        }
2871
6.17k
        a_cigar[i] = len;
2872
6.17k
        a_cigar[i] |= op;
2873
6.17k
    }
2874
2875
2.30k
    return p-in;
2876
2.40k
}
2877
2878
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2879
0
    size_t n_cigar = 0;
2880
0
    int diff;
2881
2882
0
    if (!in || !a_cigar || !a_mem) {
2883
0
        hts_log_error("NULL pointer arguments");
2884
0
        return -1;
2885
0
    }
2886
0
    if (end) *end = (char *)in;
2887
2888
0
    if (*in == '*') {
2889
0
        if (end) (*end)++;
2890
0
        return 0;
2891
0
    }
2892
0
    n_cigar = read_ncigar(in);
2893
0
    if (!n_cigar) return 0;
2894
0
    if (n_cigar > *a_mem) {
2895
0
        uint32_t *a_tmp = hts_realloc_p(*a_cigar, sizeof(**a_cigar), n_cigar);
2896
0
        if (a_tmp) {
2897
0
            *a_cigar = a_tmp;
2898
0
            *a_mem = n_cigar;
2899
0
        } else {
2900
0
            hts_log_error("Memory allocation error");
2901
0
            return -1;
2902
0
        }
2903
0
    }
2904
2905
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2906
0
    if (end) *end = (char *)in+diff;
2907
2908
0
    return n_cigar;
2909
0
}
2910
2911
2.42k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2912
2.42k
    size_t n_cigar = 0;
2913
2.42k
    int diff;
2914
2915
2.42k
    if (!in || !b) {
2916
0
        hts_log_error("NULL pointer arguments");
2917
0
        return -1;
2918
0
    }
2919
2.42k
    if (end) *end = (char *)in;
2920
2921
2.42k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
2922
2.42k
    if (!n_cigar && b->core.n_cigar == 0) {
2923
18
        if (end) *end = (char *)in+1;
2924
18
        return 0;
2925
18
    }
2926
2927
2.40k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
2928
2.40k
    if (cig_diff > 0 &&
2929
2.40k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
2930
0
        hts_log_error("Memory allocation error");
2931
0
        return -1;
2932
0
    }
2933
2934
2.40k
    uint32_t *cig = bam_get_cigar(b);
2935
2.40k
    if ((uint8_t *)cig != b->data + b->l_data) {
2936
        // Modifying an BAM existing BAM record
2937
0
        uint8_t  *seq = bam_get_seq(b);
2938
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
2939
0
    }
2940
2941
2.40k
    if (n_cigar) {
2942
2.40k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
2943
95
            return -1;
2944
2.40k
    } else {
2945
0
        diff = 1; // handle "*"
2946
0
    }
2947
2948
2.30k
    b->l_data += cig_diff * sizeof(uint32_t);
2949
2.30k
    b->core.n_cigar = n_cigar;
2950
2.30k
    if (end) *end = (char *)in + diff;
2951
2952
2.30k
    return n_cigar;
2953
2.40k
}
2954
2955
/*
2956
 * -----------------------------------------------------------------------------
2957
 * SAM threading
2958
 */
2959
// Size of SAM text block (reading)
2960
0
#define SAM_NBYTES 240000
2961
2962
// Number of BAM records (writing, up to NB_mem in size)
2963
0
#define SAM_NBAM 1000
2964
2965
struct SAM_state;
2966
2967
// Output job - a block of BAM records
2968
typedef struct sp_bams {
2969
    struct sp_bams *next;
2970
    int serial;
2971
2972
    bam1_t *bams;
2973
    int nbams, abams; // used and alloc for bams[] array
2974
    size_t bam_mem;   // very approximate total size
2975
2976
    struct SAM_state *fd;
2977
} sp_bams;
2978
2979
// Input job - a block of SAM text
2980
typedef struct sp_lines {
2981
    struct sp_lines *next;
2982
    int serial;
2983
2984
    char *data;
2985
    int data_size;
2986
    int alloc;
2987
2988
    struct SAM_state *fd;
2989
    sp_bams *bams;
2990
} sp_lines;
2991
2992
enum sam_cmd {
2993
    SAM_NONE = 0,
2994
    SAM_CLOSE,
2995
    SAM_CLOSE_DONE,
2996
    SAM_AT_EOF,
2997
};
2998
2999
typedef struct SAM_state {
3000
    sam_hdr_t *h;
3001
3002
    hts_tpool *p;
3003
    int own_pool;
3004
    pthread_mutex_t lines_m;
3005
    hts_tpool_process *q;
3006
    pthread_t dispatcher;
3007
    int dispatcher_set;
3008
3009
    sp_lines *lines;
3010
    sp_bams *bams;
3011
3012
    sp_bams *curr_bam;
3013
    int curr_idx;
3014
    int serial;
3015
3016
    // Be warned: moving these mutexes around in this struct can reduce
3017
    // threading performance by up to 70%!
3018
    pthread_mutex_t command_m;
3019
    pthread_cond_t command_c;
3020
    enum sam_cmd command;
3021
3022
    // One of the E* errno codes
3023
    int errcode;
3024
3025
    htsFile *fp;
3026
} SAM_state;
3027
3028
// Returns a SAM_state struct from a generic hFILE.
3029
//
3030
// Returns NULL on failure.
3031
0
static SAM_state *sam_state_create(htsFile *fp) {
3032
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3033
    // be a redirect call with an additional 'S' mode.  This in turn would
3034
    // correctly set the designed format to sam instead of a generic
3035
    // text_format.
3036
0
    if (fp->format.format != sam && fp->format.format != text_format)
3037
0
        return NULL;
3038
3039
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3040
0
    if (!fd)
3041
0
        return NULL;
3042
3043
0
    fp->state = fd;
3044
0
    fd->fp = fp;
3045
3046
0
    return fd;
3047
0
}
3048
3049
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3050
static void *sam_format_worker(void *arg);
3051
3052
0
static void sam_state_err(SAM_state *fd, int errcode) {
3053
0
    pthread_mutex_lock(&fd->command_m);
3054
0
    if (!fd->errcode)
3055
0
        fd->errcode = errcode;
3056
0
    pthread_mutex_unlock(&fd->command_m);
3057
0
}
3058
3059
0
static void sam_free_sp_bams(sp_bams *b) {
3060
0
    if (!b)
3061
0
        return;
3062
3063
0
    if (b->bams) {
3064
0
        int i;
3065
0
        for (i = 0; i < b->abams; i++) {
3066
0
            if (b->bams[i].data)
3067
0
                free(b->bams[i].data);
3068
0
        }
3069
0
        free(b->bams);
3070
0
    }
3071
0
    free(b);
3072
0
}
3073
3074
// Destroys the state produce by sam_state_create.
3075
24.0k
int sam_state_destroy(htsFile *fp) {
3076
24.0k
    int ret = 0;
3077
3078
24.0k
    if (!fp->state)
3079
24.0k
        return 0;
3080
3081
0
    SAM_state *fd = fp->state;
3082
0
    if (fd->p) {
3083
0
        if (fd->h) {
3084
            // Notify sam_dispatcher we're closing
3085
0
            pthread_mutex_lock(&fd->command_m);
3086
0
            if (fd->command != SAM_CLOSE_DONE)
3087
0
                fd->command = SAM_CLOSE;
3088
0
            pthread_cond_signal(&fd->command_c);
3089
0
            ret = -fd->errcode;
3090
0
            if (fd->q)
3091
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3092
3093
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3094
0
                for (;;) {
3095
                    // Avoid deadlocks with dispatcher
3096
0
                    if (fd->command == SAM_CLOSE_DONE)
3097
0
                        break;
3098
0
                    hts_tpool_wake_dispatch(fd->q);
3099
0
                    pthread_mutex_unlock(&fd->command_m);
3100
0
                    hts_usleep(10000);
3101
0
                    pthread_mutex_lock(&fd->command_m);
3102
0
                }
3103
0
            }
3104
0
            pthread_mutex_unlock(&fd->command_m);
3105
3106
0
            if (fp->is_write) {
3107
                // Dispatch the last partial block.
3108
0
                sp_bams *gb = fd->curr_bam;
3109
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3110
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3111
3112
                // Flush and drain output
3113
0
                if (fd->q)
3114
0
                    hts_tpool_process_flush(fd->q);
3115
0
                pthread_mutex_lock(&fd->command_m);
3116
0
                if (!ret) ret = -fd->errcode;
3117
0
                pthread_mutex_unlock(&fd->command_m);
3118
3119
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3120
0
                    hts_usleep(10000);
3121
0
                    pthread_mutex_lock(&fd->command_m);
3122
0
                    ret = -fd->errcode;
3123
                    // not empty but shutdown implies error
3124
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3125
0
                        ret = EIO;
3126
0
                    pthread_mutex_unlock(&fd->command_m);
3127
0
                }
3128
0
                if (fd->q)
3129
0
                    hts_tpool_process_shutdown(fd->q);
3130
0
            }
3131
3132
            // Wait for it to acknowledge
3133
0
            if (fd->dispatcher_set)
3134
0
                pthread_join(fd->dispatcher, NULL);
3135
0
            if (!ret) ret = -fd->errcode;
3136
0
        }
3137
3138
        // Tidy up memory
3139
0
        if (fd->q)
3140
0
            hts_tpool_process_destroy(fd->q);
3141
3142
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3143
0
            hts_tpool_destroy(fd->p);
3144
0
            fd->p = NULL;
3145
0
        }
3146
0
        pthread_mutex_destroy(&fd->lines_m);
3147
0
        pthread_mutex_destroy(&fd->command_m);
3148
0
        pthread_cond_destroy(&fd->command_c);
3149
3150
0
        sp_lines *l = fd->lines;
3151
0
        while (l) {
3152
0
            sp_lines *n = l->next;
3153
0
            free(l->data);
3154
0
            free(l);
3155
0
            l = n;
3156
0
        }
3157
3158
0
        sp_bams *b = fd->bams;
3159
0
        while (b) {
3160
0
            if (fd->curr_bam == b)
3161
0
                fd->curr_bam = NULL;
3162
0
            sp_bams *n = b->next;
3163
0
            sam_free_sp_bams(b);
3164
0
            b = n;
3165
0
        }
3166
3167
0
        if (fd->curr_bam)
3168
0
            sam_free_sp_bams(fd->curr_bam);
3169
3170
        // Decrement counter by one, maybe destroying too.
3171
        // This is to permit the caller using bam_hdr_destroy
3172
        // before sam_close without triggering decode errors
3173
        // in the background threads.
3174
0
        bam_hdr_destroy(fd->h);
3175
0
    }
3176
3177
0
    free(fp->state);
3178
0
    fp->state = NULL;
3179
0
    return ret;
3180
24.0k
}
3181
3182
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3183
0
static void cleanup_sp_lines(void *arg) {
3184
0
    sp_lines *gl = (sp_lines *)arg;
3185
0
    if (!gl) return;
3186
3187
    // Should always be true for lines passed to / from thread workers.
3188
0
    assert(gl->next == NULL);
3189
3190
0
    free(gl->data);
3191
0
    sam_free_sp_bams(gl->bams);
3192
0
    free(gl);
3193
0
}
3194
3195
// Run from one of the worker threads.
3196
// Convert a passed in array of lines to array of BAMs, returning
3197
// the result back to the thread queue.
3198
0
static void *sam_parse_worker(void *arg) {
3199
0
    sp_lines *gl = (sp_lines *)arg;
3200
0
    sp_bams *gb = NULL;
3201
0
    char *lines = gl->data;
3202
0
    int i;
3203
0
    bam1_t *b;
3204
0
    SAM_state *fd = gl->fd;
3205
3206
    // Use a block of BAM structs we had earlier if available.
3207
0
    pthread_mutex_lock(&fd->lines_m);
3208
0
    if (fd->bams) {
3209
0
        gb = fd->bams;
3210
0
        fd->bams = gb->next;
3211
0
    }
3212
0
    pthread_mutex_unlock(&fd->lines_m);
3213
3214
0
    if (gb == NULL) {
3215
0
        gb = calloc(1, sizeof(*gb));
3216
0
        if (!gb) {
3217
0
            return NULL;
3218
0
        }
3219
0
        gb->abams = 100;
3220
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3221
0
        if (!gb->bams) {
3222
0
            sam_state_err(fd, ENOMEM);
3223
0
            goto err;
3224
0
        }
3225
0
        gb->nbams = 0;
3226
0
        gb->bam_mem = 0;
3227
0
    }
3228
0
    gb->serial = gl->serial;
3229
0
    gb->next = NULL;
3230
3231
0
    b = (bam1_t *)gb->bams;
3232
0
    if (!b) {
3233
0
        sam_state_err(fd, ENOMEM);
3234
0
        goto err;
3235
0
    }
3236
3237
0
    i = 0;
3238
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3239
0
    while (cp < cp_end) {
3240
0
        if (i >= gb->abams) {
3241
0
            int old_abams = gb->abams;
3242
0
            gb->abams *= 2;
3243
0
            b = hts_realloc_p(gb->bams, sizeof(bam1_t), gb->abams);
3244
0
            if (!b) {
3245
0
                gb->abams /= 2;
3246
0
                sam_state_err(fd, ENOMEM);
3247
0
                goto err;
3248
0
            }
3249
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3250
0
            gb->bams = b;
3251
0
        }
3252
3253
        // Ideally we'd get sam_parse1 to return the number of
3254
        // bytes decoded and to be able to stop on newline as
3255
        // well as \0.
3256
        //
3257
        // We can then avoid the additional strchr loop.
3258
        // It's around 6% of our CPU cost, albeit threadable.
3259
        //
3260
        // However this is an API change so for now we copy.
3261
3262
0
        char *nl = strchr(cp, '\n');
3263
0
        char *line_end;
3264
0
        if (nl) {
3265
0
            line_end = nl;
3266
0
            if (line_end > cp && *(line_end - 1) == '\r')
3267
0
                line_end--;
3268
0
            nl++;
3269
0
        } else {
3270
0
            nl = line_end = cp_end;
3271
0
        }
3272
0
        *line_end = '\0';
3273
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3274
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3275
0
            sam_state_err(fd, errno ? errno : EIO);
3276
0
            cleanup_sp_lines(gl);
3277
0
            goto err;
3278
0
        }
3279
3280
0
        cp = nl;
3281
0
        i++;
3282
0
    }
3283
0
    gb->nbams = i;
3284
3285
0
    pthread_mutex_lock(&fd->lines_m);
3286
0
    gl->next = fd->lines;
3287
0
    fd->lines = gl;
3288
0
    pthread_mutex_unlock(&fd->lines_m);
3289
0
    return gb;
3290
3291
0
 err:
3292
0
    sam_free_sp_bams(gb);
3293
0
    return NULL;
3294
0
}
3295
3296
0
static void *sam_parse_eof(void *arg) {
3297
0
    return NULL;
3298
0
}
3299
3300
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3301
0
static void cleanup_sp_bams(void *arg) {
3302
0
    sam_free_sp_bams((sp_bams *) arg);
3303
0
}
3304
3305
// Runs in its own thread.
3306
// Reads a block of text (SAM) and sends a new job to the thread queue to
3307
// translate this to BAM.
3308
0
static void *sam_dispatcher_read(void *vp) {
3309
0
    htsFile *fp = vp;
3310
0
    kstring_t line = {0};
3311
0
    int line_frag = 0;
3312
0
    SAM_state *fd = fp->state;
3313
0
    sp_lines *l = NULL;
3314
3315
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3316
    // matter as it will grow if necessary).
3317
0
    if (ks_resize(&line, 1000) < 0)
3318
0
        goto err;
3319
3320
0
    for (;;) {
3321
        // Check for command
3322
0
        pthread_mutex_lock(&fd->command_m);
3323
0
        switch (fd->command) {
3324
3325
0
        case SAM_CLOSE:
3326
0
            pthread_cond_signal(&fd->command_c);
3327
0
            pthread_mutex_unlock(&fd->command_m);
3328
0
            hts_tpool_process_shutdown(fd->q);
3329
0
            goto tidyup;
3330
3331
0
        default:
3332
0
            break;
3333
0
        }
3334
0
        pthread_mutex_unlock(&fd->command_m);
3335
3336
0
        pthread_mutex_lock(&fd->lines_m);
3337
0
        if (fd->lines) {
3338
            // reuse existing line buffer
3339
0
            l = fd->lines;
3340
0
            fd->lines = l->next;
3341
0
        }
3342
0
        pthread_mutex_unlock(&fd->lines_m);
3343
3344
0
        if (l == NULL) {
3345
            // none to reuse, to create a new one
3346
0
            l = calloc(1, sizeof(*l));
3347
0
            if (!l)
3348
0
                goto err;
3349
0
            l->alloc = SAM_NBYTES;
3350
0
            l->data = hts_malloc_ps(sizeof(*l->data), l->alloc, 8); // +8 for optimisation in sam_parse1
3351
0
            if (!l->data) {
3352
0
                free(l);
3353
0
                l = NULL;
3354
0
                goto err;
3355
0
            }
3356
0
            l->fd = fd;
3357
0
        }
3358
0
        l->next = NULL;
3359
3360
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3361
0
            char *rp = hts_realloc_ps(l->data, sizeof(*rp),
3362
0
                                      line_frag, SAM_NBYTES/2 + 8);
3363
0
            if (!rp)
3364
0
                goto err;
3365
0
            l->alloc = line_frag+SAM_NBYTES/2;
3366
0
            l->data = rp;
3367
0
        }
3368
0
        memcpy(l->data, line.s, line_frag);
3369
3370
0
        l->data_size = line_frag;
3371
0
        ssize_t nbytes;
3372
0
    longer_line:
3373
0
        if (fp->is_bgzf)
3374
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3375
0
        else
3376
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3377
0
        if (nbytes < 0) {
3378
0
            sam_state_err(fd, errno ? errno : EIO);
3379
0
            goto err;
3380
0
        } else if (nbytes == 0)
3381
0
            break; // EOF
3382
0
        l->data_size += nbytes;
3383
3384
        // trim to last \n. Maybe \r\n, but that's still fine
3385
0
        if (nbytes == l->alloc - line_frag) {
3386
0
            char *cp_end = l->data + l->data_size;
3387
0
            char *cp = cp_end-1;
3388
3389
0
            while (cp > (char *)l->data && *cp != '\n')
3390
0
                cp--;
3391
3392
            // entire buffer is part of a single line
3393
0
            if (cp == l->data) {
3394
0
                line_frag = l->data_size;
3395
0
                char *rp = hts_realloc_pse(l->data, 2, l->alloc, 0, 8);
3396
0
                if (!rp)
3397
0
                    goto err;
3398
0
                l->alloc *= 2;
3399
0
                l->data = rp;
3400
0
                assert(l->alloc >= l->data_size);
3401
0
                assert(l->alloc >= line_frag);
3402
0
                assert(l->alloc >= l->alloc - line_frag);
3403
0
                goto longer_line;
3404
0
            }
3405
0
            cp++;
3406
3407
            // line holds the remainder of our line.
3408
0
            if (ks_resize(&line, cp_end - cp) < 0)
3409
0
                goto err;
3410
0
            memcpy(line.s, cp, cp_end - cp);
3411
0
            line_frag = cp_end - cp;
3412
0
            l->data_size = l->alloc - line_frag;
3413
0
        } else {
3414
            // out of buffer
3415
0
            line_frag = 0;
3416
0
        }
3417
3418
0
        l->serial = fd->serial++;
3419
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3420
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3421
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3422
0
            goto err;
3423
0
        pthread_mutex_lock(&fd->command_m);
3424
0
        if (fd->command == SAM_CLOSE) {
3425
0
            pthread_mutex_unlock(&fd->command_m);
3426
0
            l = NULL;
3427
0
            goto tidyup;
3428
0
        }
3429
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3430
0
        pthread_mutex_unlock(&fd->command_m);
3431
0
    }
3432
3433
    // Submit a NULL sp_bams entry to act as an EOF marker
3434
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3435
0
        goto err;
3436
3437
    // At EOF, wait for close request.
3438
    // (In future if we add support for seek, this is where we need to catch it.)
3439
0
    for (;;) {
3440
0
        pthread_mutex_lock(&fd->command_m);
3441
0
        if (fd->command == SAM_NONE)
3442
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3443
0
        switch (fd->command) {
3444
0
        case SAM_CLOSE:
3445
0
            pthread_cond_signal(&fd->command_c);
3446
0
            pthread_mutex_unlock(&fd->command_m);
3447
0
            hts_tpool_process_shutdown(fd->q);
3448
0
            goto tidyup;
3449
3450
0
        default:
3451
0
            pthread_mutex_unlock(&fd->command_m);
3452
0
            break;
3453
0
        }
3454
0
    }
3455
3456
0
 tidyup:
3457
0
    pthread_mutex_lock(&fd->command_m);
3458
0
    fd->command = SAM_CLOSE_DONE;
3459
0
    pthread_cond_signal(&fd->command_c);
3460
0
    pthread_mutex_unlock(&fd->command_m);
3461
3462
0
    if (l) {
3463
0
        pthread_mutex_lock(&fd->lines_m);
3464
0
        l->next = fd->lines;
3465
0
        fd->lines = l;
3466
0
        pthread_mutex_unlock(&fd->lines_m);
3467
0
    }
3468
0
    free(line.s);
3469
3470
0
    return NULL;
3471
3472
0
 err:
3473
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3474
0
    hts_tpool_process_shutdown(fd->q);
3475
0
    goto tidyup;
3476
0
}
3477
3478
// Runs in its own thread.
3479
// Takes encoded blocks of SAM off the thread results queue and writes them
3480
// to our output stream.
3481
0
static void *sam_dispatcher_write(void *vp) {
3482
0
    htsFile *fp = vp;
3483
0
    SAM_state *fd = fp->state;
3484
0
    hts_tpool_result *r;
3485
3486
    // Iterates until result queue is shutdown, where it returns NULL.
3487
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3488
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3489
0
        if (!gl) {
3490
0
            sam_state_err(fd, ENOMEM);
3491
0
            goto err;
3492
0
        }
3493
3494
0
        if (fp->idx) {
3495
0
            sp_bams *gb = gl->bams;
3496
0
            int i = 0, count = 0;
3497
0
            while (i < gl->data_size) {
3498
0
                int j = i;
3499
0
                while (i < gl->data_size && gl->data[i] != '\n')
3500
0
                    i++;
3501
0
                if (i < gl->data_size)
3502
0
                    i++;
3503
3504
0
                if (fp->is_bgzf) {
3505
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3506
0
                        goto err;
3507
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3508
0
                        goto err;
3509
0
                } else {
3510
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3511
0
                        goto err;
3512
0
                }
3513
3514
0
                bam1_t *b = &gb->bams[count++];
3515
0
                if (fp->format.compression == bgzf) {
3516
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3517
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3518
0
                                      bgzf_tell(fp->fp.bgzf),
3519
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3520
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3521
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3522
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3523
0
                        goto err;
3524
0
                    }
3525
0
                } else {
3526
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3527
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3528
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3529
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3530
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3531
0
                        goto err;
3532
0
                    }
3533
0
                }
3534
0
            }
3535
3536
0
            assert(count == gb->nbams);
3537
3538
            // Add bam array to free-list
3539
0
            pthread_mutex_lock(&fd->lines_m);
3540
0
            gb->next = fd->bams;
3541
0
            fd->bams = gl->bams;
3542
0
            gl->bams = NULL;
3543
0
            pthread_mutex_unlock(&fd->lines_m);
3544
0
        } else {
3545
0
            if (fp->is_bgzf) {
3546
                // We keep track of how much in the current block we have
3547
                // remaining => R.  We look for the last newline in input
3548
                // [i] to [i+R], backwards => position N.
3549
                //
3550
                // If we find a newline, we write out bytes i to N.
3551
                // We know we cannot fit the next record in this bgzf block,
3552
                // so we flush what we have and copy input N to i+R into
3553
                // the start of a new block, and recompute a new R for that.
3554
                //
3555
                // If we don't find a newline (i==N) then we cannot extend
3556
                // the current block at all, so flush whatever is in it now
3557
                // if it ends on a newline.
3558
                // We still copy i(==N) to i+R to the next block and
3559
                // continue as before with a new R.
3560
                //
3561
                // The only exception on the flush is when we run out of
3562
                // data in the input.  In that case we skip it as we don't
3563
                // yet know if the next record will fit.
3564
                //
3565
                // Both conditions share the same code here:
3566
                // - Look for newline (pos N)
3567
                // - Write i to N (which maybe 0)
3568
                // - Flush if block ends on newline and not end of input
3569
                // - write N to i+R
3570
3571
0
                int i = 0;
3572
0
                BGZF *fb = fp->fp.bgzf;
3573
0
                while (i < gl->data_size) {
3574
                    // remaining space in block
3575
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3576
0
                    int eod = 0;
3577
0
                    if (R > gl->data_size-i)
3578
0
                        R = gl->data_size-i, eod = 1;
3579
3580
                    // Find last newline in input data
3581
0
                    int N = i + R;
3582
0
                    while (--N > i) {
3583
0
                        if (gl->data[N] == '\n')
3584
0
                            break;
3585
0
                    }
3586
3587
0
                    if (N != i) {
3588
                        // Found a newline
3589
0
                        N++;
3590
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3591
0
                            goto err;
3592
0
                    }
3593
3594
                    // Flush bgzf block
3595
0
                    int b_off = fb->block_offset;
3596
0
                    if (!eod && b_off &&
3597
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3598
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3599
0
                            goto err;
3600
3601
                    // Copy from N onwards into next block
3602
0
                    if (i+R > N)
3603
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3604
0
                            != i+R - N)
3605
0
                            goto err;
3606
3607
0
                    i = i+R;
3608
0
                }
3609
0
            } else {
3610
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3611
0
                    goto err;
3612
0
            }
3613
0
        }
3614
3615
0
        hts_tpool_delete_result(r, 0);
3616
3617
        // Also updated by main thread
3618
0
        pthread_mutex_lock(&fd->lines_m);
3619
0
        gl->next = fd->lines;
3620
0
        fd->lines = gl;
3621
0
        pthread_mutex_unlock(&fd->lines_m);
3622
0
    }
3623
3624
0
    sam_state_err(fd, 0); // success
3625
0
    hts_tpool_process_shutdown(fd->q);
3626
0
    return NULL;
3627
3628
0
 err:
3629
0
    sam_state_err(fd, errno ? errno : EIO);
3630
0
    return (void *)-1;
3631
0
}
3632
3633
// Run from one of the worker threads.
3634
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3635
// of text SAM records (sp_lines).
3636
0
static void *sam_format_worker(void *arg) {
3637
0
    sp_bams *gb = (sp_bams *)arg;
3638
0
    sp_lines *gl = NULL;
3639
0
    int i;
3640
0
    SAM_state *fd = gb->fd;
3641
0
    htsFile *fp = fd->fp;
3642
3643
    // Use a block of SAM strings we had earlier if available.
3644
0
    pthread_mutex_lock(&fd->lines_m);
3645
0
    if (fd->lines) {
3646
0
        gl = fd->lines;
3647
0
        fd->lines = gl->next;
3648
0
    }
3649
0
    pthread_mutex_unlock(&fd->lines_m);
3650
3651
0
    if (gl == NULL) {
3652
0
        gl = calloc(1, sizeof(*gl));
3653
0
        if (!gl) {
3654
0
            sam_state_err(fd, ENOMEM);
3655
0
            return NULL;
3656
0
        }
3657
0
        gl->alloc = gl->data_size = 0;
3658
0
        gl->data = NULL;
3659
0
    }
3660
0
    gl->serial = gb->serial;
3661
0
    gl->next = NULL;
3662
3663
0
    kstring_t ks = {0, gl->alloc, gl->data};
3664
3665
0
    for (i = 0; i < gb->nbams; i++) {
3666
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3667
0
            sam_state_err(fd, errno ? errno : EIO);
3668
0
            goto err;
3669
0
        }
3670
0
        kputc('\n', &ks);
3671
0
    }
3672
3673
0
    pthread_mutex_lock(&fd->lines_m);
3674
0
    gl->data_size = ks.l;
3675
0
    gl->alloc = ks.m;
3676
0
    gl->data = ks.s;
3677
3678
0
    if (fp->idx) {
3679
        // Keep hold of the bam array a little longer as
3680
        // sam_dispatcher_write needs to use them for building the index.
3681
0
        gl->bams = gb;
3682
0
    } else {
3683
        // Add bam array to free-list
3684
0
        gb->next = fd->bams;
3685
0
        fd->bams = gb;
3686
0
    }
3687
0
    pthread_mutex_unlock(&fd->lines_m);
3688
3689
0
    return gl;
3690
3691
0
 err:
3692
    // Possible race between this and fd->curr_bam.
3693
    // Easier to not free and leave it on the input list so it
3694
    // gets freed there instead?
3695
    // sam_free_sp_bams(gb);
3696
0
    if (gl) {
3697
0
        free(gl->data);
3698
0
        free(gl);
3699
0
    }
3700
0
    return NULL;
3701
0
}
3702
3703
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3704
0
    if (fp->state)
3705
0
        return -2;   //already exists!
3706
3707
0
    if (!(fp->state = sam_state_create(fp)))
3708
0
        return -1;
3709
0
    SAM_state *fd = (SAM_state *)fp->state;
3710
3711
0
    pthread_mutex_init(&fd->lines_m, NULL);
3712
0
    pthread_mutex_init(&fd->command_m, NULL);
3713
0
    pthread_cond_init(&fd->command_c, NULL);
3714
0
    fd->p = p->pool;
3715
0
    int qsize = p->qsize;
3716
0
    if (!qsize)
3717
0
        qsize = 2*hts_tpool_size(fd->p);
3718
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3719
0
    if (!fd->q) {
3720
0
        sam_state_destroy(fp);
3721
0
        return -1;
3722
0
    }
3723
3724
0
    if (fp->format.compression == bgzf)
3725
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3726
3727
0
    return 0;
3728
0
}
3729
3730
0
int sam_set_threads(htsFile *fp, int nthreads) {
3731
0
    if (nthreads <= 0)
3732
0
        return 0;
3733
3734
0
    htsThreadPool p;
3735
0
    p.pool = hts_tpool_init(nthreads);
3736
0
    p.qsize = nthreads*2;
3737
3738
0
    int ret = sam_set_thread_pool(fp, &p);
3739
0
    if (ret < 0) {
3740
0
        if (p.pool)
3741
0
            hts_tpool_destroy(p.pool);
3742
0
        return ret;
3743
0
    }
3744
3745
0
    SAM_state *fd = (SAM_state *)fp->state;
3746
0
    fd->own_pool = 1;
3747
3748
0
    return 0;
3749
0
}
3750
3751
0
#define UMI_TAGS 5
3752
typedef struct {
3753
    kstring_t name;
3754
    kstring_t comment; // NB: pointer into name, do not free
3755
    kstring_t seq;
3756
    kstring_t qual;
3757
    int casava;
3758
    int aux;
3759
    int rnum;
3760
    char BC[3];         // aux tag ID for barcode
3761
    char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
3762
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3763
    char nprefix;
3764
    int sra_names;
3765
    regex_t regex;
3766
} fastq_state;
3767
3768
// Initialise fastq state.
3769
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3770
4.84k
static fastq_state *fastq_state_init(int name_char) {
3771
4.84k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3772
4.84k
    if (!x)
3773
0
        return NULL;
3774
4.84k
    strcpy(x->BC, "BC");
3775
4.84k
    x->nprefix = name_char;
3776
    // Default Illumina naming convention
3777
4.84k
    char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3778
4.84k
    if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3779
0
        free(x);
3780
0
        return NULL;
3781
0
    }
3782
3783
4.84k
    return x;
3784
4.84k
}
3785
3786
6.46k
void fastq_state_destroy(htsFile *fp) {
3787
6.46k
    if (fp->state) {
3788
4.84k
        fastq_state *x = (fastq_state *)fp->state;
3789
4.84k
        if (x->tags)
3790
0
            kh_destroy(tag, x->tags);
3791
4.84k
        ks_free(&x->name);
3792
4.84k
        ks_free(&x->seq);
3793
4.84k
        ks_free(&x->qual);
3794
4.84k
        regfree(&x->regex);
3795
4.84k
        free(fp->state);
3796
4.84k
    }
3797
6.46k
}
3798
3799
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3800
0
    va_list args;
3801
3802
0
    if (!fp)
3803
0
        return -1;
3804
0
    if (!fp->state)
3805
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3806
0
                                           ? '@' : '>')))
3807
0
            return -1;
3808
3809
0
    fastq_state *x = (fastq_state *)fp->state;
3810
3811
0
    switch (opt) {
3812
0
    case FASTQ_OPT_CASAVA:
3813
0
        x->casava = 1;
3814
0
        break;
3815
3816
0
    case FASTQ_OPT_NAME2:
3817
0
        x->sra_names = 1;
3818
0
        break;
3819
3820
0
    case FASTQ_OPT_AUX: {
3821
0
        va_start(args, opt);
3822
0
        x->aux = 1;
3823
0
        char *tag = va_arg(args, char *);
3824
0
        va_end(args);
3825
0
        if (tag && strcmp(tag, "1") != 0) {
3826
0
            if (!x->tags)
3827
0
                if (!(x->tags = kh_init(tag)))
3828
0
                    return -1;
3829
3830
0
            size_t i, tlen = strlen(tag);
3831
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
3832
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
3833
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3834
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3835
0
                    break;
3836
0
                }
3837
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
3838
0
                kh_put(tag, x->tags, tcode, &ret);
3839
0
                if (ret < 0)
3840
0
                    return -1;
3841
0
            }
3842
0
        }
3843
0
        break;
3844
0
    }
3845
3846
0
    case FASTQ_OPT_BARCODE: {
3847
0
        va_start(args, opt);
3848
0
        char *bc = va_arg(args, char *);
3849
0
        va_end(args);
3850
0
        strncpy(x->BC, bc, 2);
3851
0
        x->BC[2] = 0;
3852
0
        break;
3853
0
    }
3854
3855
0
    case FASTQ_OPT_UMI: {
3856
        // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3857
0
        va_start(args, opt);
3858
0
        char *bc = va_arg(args, char *), *bc_orig = bc;
3859
0
        va_end(args);
3860
0
        if (!bc || strcmp(bc, "1") == 0)
3861
0
            bc = "RX";
3862
0
        int ntags = 0, err = 0;
3863
0
        for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3864
0
            if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3865
0
                err = 1;
3866
0
                break;
3867
0
            }
3868
3869
0
            strncpy(x->UMI[ntags], bc, 3);
3870
0
            bc += 2;
3871
0
            if (*bc && *bc != ',') {
3872
0
                err = 1;
3873
0
                break;
3874
0
            }
3875
0
            bc+=(*bc==',');
3876
0
            x->UMI[ntags][2] = 0;
3877
0
        }
3878
0
        for (; ntags < UMI_TAGS; ntags++)
3879
0
            x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3880
3881
3882
0
        if (err)
3883
0
            hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3884
3885
0
        break;
3886
0
    }
3887
3888
0
    case FASTQ_OPT_UMI_REGEX: {
3889
0
        va_start(args, opt);
3890
0
        char *re = va_arg(args, char *);
3891
0
        va_end(args);
3892
3893
0
        regfree(&x->regex);
3894
0
        if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3895
0
            hts_log_error("Regular expression '%s' is not supported", re);
3896
0
            return -1;
3897
0
        }
3898
0
        break;
3899
0
    }
3900
3901
0
    case FASTQ_OPT_RNUM:
3902
0
        x->rnum = 1;
3903
0
        break;
3904
3905
0
    default:
3906
0
        break;
3907
0
    }
3908
0
    return 0;
3909
0
}
3910
3911
16.5M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
3912
16.5M
    fastq_state *x = (fastq_state *)fp->state;
3913
16.5M
    size_t i, l;
3914
16.5M
    int ret = 0;
3915
3916
16.5M
    if (fp->format.format == fasta_format && fp->line.s) {
3917
        // For FASTA we've already read the >name line; steal it
3918
        // Not the most efficient, but we don't optimise for fasta reading.
3919
16.5M
        if (fp->line.l == 0)
3920
3.60k
            return -1; // EOF
3921
3922
16.5M
        free(x->name.s);
3923
16.5M
        x->name = fp->line;
3924
16.5M
        fp->line.l = fp->line.m = 0;
3925
16.5M
        fp->line.s = NULL;
3926
16.5M
    } else {
3927
        // Read a FASTQ format entry.
3928
5.38k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3929
5.38k
        if (ret == -1)
3930
72
            return -1;  // EOF
3931
5.31k
        else if (ret < -1)
3932
111
            return ret; // ERR
3933
5.38k
    }
3934
3935
    // Name
3936
16.5M
    if (*x->name.s != x->nprefix)
3937
66
        return -2;
3938
3939
    // Reverse the SRA strangeness of putting the run_name.number before
3940
    // the read name.
3941
16.5M
    i = 0;
3942
16.5M
    char *name = x->name.s+1;
3943
16.5M
    if (x->sra_names) {
3944
0
        char *cp = strpbrk(x->name.s, " \t");
3945
0
        if (cp) {
3946
0
            while (*cp == ' ' || *cp == '\t')
3947
0
                cp++;
3948
0
            *--cp = '@';
3949
0
            i = cp - x->name.s;
3950
0
            name = cp+1;
3951
0
        }
3952
0
    }
3953
3954
16.5M
    l = x->name.l;
3955
16.5M
    char *s = x->name.s;
3956
46.6M
    while (i < l && !isspace_c(s[i]))
3957
30.1M
        i++;
3958
16.5M
    if (i < l) {
3959
111k
        s[i] = 0;
3960
111k
        x->name.l = i++;
3961
111k
    }
3962
3963
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
3964
17.0M
    while (i < l && isspace_c(s[i]))
3965
567k
        i++;
3966
16.5M
    x->comment.s = s+i;
3967
16.5M
    x->comment.l = l - i;
3968
3969
    // Seq
3970
16.5M
    x->seq.l = 0;
3971
68.6M
    for (;;) {
3972
68.6M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3973
4.48k
            if (fp->format.format == fastq_format || ret < -1)
3974
819
                return -2;
3975
68.6M
        if (ret == -1 ||
3976
68.6M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
3977
16.5M
            break;
3978
52.1M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3979
0
            return -2;
3980
52.1M
    }
3981
3982
    // Qual
3983
16.5M
    if (fp->format.format == fastq_format) {
3984
570
        size_t remainder = x->seq.l;
3985
570
        x->qual.l = 0;
3986
58.8k
        do {
3987
58.8k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
3988
21
                return -2;
3989
58.8k
            if (fp->line.l > remainder)
3990
36
                return -2;
3991
58.7k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
3992
0
                return -2;
3993
58.7k
            remainder -= fp->line.l;
3994
58.7k
        } while (remainder > 0);
3995
3996
        // Decr qual
3997
809k
        for (i = 0; i < x->qual.l; i++)
3998
808k
            x->qual.s[i] -= '!';
3999
513
    }
4000
4001
16.5M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4002
16.5M
    if (x->name.l > 2 &&
4003
1.26M
        x->name.s[x->name.l-2] == '/' &&
4004
68.9k
        isdigit_c(x->name.s[x->name.l-1])) {
4005
67.4k
        switch(x->name.s[x->name.l-1]) {
4006
6.17k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4007
46.9k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4008
14.3k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4009
67.4k
        }
4010
67.4k
        x->name.s[x->name.l-=2] = 0;
4011
67.4k
    }
4012
4013
    // Strip Illumina formatted UMI off read-name
4014
16.5M
    char UMI_seq[256]; // maximum length in spec
4015
16.5M
    size_t UMI_len = 0;
4016
16.5M
    if (x->UMI[0][0]) {
4017
0
        regmatch_t match[3];
4018
0
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
4019
0
            && match[0].rm_so >= 0     // whole regex
4020
0
            && match[1].rm_so >= 0) {  // bracketted UMI component
4021
0
            UMI_len = match[1].rm_eo - match[1].rm_so;
4022
0
            if (UMI_len > 255) {
4023
0
                hts_log_error("SAM read name is too long");
4024
0
                return -2;
4025
0
            }
4026
4027
            // The SAMTags spec recommends (but not requires) separating
4028
            // barcodes with hyphen ('-').
4029
0
            size_t i;
4030
0
            for (i = 0; i < UMI_len; i++)
4031
0
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
4032
0
                    ? x->name.s[i+match[1].rm_so]
4033
0
                    : '-';
4034
4035
            // Move any trailing #num earlier in the name
4036
0
            if (UMI_len) {
4037
0
                UMI_seq[UMI_len++] = 0;
4038
4039
0
                x->name.l = match[1].rm_so;
4040
0
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
4041
0
                    x->name.l--; // remove colon too
4042
0
                char *cp = x->name.s + match[1].rm_eo;
4043
0
                while (*cp)
4044
0
                    x->name.s[x->name.l++] = *cp++;
4045
0
                x->name.s[x->name.l] = 0;
4046
0
            }
4047
0
        }
4048
0
    }
4049
4050
    // Convert to BAM
4051
16.5M
    ret = bam_set1(b,
4052
16.5M
                   x->name.s + x->name.l - name, name,
4053
16.5M
                   flag,
4054
16.5M
                   -1, -1, 0, // ref '*', pos, mapq,
4055
16.5M
                   0, NULL,     // no cigar,
4056
16.5M
                   -1, -1, 0,    // mate
4057
16.5M
                   x->seq.l, x->seq.s, x->qual.s,
4058
16.5M
                   0);
4059
16.5M
    if (ret < 0) return -2;
4060
4061
    // Add UMI tag if removed from read-name above
4062
16.5M
    if (UMI_len) {
4063
0
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4064
0
            ret = -2;
4065
0
    }
4066
4067
    // Identify Illumina CASAVA strings.
4068
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4069
16.5M
    char *barcode = NULL;
4070
16.5M
    int barcode_len = 0;
4071
16.5M
    kstring_t *kc = &x->comment;
4072
16.5M
    char *endptr;
4073
16.5M
    if (x->casava &&
4074
        // \d:[YN]:\d+:[ACGTN]+
4075
0
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4076
0
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4077
0
        && *endptr == ':') {
4078
4079
        // read num
4080
0
        switch(kc->s[0]) {
4081
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4082
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4083
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4084
0
        }
4085
4086
0
        if (kc->s[2] == 'Y')
4087
0
            b->core.flag |= BAM_FQCFAIL;
4088
4089
        // Barcode, maybe numeric in which case we skip it
4090
0
        if (!isdigit_c(endptr[1])) {
4091
0
            barcode = endptr+1;
4092
0
            for (i = barcode - kc->s; i < kc->l; i++)
4093
0
                if (isspace_c(kc->s[i]))
4094
0
                    break;
4095
4096
0
            kc->s[i] = 0;
4097
0
            barcode_len = i+1-(barcode - kc->s);
4098
0
        }
4099
0
    }
4100
4101
16.5M
    if (ret >= 0 && barcode_len)
4102
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4103
0
            ret = -2;
4104
4105
16.5M
    if (!x->aux)
4106
16.5M
        return ret;
4107
4108
    // Identify any SAM style aux tags in comments too.
4109
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4110
0
        ret = -2;
4111
4112
0
    return ret;
4113
16.5M
}
4114
4115
// Internal component of sam_read1 below
4116
3.05k
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4117
3.05k
    int ret = bam_read1(fp->fp.bgzf, b);
4118
3.05k
    if (h && ret >= 0) {
4119
1.85k
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4120
1.79k
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4121
124
            errno = ERANGE;
4122
124
            return -3;
4123
124
        }
4124
1.85k
    }
4125
2.92k
    return ret;
4126
3.05k
}
4127
4128
// Internal component of sam_read1 below
4129
5.44k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4130
5.44k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4131
5.44k
    if (ret < 0)
4132
5.44k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4133
4134
0
    return ret;
4135
5.44k
}
4136
4137
// Internal component of sam_read1 below
4138
10.5k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4139
10.5k
    int ret;
4140
4141
    // Consume 1st line after header parsing as it wasn't using peek
4142
10.5k
    if (fp->line.l != 0) {
4143
57
        ret = sam_parse1(&fp->line, h, b);
4144
57
        fp->line.l = 0;
4145
57
        return ret;
4146
57
    }
4147
4148
10.4k
    if (fp->state) {
4149
0
        SAM_state *fd = (SAM_state *)fp->state;
4150
4151
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4152
            // We don't support multi-threaded SAM parsing with seeks yet.
4153
0
            int ret;
4154
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4155
0
                errno = -ret;
4156
0
                return -2;
4157
0
            }
4158
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4159
0
                return -2;
4160
0
            fp->fp.bgzf->seeked = 0;
4161
0
            goto err_recover;
4162
0
        }
4163
4164
0
        if (!fd->h) {
4165
0
            fd->h = h;
4166
0
            fd->h->ref_count++;
4167
            // Ensure hrecs is initialised now as we don't want multiple
4168
            // threads trying to do this simultaneously.
4169
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4170
0
                return -2;
4171
4172
            // We can only do this once we've got a header
4173
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4174
0
                               fp) != 0)
4175
0
                return -2;
4176
0
            fd->dispatcher_set = 1;
4177
0
        }
4178
4179
0
        if (fd->h != h) {
4180
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4181
0
            return -2;
4182
0
        }
4183
4184
0
        sp_bams *gb = fd->curr_bam;
4185
0
        if (!gb) {
4186
0
            if (fd->errcode) {
4187
                // In case reader failed
4188
0
                errno = fd->errcode;
4189
0
                return -2;
4190
0
            }
4191
4192
0
            pthread_mutex_lock(&fd->command_m);
4193
0
            int cmd = fd->command;
4194
0
            pthread_mutex_unlock(&fd->command_m);
4195
0
            if (cmd == SAM_AT_EOF)
4196
0
                return -1;
4197
4198
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4199
0
            if (!r)
4200
0
                return -2;
4201
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4202
0
            hts_tpool_delete_result(r, 0);
4203
0
        }
4204
0
        if (!gb) {
4205
0
            pthread_mutex_lock(&fd->command_m);
4206
0
            fd->command = SAM_AT_EOF;
4207
0
            pthread_mutex_unlock(&fd->command_m);
4208
0
            return fd->errcode ? -2 : -1;
4209
0
        }
4210
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4211
0
        if (fd->curr_idx < gb->nbams)
4212
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4213
0
                return -2;
4214
0
        if (fd->curr_idx == gb->nbams) {
4215
0
            pthread_mutex_lock(&fd->lines_m);
4216
0
            gb->next = fd->bams;
4217
0
            fd->bams = gb;
4218
0
            pthread_mutex_unlock(&fd->lines_m);
4219
4220
0
            fd->curr_bam = NULL;
4221
0
            fd->curr_idx = 0;
4222
        // Consider prefetching next record?  I.e.
4223
        // } else {
4224
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4225
0
        }
4226
4227
0
        ret = 0;
4228
4229
10.4k
    } else  {
4230
10.4k
    err_recover:
4231
10.4k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4232
10.4k
        if (ret < 0) return ret;
4233
4234
8.14k
        ret = sam_parse1(&fp->line, h, b);
4235
8.14k
        fp->line.l = 0;
4236
8.14k
        if (ret < 0) {
4237
2.05k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4238
2.05k
            if (h && h->ignore_sam_err) goto err_recover;
4239
2.05k
        }
4240
8.14k
    }
4241
4242
8.14k
    return ret;
4243
10.4k
}
4244
4245
// Returns 0 on success,
4246
//        -1 on EOF,
4247
//       <-1 on error
4248
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4249
16.5M
{
4250
16.5M
    int ret, pass_filter;
4251
4252
16.5M
    do {
4253
16.5M
        switch (fp->format.format) {
4254
3.05k
        case bam:
4255
3.05k
            ret = sam_read1_bam(fp, h, b);
4256
3.05k
            break;
4257
4258
5.44k
        case cram:
4259
5.44k
            ret = sam_read1_cram(fp, h, &b);
4260
5.44k
            break;
4261
4262
10.5k
        case sam:
4263
10.5k
            ret = sam_read1_sam(fp, h, b);
4264
10.5k
            break;
4265
4266
16.5M
        case fasta_format:
4267
16.5M
        case fastq_format: {
4268
16.5M
            fastq_state *x = (fastq_state *)fp->state;
4269
16.5M
            if (!x) {
4270
4.84k
                if (!(fp->state = fastq_state_init(fp->format.format
4271
4.84k
                                                   == fastq_format ? '@' : '>')))
4272
0
                    return -2;
4273
4.84k
            }
4274
4275
16.5M
            return fastq_parse1(fp, b);
4276
16.5M
        }
4277
4278
0
        case empty_format:
4279
0
            errno = EPIPE;
4280
0
            return -3;
4281
4282
0
        default:
4283
0
            errno = EFTYPE;
4284
0
            return -3;
4285
16.5M
        }
4286
4287
19.0k
        pass_filter = (ret >= 0 && fp->filter)
4288
19.0k
            ? sam_passes_filter(h, b, fp->filter)
4289
19.0k
            : 1;
4290
19.0k
    } while (pass_filter == 0);
4291
4292
19.0k
    return pass_filter < 0 ? -2 : ret;
4293
16.5M
}
4294
4295
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4296
// this code isn't vectorised and runs far slower than is necessary (even
4297
// with the restrict keyword being used).
4298
static inline void HTS_OPT3
4299
221
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4300
221
    uint32_t i;
4301
270k
    for (i = 0; i < len; i++)
4302
270k
        a[i] = b[i]+33;
4303
221
}
4304
4305
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4306
5.50M
{
4307
5.50M
    int i, r = 0;
4308
5.50M
    uint8_t *s, *end;
4309
5.50M
    const bam1_core_t *c = &b->core;
4310
4311
5.50M
    if (c->l_qname == 0)
4312
0
        return -1;
4313
5.50M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4314
5.50M
    r |= kputc_('\t', str); // query name
4315
5.50M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4316
5.50M
    if (c->tid >= 0) { // chr
4317
364
        r |= kputs(h->target_name[c->tid] , str);
4318
364
        r |= kputc_('\t', str);
4319
5.50M
    } else r |= kputsn_("*\t", 2, str);
4320
5.50M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4321
5.50M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4322
5.50M
    if (c->n_cigar) { // cigar
4323
1.45k
        uint32_t *cigar = bam_get_cigar(b);
4324
7.59k
        for (i = 0; i < c->n_cigar; ++i) {
4325
6.14k
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4326
6.14k
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4327
6.14k
        }
4328
5.50M
    } else r |= kputc_('*', str);
4329
5.50M
    r |= kputc_('\t', str);
4330
5.50M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4331
125
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4332
4
    else {
4333
4
        r |= kputs(h->target_name[c->mtid], str);
4334
4
        r |= kputc_('\t', str);
4335
4
    }
4336
5.50M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4337
5.50M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4338
5.50M
    if (c->l_qseq) { // seq and qual
4339
156k
        uint8_t *s = bam_get_seq(b);
4340
156k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4341
156k
        char *cp = str->s + str->l;
4342
4343
        // Sequence, 2 bases at a time
4344
156k
        nibble2base(s, cp, c->l_qseq);
4345
156k
        cp[c->l_qseq] = '\t';
4346
156k
        cp += c->l_qseq+1;
4347
4348
        // Quality
4349
156k
        s = bam_get_qual(b);
4350
156k
        i = 0;
4351
156k
        if (s[0] == 0xff) {
4352
156k
            cp[i++] = '*';
4353
156k
        } else {
4354
221
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4355
221
            i = c->l_qseq;
4356
221
        }
4357
156k
        cp[i] = 0;
4358
156k
        cp += i;
4359
156k
        str->l = cp - str->s;
4360
5.35M
    } else r |= kputsn_("*\t*", 3, str);
4361
4362
5.50M
    s = bam_get_aux(b); // aux
4363
5.50M
    end = b->data + b->l_data;
4364
4365
6.12M
    while (end - s >= 4) {
4366
619k
        r |= kputc_('\t', str);
4367
619k
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4368
264
            goto bad_aux;
4369
619k
    }
4370
5.50M
    r |= kputsn("", 0, str); // nul terminate
4371
5.50M
    if (r < 0) goto mem_err;
4372
4373
5.50M
    return str->l;
4374
4375
264
 bad_aux:
4376
264
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4377
264
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4378
264
    errno = EINVAL;
4379
264
    return -1;
4380
4381
0
 mem_err:
4382
0
    hts_log_error("Out of memory");
4383
0
    errno = ENOMEM;
4384
0
    return -1;
4385
5.50M
}
4386
4387
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4388
5.50M
{
4389
5.50M
    str->l = 0;
4390
5.50M
    return sam_format1_append(h, b, str);
4391
5.50M
}
4392
4393
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4394
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4395
0
{
4396
0
    unsigned flag = b->core.flag;
4397
0
    int i, e = 0, len = b->core.l_qseq;
4398
0
    uint8_t *seq, *qual;
4399
4400
0
    str->l = 0;
4401
4402
    // Name
4403
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4404
0
        return -1;
4405
4406
    // UMI tag
4407
0
    if (x && *x->UMI[0]) {
4408
        // Temporary copy of '#num' if present
4409
0
        char plex[256];
4410
0
        size_t len = str->l;
4411
0
        while (len && str->s[len] != ':' && str->s[len] != '#')
4412
0
            len--;
4413
4414
0
        if (str->s[len] == '#' && str->l - len < 255) {
4415
0
            memcpy(plex, &str->s[len], str->l - len);
4416
0
            plex[str->l - len] = 0;
4417
0
            str->l = len;
4418
0
        } else {
4419
0
            *plex = 0;
4420
0
        }
4421
4422
0
        uint8_t *bc = NULL;
4423
0
        int n;
4424
0
        for (n = 0; !bc && n < UMI_TAGS; n++)
4425
0
            bc = bam_aux_get(b, x->UMI[n]);
4426
0
        if (bc && *bc == 'Z') {
4427
0
            int err = kputc(':', str) < 0;
4428
            // Replace any non-alpha with '+'
4429
0
            while (*++bc)
4430
0
                err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4431
0
            if (err)
4432
0
                return -1;
4433
0
        }
4434
4435
0
        if (*plex && kputs(plex, str) < 0)
4436
0
            return -1;
4437
0
    }
4438
4439
    // /1 or /2 suffix
4440
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4441
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4442
0
        if (r12 == BAM_FREAD1) {
4443
0
            if (kputs("/1", str) == EOF)
4444
0
                return -1;
4445
0
        } else if (r12 == BAM_FREAD2) {
4446
0
            if (kputs("/2", str) == EOF)
4447
0
                return -1;
4448
0
        }
4449
0
    }
4450
4451
    // Illumina CASAVA tag.
4452
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4453
0
    if (x && x->casava) {
4454
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4455
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4456
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4457
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4458
0
                     bc ? (char *)bc+1 : "0") < 0)
4459
0
            return -1;
4460
4461
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4462
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4463
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4464
0
            str->s[str->l-1] = '0';
4465
0
            str->s[str->l] = 0;
4466
0
            bc = NULL;
4467
0
        }
4468
4469
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4470
0
        if (bc) {
4471
0
            int l = strlen((char *)bc+1);
4472
0
            char *c = (char *)str->s + str->l - l;
4473
0
            for (i = 0; i < l; i++) {
4474
0
                if (!isalpha_c(c[i]))
4475
0
                    c[i] = '+';
4476
0
                else if (islower_c(c[i]))
4477
0
                    c[i] = toupper_c(c[i]);
4478
0
            }
4479
0
        }
4480
0
    }
4481
4482
    // Aux tags
4483
0
    if (x && x->aux) {
4484
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4485
0
        while (s && end - s >= 4) {
4486
0
            int tt = s[0]*256 + s[1];
4487
0
            if (x->tags == NULL ||
4488
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4489
0
                e |= kputc_('\t', str) < 0;
4490
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4491
0
                    return -1;
4492
0
            } else {
4493
0
                s = skip_aux(s+2, end);
4494
0
            }
4495
0
        }
4496
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4497
0
    }
4498
4499
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4500
0
    e |= kputc_('\n', str) < 0;
4501
4502
    // Seq line
4503
0
    seq = bam_get_seq(b);
4504
0
    if (flag & BAM_FREVERSE)
4505
0
        for (i = len-1; i >= 0; i--)
4506
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4507
0
    else
4508
0
        for (i = 0; i < len; i++)
4509
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4510
4511
4512
    // Qual line
4513
0
    if (x->nprefix == '@') {
4514
0
        kputsn("\n+\n", 3, str);
4515
0
        qual = bam_get_qual(b);
4516
0
        if (qual[0] == 0xff)
4517
0
            for (i = 0; i < len; i++)
4518
0
                e |= kputc_('B', str) < 0;
4519
0
        else if (flag & BAM_FREVERSE)
4520
0
            for (i = len-1; i >= 0; i--)
4521
0
                e |= kputc_(33 + qual[i], str) < 0;
4522
0
        else
4523
0
            for (i = 0; i < len; i++)
4524
0
                e |= kputc_(33 + qual[i], str) < 0;
4525
4526
0
    }
4527
0
    e |= kputc('\n', str) < 0;
4528
4529
0
    return e ? -1 : str->l;
4530
0
}
4531
4532
// Sadly we need to be able to modify the bam_hdr here so we can
4533
// reference count the structure.
4534
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4535
16.5M
{
4536
16.5M
    switch (fp->format.format) {
4537
0
    case binary_format:
4538
0
        fp->format.category = sequence_data;
4539
0
        fp->format.format = bam;
4540
        /* fall-through */
4541
5.50M
    case bam:
4542
5.50M
        return bam_write_idx1(fp, h, b);
4543
4544
5.50M
    case cram:
4545
5.50M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4546
4547
0
    case text_format:
4548
0
        fp->format.category = sequence_data;
4549
0
        fp->format.format = sam;
4550
        /* fall-through */
4551
5.50M
    case sam:
4552
5.50M
        if (fp->state) {
4553
0
            SAM_state *fd = (SAM_state *)fp->state;
4554
4555
            // Threaded output
4556
0
            if (!fd->h) {
4557
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4558
                // just data pointed to by it (which is a bit weasely still),
4559
                // but out cached pointer must be non-const as we want to
4560
                // destroy it later on and sam_hdr_destroy takes non-const.
4561
                //
4562
                // We do this because some tools do sam_hdr_destroy; sam_close
4563
                // while others do sam_close; sam_hdr_destroy.  The former is
4564
                // an issue as we need the header still when flushing.
4565
0
                fd->h = (sam_hdr_t *)h;
4566
0
                fd->h->ref_count++;
4567
4568
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4569
0
                                   fp) != 0)
4570
0
                    return -2;
4571
0
                fd->dispatcher_set = 1;
4572
0
            }
4573
4574
0
            if (fd->h != h) {
4575
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4576
0
                return -2;
4577
0
            }
4578
4579
            // Find a suitable BAM array to copy to
4580
0
            sp_bams *gb = fd->curr_bam;
4581
0
            if (!gb) {
4582
0
                pthread_mutex_lock(&fd->lines_m);
4583
0
                if (fd->bams) {
4584
0
                    fd->curr_bam = gb = fd->bams;
4585
0
                    fd->bams = gb->next;
4586
0
                    gb->next = NULL;
4587
0
                    gb->nbams = 0;
4588
0
                    gb->bam_mem = 0;
4589
0
                    pthread_mutex_unlock(&fd->lines_m);
4590
0
                } else {
4591
0
                    pthread_mutex_unlock(&fd->lines_m);
4592
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4593
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4594
0
                        free(gb);
4595
0
                        return -1;
4596
0
                    }
4597
0
                    gb->nbams = 0;
4598
0
                    gb->abams = SAM_NBAM;
4599
0
                    gb->bam_mem = 0;
4600
0
                    gb->fd = fd;
4601
0
                    fd->curr_idx = 0;
4602
0
                    fd->curr_bam = gb;
4603
0
                }
4604
0
            }
4605
4606
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4607
0
                return -2;
4608
0
            gb->bam_mem += b->l_data + sizeof(*b);
4609
4610
            // Dispatch if full
4611
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4612
0
                gb->serial = fd->serial++;
4613
0
                pthread_mutex_lock(&fd->command_m);
4614
0
                if (fd->errcode != 0) {
4615
0
                    pthread_mutex_unlock(&fd->command_m);
4616
0
                    return -fd->errcode;
4617
0
                }
4618
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4619
0
                                        cleanup_sp_bams,
4620
0
                                        cleanup_sp_lines, 0) < 0) {
4621
0
                    pthread_mutex_unlock(&fd->command_m);
4622
0
                    return -1;
4623
0
                }
4624
0
                pthread_mutex_unlock(&fd->command_m);
4625
0
                fd->curr_bam = NULL;
4626
0
            }
4627
4628
            // Dummy value as we don't know how long it really is.
4629
            // We could track file sizes via a SAM_state field, but I don't think
4630
            // it is necessary.
4631
0
            return 1;
4632
5.50M
        } else {
4633
5.50M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4634
5.50M
            kputc('\n', &fp->line);
4635
5.50M
            if (fp->is_bgzf) {
4636
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4637
0
                    return -1;
4638
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4639
5.50M
            } else {
4640
5.50M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4641
5.50M
            }
4642
4643
5.50M
            if (fp->idx) {
4644
0
                if (fp->format.compression == bgzf) {
4645
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4646
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4647
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4648
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4649
0
                        return -1;
4650
0
                    }
4651
0
                } else {
4652
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4653
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4654
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4655
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4656
0
                        return -1;
4657
0
                    }
4658
0
                }
4659
0
            }
4660
4661
5.50M
            return fp->line.l;
4662
5.50M
        }
4663
4664
4665
0
    case fasta_format:
4666
0
    case fastq_format: {
4667
0
        fastq_state *x = (fastq_state *)fp->state;
4668
0
        if (!x) {
4669
0
            if (!(fp->state = fastq_state_init(fp->format.format
4670
0
                                               == fastq_format ? '@' : '>')))
4671
0
                return -2;
4672
0
        }
4673
4674
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4675
0
            return -1;
4676
0
        if (fp->is_bgzf) {
4677
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4678
0
                return -1;
4679
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4680
0
                return -1;
4681
0
        } else {
4682
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4683
0
                return -1;
4684
0
        }
4685
0
        return fp->line.l;
4686
0
    }
4687
4688
0
    default:
4689
0
        errno = EBADF;
4690
0
        return -1;
4691
16.5M
    }
4692
16.5M
}
4693
4694
/************************
4695
 *** Auxiliary fields ***
4696
 ************************/
4697
#ifndef HTS_LITTLE_ENDIAN
4698
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4699
    int tsz = aux_type2size(type);
4700
4701
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4702
4703
    switch (tsz) {
4704
        case 'H': case 'Z': case 1:  // Trivial
4705
            memcpy(out, in, len);
4706
            break;
4707
4708
#define aux_val_to_le(type_t, store_le) do {                            \
4709
        type_t v;                                                       \
4710
        size_t i;                                                       \
4711
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4712
            memcpy(&v, in + i, sizeof(type_t));                         \
4713
            store_le(v, out);                                           \
4714
        }                                                               \
4715
    } while (0)
4716
4717
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4718
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4719
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4720
4721
#undef aux_val_to_le
4722
4723
        case 'B': { // Recurse!
4724
            uint32_t n;
4725
            if (len < 5) return -1;
4726
            memcpy(&n, in + 1, 4);
4727
            out[0] = in[0];
4728
            u32_to_le(n, out + 1);
4729
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4730
        }
4731
4732
        default: // Unknown type code
4733
            return -1;
4734
    }
4735
4736
4737
4738
    return 0;
4739
}
4740
#endif
4741
4742
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4743
0
{
4744
0
    uint32_t new_len;
4745
4746
0
    assert(b->l_data >= 0);
4747
0
    new_len = b->l_data + 3 + len;
4748
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4749
4750
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4751
4752
0
    b->data[b->l_data] = tag[0];
4753
0
    b->data[b->l_data + 1] = tag[1];
4754
0
    b->data[b->l_data + 2] = type;
4755
4756
0
#ifdef HTS_LITTLE_ENDIAN
4757
0
    memcpy(b->data + b->l_data + 3, data, len);
4758
#else
4759
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4760
        errno = EINVAL;
4761
        return -1;
4762
    }
4763
#endif
4764
4765
0
    b->l_data = new_len;
4766
4767
0
    return 0;
4768
4769
0
 nomem:
4770
0
    errno = ENOMEM;
4771
0
    return -1;
4772
0
}
4773
4774
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4775
621k
{
4776
621k
    int size;
4777
621k
    uint32_t n;
4778
621k
    if (s >= end) return end;
4779
621k
    size = aux_type2size(*s); ++s; // skip type
4780
621k
    switch (size) {
4781
3.86k
    case 'Z':
4782
4.12k
    case 'H':
4783
4.12k
        s = memchr(s, 0, end-s);
4784
4.12k
        return s ? s+1 : end;
4785
7.59k
    case 'B':
4786
7.59k
        if (end - s < 5) return NULL;
4787
7.57k
        size = aux_type2size(*s); ++s;
4788
7.57k
        n = le_to_u32(s);
4789
7.57k
        s += 4;
4790
7.57k
        if (size == 0 || end - s < size * n) return NULL;
4791
7.54k
        return s + size * n;
4792
465
    case 0:
4793
465
        return NULL;
4794
609k
    default:
4795
609k
        if (end - s < size) return NULL;
4796
609k
        return s + size;
4797
621k
    }
4798
621k
}
4799
4800
uint8_t *bam_aux_first(const bam1_t *b)
4801
5.50M
{
4802
5.50M
    uint8_t *s = bam_get_aux(b);
4803
5.50M
    uint8_t *end = b->data + b->l_data;
4804
5.50M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4805
2.53k
    return s+2;
4806
5.50M
}
4807
4808
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4809
621k
{
4810
621k
    uint8_t *end = b->data + b->l_data;
4811
621k
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4812
621k
    if (next == NULL) goto bad_aux;
4813
620k
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4814
618k
    return next+2;
4815
4816
514
 bad_aux:
4817
514
    hts_log_error("Corrupted aux data for read %s flag %d",
4818
514
                  bam_get_qname(b), b->core.flag);
4819
514
    errno = EINVAL;
4820
514
    return NULL;
4821
620k
}
4822
4823
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4824
5.50M
{
4825
5.50M
    uint8_t *s;
4826
6.13M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4827
621k
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4828
            // Check the tag value is valid and complete
4829
74
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4830
74
            if (e == NULL) goto bad_aux;
4831
65
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4832
4833
60
            return s;
4834
65
        }
4835
4836
    // errno now as set by bam_aux_first()/bam_aux_next()
4837
5.50M
    return NULL;
4838
4839
14
 bad_aux:
4840
14
    hts_log_error("Corrupted aux data for read %s flag %d",
4841
14
                  bam_get_qname(b), b->core.flag);
4842
14
    errno = EINVAL;
4843
14
    return NULL;
4844
5.50M
}
4845
4846
int bam_aux_del(bam1_t *b, uint8_t *s)
4847
0
{
4848
0
    s = bam_aux_remove(b, s);
4849
0
    return (s || errno == ENOENT)? 0 : -1;
4850
0
}
4851
4852
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4853
0
{
4854
0
    uint8_t *end = b->data + b->l_data;
4855
0
    uint8_t *next = skip_aux(s, end);
4856
0
    if (next == NULL) goto bad_aux;
4857
4858
0
    b->l_data -= next - (s-2);
4859
0
    if (next >= end) { errno = ENOENT; return NULL; }
4860
4861
0
    memmove(s-2, next, end - next);
4862
0
    return s;
4863
4864
0
 bad_aux:
4865
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4866
0
                  bam_get_qname(b), b->core.flag);
4867
0
    errno = EINVAL;
4868
0
    return NULL;
4869
0
}
4870
4871
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4872
0
{
4873
    // FIXME: This is not at all efficient!
4874
0
    size_t ln = len >= 0 ? len : strlen(data) + 1;
4875
0
    size_t old_ln = 0;
4876
0
    int need_nul = ln == 0 || data[ln - 1] != '\0';
4877
0
    int save_errno = errno;
4878
0
    int new_tag = 0;
4879
0
    uint8_t *s = bam_aux_get(b,tag), *e;
4880
4881
0
    if (s) {  // Replacing existing tag
4882
0
        char type = *s;
4883
0
        if (type != 'Z') {
4884
0
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4885
0
            errno = EINVAL;
4886
0
            return -1;
4887
0
        }
4888
0
        s++;
4889
0
        e = memchr(s, '\0', b->data + b->l_data - s);
4890
0
        old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4891
0
        s -= 3;
4892
0
    } else {
4893
0
        if (errno != ENOENT) { // Invalid aux data, give up
4894
0
            return -1;
4895
0
        } else { // Tag doesn't exist - put it on the end
4896
0
            errno = save_errno;
4897
0
            s = b->data + b->l_data;
4898
0
            new_tag = 3;
4899
0
        }
4900
0
    }
4901
4902
0
    if (old_ln < ln + need_nul + new_tag) {
4903
0
        ptrdiff_t s_offset = s - b->data;
4904
0
        if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4905
0
            return -1;
4906
0
        s = b->data + s_offset;
4907
0
    }
4908
0
    if (!new_tag) {
4909
0
        memmove(s + 3 + ln + need_nul,
4910
0
                s + 3 + old_ln,
4911
0
                b->l_data - (s + 3 - b->data) - old_ln);
4912
0
    }
4913
0
    b->l_data += new_tag + ln + need_nul - old_ln;
4914
4915
0
    s[0] = tag[0];
4916
0
    s[1] = tag[1];
4917
0
    s[2] = 'Z';
4918
0
    memmove(s+3,data,ln);
4919
0
    if (need_nul) s[3 + ln] = '\0';
4920
0
    return 0;
4921
0
}
4922
4923
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4924
0
{
4925
0
    uint32_t sz, old_sz = 0, new = 0;
4926
0
    uint8_t *s, type;
4927
4928
0
    if (val < INT32_MIN || val > UINT32_MAX) {
4929
0
        errno = EOVERFLOW;
4930
0
        return -1;
4931
0
    }
4932
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4933
0
    else if (val < INT8_MIN)   { type = 's'; sz = 2; }
4934
0
    else if (val < 0)          { type = 'c'; sz = 1; }
4935
0
    else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
4936
0
    else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4937
0
    else                       { type = 'I'; sz = 4; }
4938
4939
0
    s = bam_aux_get(b, tag);
4940
0
    if (s) {  // Tag present - how big was the old one?
4941
0
        switch (*s) {
4942
0
            case 'c': case 'C': old_sz = 1; break;
4943
0
            case 's': case 'S': old_sz = 2; break;
4944
0
            case 'i': case 'I': old_sz = 4; break;
4945
0
            default: errno = EINVAL; return -1;  // Not an integer
4946
0
        }
4947
0
    } else {
4948
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4949
0
            s = b->data + b->l_data;
4950
0
            new = 1;
4951
0
        }  else { // Invalid aux data, give up.
4952
0
            return -1;
4953
0
        }
4954
0
    }
4955
4956
0
    if (new || old_sz < sz) {
4957
        // Make room for new tag
4958
0
        ptrdiff_t s_offset = s - b->data;
4959
0
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4960
0
            return -1;
4961
0
        s =  b->data + s_offset;
4962
0
        if (new) { // Add tag id
4963
0
            *s++ = tag[0];
4964
0
            *s++ = tag[1];
4965
0
        } else {   // Shift following data so we have space
4966
0
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4967
0
        }
4968
0
    } else {
4969
        // Reuse old space.  Data value may be bigger than necessary but
4970
        // we avoid having to move everything else
4971
0
        sz = old_sz;
4972
0
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4973
0
        assert(type > 0);
4974
0
    }
4975
0
    *s++ = type;
4976
0
#ifdef HTS_LITTLE_ENDIAN
4977
0
    memcpy(s, &val, sz);
4978
#else
4979
    switch (sz) {
4980
        case 4:  u32_to_le(val, s); break;
4981
        case 2:  u16_to_le(val, s); break;
4982
        default: *s = val; break;
4983
    }
4984
#endif
4985
0
    b->l_data += (new ? 3 : 0) + sz - old_sz;
4986
0
    return 0;
4987
0
}
4988
4989
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
4990
0
{
4991
0
    uint8_t *s = bam_aux_get(b, tag);
4992
0
    int shrink = 0, new = 0;
4993
4994
0
    if (s) { // Tag present - what was it?
4995
0
        switch (*s) {
4996
0
            case 'f': break;
4997
0
            case 'd': shrink = 1; break;
4998
0
            default: errno = EINVAL; return -1;  // Not a float
4999
0
        }
5000
0
    } else {
5001
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5002
0
            new = 1;
5003
0
        }  else { // Invalid aux data, give up.
5004
0
            return -1;
5005
0
        }
5006
0
    }
5007
5008
0
    if (new) { // Ensure there's room
5009
0
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
5010
0
            return -1;
5011
0
        s = b->data + b->l_data;
5012
0
        *s++ = tag[0];
5013
0
        *s++ = tag[1];
5014
0
    } else if (shrink) { // Convert non-standard double tag to float
5015
0
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
5016
0
        b->l_data -= 4;
5017
0
    }
5018
0
    *s++ = 'f';
5019
0
    float_to_le(val, s);
5020
0
    if (new) b->l_data += 7;
5021
5022
0
    return 0;
5023
0
}
5024
5025
int bam_aux_update_array(bam1_t *b, const char tag[2],
5026
                         uint8_t type, uint32_t items, void *data)
5027
0
{
5028
0
    uint8_t *s = bam_aux_get(b, tag);
5029
0
    size_t old_sz = 0, new_sz;
5030
0
    int new = 0;
5031
5032
0
    if (s) { // Tag present
5033
0
        if (*s != 'B') { errno = EINVAL; return -1; }
5034
0
        old_sz = aux_type2size(s[1]);
5035
0
        if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
5036
0
        old_sz *= le_to_u32(s + 2);
5037
0
    } else {
5038
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5039
0
            s = b->data + b->l_data;
5040
0
            new = 1;
5041
0
        }  else { // Invalid aux data, give up.
5042
0
            return -1;
5043
0
        }
5044
0
    }
5045
5046
0
    new_sz = aux_type2size(type);
5047
0
    if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
5048
0
    if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
5049
0
    new_sz *= items;
5050
5051
0
    if (new || old_sz < new_sz) {
5052
        // Make room for new tag
5053
0
        ptrdiff_t s_offset = s - b->data;
5054
0
        if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
5055
0
            return -1;
5056
0
        s =  b->data + s_offset;
5057
0
    }
5058
0
    if (new) { // Add tag id and type
5059
0
        *s++ = tag[0];
5060
0
        *s++ = tag[1];
5061
0
        *s = 'B';
5062
0
        b->l_data += 8 + new_sz;
5063
0
    } else if (old_sz != new_sz) { // shift following data if necessary
5064
0
        memmove(s + 6 + new_sz, s + 6 + old_sz,
5065
0
                b->l_data - ((s + 6 + old_sz) - b->data));
5066
0
        b->l_data -= old_sz;
5067
0
        b->l_data += new_sz;
5068
0
    }
5069
5070
0
    s[1] = type;
5071
0
    u32_to_le(items, s + 2);
5072
0
    if (new_sz > 0) {
5073
0
#ifdef HTS_LITTLE_ENDIAN
5074
0
        memcpy(s + 6, data, new_sz);
5075
#else
5076
        return aux_to_le(type, s + 6, data, new_sz);
5077
#endif
5078
0
    }
5079
0
    return 0;
5080
0
}
5081
5082
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
5083
                                      uint32_t idx)
5084
0
{
5085
0
    switch (type) {
5086
0
        case 'c': return le_to_i8(s + idx);
5087
0
        case 'C': return s[idx];
5088
0
        case 's': return le_to_i16(s + 2 * idx);
5089
0
        case 'S': return le_to_u16(s + 2 * idx);
5090
0
        case 'i': return le_to_i32(s + 4 * idx);
5091
0
        case 'I': return le_to_u32(s + 4 * idx);
5092
0
        default:
5093
0
            errno = EINVAL;
5094
0
            return 0;
5095
0
    }
5096
0
}
5097
5098
int64_t bam_aux2i(const uint8_t *s)
5099
0
{
5100
0
    int type;
5101
0
    type = *s++;
5102
0
    return get_int_aux_val(type, s, 0);
5103
0
}
5104
5105
double bam_aux2f(const uint8_t *s)
5106
0
{
5107
0
    int type;
5108
0
    type = *s++;
5109
0
    if (type == 'd') return le_to_double(s);
5110
0
    else if (type == 'f') return le_to_float(s);
5111
0
    else return get_int_aux_val(type, s, 0);
5112
0
}
5113
5114
char bam_aux2A(const uint8_t *s)
5115
0
{
5116
0
    int type;
5117
0
    type = *s++;
5118
0
    if (type == 'A') return *(char*)s;
5119
0
    errno = EINVAL;
5120
0
    return 0;
5121
0
}
5122
5123
char *bam_aux2Z(const uint8_t *s)
5124
0
{
5125
0
    int type;
5126
0
    type = *s++;
5127
0
    if (type == 'Z' || type == 'H') return (char*)s;
5128
0
    errno = EINVAL;
5129
0
    return 0;
5130
0
}
5131
5132
uint32_t bam_auxB_len(const uint8_t *s)
5133
0
{
5134
0
    if (s[0] != 'B') {
5135
0
        errno = EINVAL;
5136
0
        return 0;
5137
0
    }
5138
0
    return le_to_u32(s + 2);
5139
0
}
5140
5141
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
5142
0
{
5143
0
    uint32_t len = bam_auxB_len(s);
5144
0
    if (idx >= len) {
5145
0
        errno = ERANGE;
5146
0
        return 0;
5147
0
    }
5148
0
    return get_int_aux_val(s[1], s + 6, idx);
5149
0
}
5150
5151
double bam_auxB2f(const uint8_t *s, uint32_t idx)
5152
0
{
5153
0
    uint32_t len = bam_auxB_len(s);
5154
0
    if (idx >= len) {
5155
0
        errno = ERANGE;
5156
0
        return 0.0;
5157
0
    }
5158
0
    if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
5159
0
    else return get_int_aux_val(s[1], s + 6, idx);
5160
0
}
5161
5162
int sam_open_mode(char *mode, const char *fn, const char *format)
5163
0
{
5164
    // TODO Parse "bam5" etc for compression level
5165
0
    if (format == NULL) {
5166
        // Try to pick a format based on the filename extension
5167
0
        char extension[HTS_MAX_EXT_LEN];
5168
0
        if (find_file_extension(fn, extension) < 0) return -1;
5169
0
        return sam_open_mode(mode, fn, extension);
5170
0
    }
5171
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5172
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5173
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5174
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5175
0
    else if (strcasecmp(format, "fastq") == 0 ||
5176
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5177
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5178
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5179
0
    else if (strcasecmp(format, "fasta") == 0 ||
5180
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5181
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5182
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5183
0
    else return -1;
5184
5185
0
    return 0;
5186
0
}
5187
5188
// A version of sam_open_mode that can handle ,key=value options.
5189
// The format string is allocated and returned, to be freed by the caller.
5190
// Prefix should be "r" or "w",
5191
char *sam_open_mode_opts(const char *fn,
5192
                         const char *mode,
5193
                         const char *format)
5194
0
{
5195
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5196
0
                             (mode   ? strlen(mode)   : 1) + 12);
5197
0
    char *opts, *cp;
5198
0
    int format_len;
5199
5200
0
    if (!mode_opts)
5201
0
        return NULL;
5202
5203
0
    strcpy(mode_opts, mode ? mode : "r");
5204
0
    cp = mode_opts + strlen(mode_opts);
5205
5206
0
    if (format == NULL) {
5207
        // Try to pick a format based on the filename extension
5208
0
        char extension[HTS_MAX_EXT_LEN];
5209
0
        if (find_file_extension(fn, extension) < 0) {
5210
0
            free(mode_opts);
5211
0
            return NULL;
5212
0
        }
5213
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5214
0
            return mode_opts;
5215
0
        } else {
5216
0
            free(mode_opts);
5217
0
            return NULL;
5218
0
        }
5219
0
    }
5220
5221
0
    if ((opts = strchr(format, ','))) {
5222
0
        format_len = opts-format;
5223
0
    } else {
5224
0
        opts="";
5225
0
        format_len = strlen(format);
5226
0
    }
5227
5228
0
    if (strncmp(format, "bam", format_len) == 0) {
5229
0
        *cp++ = 'b';
5230
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5231
0
        *cp++ = 'c';
5232
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5233
0
        *cp++ = 'c';
5234
0
        strcpy(cp, ",VERSION=2.1");
5235
0
        cp += 12;
5236
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5237
0
        *cp++ = 'c';
5238
0
        strcpy(cp, ",VERSION=3.0");
5239
0
        cp += 12;
5240
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5241
0
        ; // format mode=""
5242
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5243
0
        *cp++ = 'z';
5244
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5245
0
               strncmp(format, "fq", format_len) == 0) {
5246
0
        *cp++ = 'f';
5247
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5248
0
               strncmp(format, "fq.gz", format_len) == 0) {
5249
0
        *cp++ = 'f';
5250
0
        *cp++ = 'z';
5251
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5252
0
               strncmp(format, "fa", format_len) == 0) {
5253
0
        *cp++ = 'F';
5254
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5255
0
               strncmp(format, "fa", format_len) == 0) {
5256
0
        *cp++ = 'F';
5257
0
        *cp++ = 'z';
5258
0
    } else {
5259
0
        free(mode_opts);
5260
0
        return NULL;
5261
0
    }
5262
5263
0
    strcpy(cp, opts);
5264
5265
0
    return mode_opts;
5266
0
}
5267
5268
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5269
int bam_str2flag(const char *str)
5270
0
{
5271
0
    char *end, *beg = (char*) str;
5272
0
    long int flag = strtol(str, &end, 0);
5273
0
    if ( end!=str ) return flag;    // the conversion was successful
5274
0
    flag = 0;
5275
0
    while ( *str )
5276
0
    {
5277
0
        end = beg;
5278
0
        while ( *end && *end!=',' ) end++;
5279
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5280
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5281
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5282
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5283
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5284
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5285
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5286
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5287
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5288
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5289
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5290
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5291
0
        else return -1;
5292
0
        if ( !*end ) break;
5293
0
        beg = end + 1;
5294
0
    }
5295
0
    return flag;
5296
0
}
5297
5298
char *bam_flag2str(int flag)
5299
0
{
5300
0
    kstring_t str = {0,0,0};
5301
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5302
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5303
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5304
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5305
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5306
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5307
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5308
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5309
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5310
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5311
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5312
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5313
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5314
0
    return str.s;
5315
0
}
5316
5317
5318
/**************************
5319
 *** Pileup and Mpileup ***
5320
 **************************/
5321
5322
#if !defined(BAM_NO_PILEUP)
5323
5324
#include <assert.h>
5325
5326
/*******************
5327
 *** Memory pool ***
5328
 *******************/
5329
5330
typedef struct {
5331
    int k, y;
5332
    hts_pos_t x, end;
5333
} cstate_t;
5334
5335
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5336
5337
typedef struct __linkbuf_t {
5338
    bam1_t b;
5339
    hts_pos_t beg, end;
5340
    cstate_t s;
5341
    struct __linkbuf_t *next;
5342
    bam_pileup_cd cd;
5343
} lbnode_t;
5344
5345
typedef struct {
5346
    int cnt, n, max;
5347
    lbnode_t **buf;
5348
} mempool_t;
5349
5350
static mempool_t *mp_init(void)
5351
0
{
5352
0
    mempool_t *mp;
5353
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5354
0
    return mp;
5355
0
}
5356
static void mp_destroy(mempool_t *mp)
5357
0
{
5358
0
    int k;
5359
0
    for (k = 0; k < mp->n; ++k) {
5360
0
        free(mp->buf[k]->b.data);
5361
0
        free(mp->buf[k]);
5362
0
    }
5363
0
    free(mp->buf);
5364
0
    free(mp);
5365
0
}
5366
static inline lbnode_t *mp_alloc(mempool_t *mp)
5367
0
{
5368
0
    ++mp->cnt;
5369
0
    if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5370
0
    else return mp->buf[--mp->n];
5371
0
}
5372
static inline void mp_free(mempool_t *mp, lbnode_t *p)
5373
0
{
5374
0
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
5375
0
    if (mp->n == mp->max) {
5376
0
        mp->max = mp->max? mp->max<<1 : 256;
5377
0
        mp->buf = hts_realloc_p(mp->buf, sizeof(lbnode_t*), mp->max);
5378
0
    }
5379
0
    mp->buf[mp->n++] = p;
5380
0
}
5381
5382
/**********************
5383
 *** CIGAR resolver ***
5384
 **********************/
5385
5386
/* s->k: the index of the CIGAR operator that has just been processed.
5387
   s->x: the reference coordinate of the start of s->k
5388
   s->y: the query coordinate of the start of s->k
5389
 */
5390
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5391
0
{
5392
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5393
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5394
5395
0
    bam1_t *b = p->b;
5396
0
    bam1_core_t *c = &b->core;
5397
0
    uint32_t *cigar = bam_get_cigar(b);
5398
0
    int k;
5399
    // determine the current CIGAR operation
5400
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5401
0
    if (s->k == -1) { // never processed
5402
0
        p->qpos = 0;
5403
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5404
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5405
0
        } else { // find the first match or deletion
5406
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5407
0
                int op = _cop(cigar[k]);
5408
0
                int l = _cln(cigar[k]);
5409
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5410
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5411
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5412
0
            }
5413
0
            assert(k < c->n_cigar);
5414
0
            s->k = k;
5415
0
        }
5416
0
    } else { // the read has been processed before
5417
0
        int op, l = _cln(cigar[s->k]);
5418
0
        if (pos - s->x >= l) { // jump to the next operation
5419
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5420
0
            op = _cop(cigar[s->k+1]);
5421
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5422
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5423
0
                s->x += l;
5424
0
                ++s->k;
5425
0
            } else { // find the next M/D/N/=/X
5426
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5427
0
                s->x += l;
5428
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5429
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5430
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5431
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5432
0
                }
5433
0
                s->k = k;
5434
0
            }
5435
0
            assert(s->k < c->n_cigar); // otherwise a bug
5436
0
        } // else, do nothing
5437
0
    }
5438
0
    { // collect pileup information
5439
0
        int op, l;
5440
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5441
0
        p->is_del = p->indel = p->is_refskip = 0;
5442
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5443
0
            int op2 = _cop(cigar[s->k+1]);
5444
0
            int l2 = _cln(cigar[s->k+1]);
5445
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5446
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5447
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5448
                // and rely on is_del=1 as we would for 3D.
5449
0
                p->indel = -(int)l2;
5450
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5451
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5452
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5453
0
                    else break;
5454
0
                }
5455
0
            } else if (op2 == BAM_CINS) {
5456
0
                p->indel = l2;
5457
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5458
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5459
0
                    if (op2 == BAM_CINS) p->indel += l2;
5460
0
                    else if (op2 != BAM_CPAD) break;
5461
0
                }
5462
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5463
0
                int l3 = 0;
5464
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5465
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5466
0
                    if (op2 == BAM_CINS) l3 += l2;
5467
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5468
0
                }
5469
0
                if (l3 > 0) p->indel = l3;
5470
0
            }
5471
0
        }
5472
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5473
0
            p->qpos = s->y + (pos - s->x);
5474
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5475
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5476
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5477
0
        } // cannot be other operations; otherwise a bug
5478
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5479
0
    }
5480
0
    p->cigar_ind = s->k;
5481
0
    return 1;
5482
0
}
5483
5484
/*******************************
5485
 *** Expansion of insertions ***
5486
 *******************************/
5487
5488
/*
5489
 * Fills out the kstring with the padded insertion sequence for the current
5490
 * location in 'p'.  If this is not an insertion site, the string is blank.
5491
 *
5492
 * This variant handles base modifications, but only when "m" is non-NULL.
5493
 *
5494
 * Returns the number of inserted base on success, with string length being
5495
 *        accessable via ins->l;
5496
 *        -1 on failure.
5497
 */
5498
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5499
                          hts_base_mod_state *m,
5500
0
                          kstring_t *ins, int *del_len) {
5501
0
    int j, k, indel, nb = 0;
5502
0
    uint32_t *cigar;
5503
5504
0
    if (p->indel <= 0) {
5505
0
        if (ks_resize(ins, 1) < 0)
5506
0
            return -1;
5507
0
        ins->l = 0;
5508
0
        ins->s[0] = '\0';
5509
0
        return 0;
5510
0
    }
5511
5512
0
    if (del_len)
5513
0
        *del_len = 0;
5514
5515
    // Measure indel length including pads
5516
0
    indel = 0;
5517
0
    k = p->cigar_ind+1;
5518
0
    cigar = bam_get_cigar(p->b);
5519
0
    while (k < p->b->core.n_cigar) {
5520
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5521
0
        case BAM_CPAD:
5522
0
        case BAM_CINS:
5523
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5524
0
            break;
5525
0
        default:
5526
0
            k = p->b->core.n_cigar;
5527
0
            break;
5528
0
        }
5529
0
        k++;
5530
0
    }
5531
0
    nb = ins->l = indel;
5532
5533
    // Produce sequence
5534
0
    if (ks_resize(ins, indel+1) < 0)
5535
0
        return -1;
5536
0
    indel = 0;
5537
0
    k = p->cigar_ind+1;
5538
0
    j = 1;
5539
0
    while (k < p->b->core.n_cigar) {
5540
0
        int l, c;
5541
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5542
0
        case BAM_CPAD:
5543
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5544
0
                ins->s[indel++] = '*';
5545
0
            break;
5546
0
        case BAM_CINS:
5547
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5548
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5549
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5550
0
                                            p->qpos + j - p->is_del)]
5551
0
                    : 'N';
5552
0
                ins->s[indel++] = c;
5553
0
                int nm;
5554
0
                hts_base_mod mod[256];
5555
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5556
0
                                                m, mod, 256)) > 0) {
5557
0
                    int o_indel = indel;
5558
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5559
0
                        return -1;
5560
0
                    ins->s[indel++] = '[';
5561
0
                    int j;
5562
0
                    for (j = 0; j < nm; j++) {
5563
0
                        char qual[20];
5564
0
                        if (mod[j].qual >= 0)
5565
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5566
0
                        else
5567
0
                            *qual=0;
5568
0
                        if (mod[j].modified_base < 0)
5569
                            // ChEBI
5570
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5571
0
                                              "%c(%d)%s",
5572
0
                                              "+-"[mod[j].strand],
5573
0
                                              -mod[j].modified_base,
5574
0
                                              qual);
5575
0
                        else
5576
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5577
0
                                              "%c%c%s",
5578
0
                                              "+-"[mod[j].strand],
5579
0
                                              mod[j].modified_base,
5580
0
                                              qual);
5581
0
                    }
5582
0
                    ins->s[indel++] = ']';
5583
0
                    ins->l += indel - o_indel; // grow by amount we used
5584
0
                }
5585
0
            }
5586
0
            break;
5587
0
        case BAM_CDEL:
5588
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5589
0
            if (del_len)
5590
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5591
            // fall through
5592
0
        default:
5593
0
            k = p->b->core.n_cigar;
5594
0
            break;
5595
0
        }
5596
0
        k++;
5597
0
    }
5598
0
    ins->s[indel] = '\0';
5599
0
    ins->l = indel; // string length
5600
5601
0
    return nb;      // base length
5602
0
}
5603
5604
/*
5605
 * Fills out the kstring with the padded insertion sequence for the current
5606
 * location in 'p'.  If this is not an insertion site, the string is blank.
5607
 *
5608
 * This is the original interface with no capability for reporting base
5609
 * modifications.
5610
 *
5611
 * Returns the length of insertion string on success;
5612
 *        -1 on failure.
5613
 */
5614
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5615
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5616
0
}
5617
5618
/***********************
5619
 *** Pileup iterator ***
5620
 ***********************/
5621
5622
// Dictionary of overlapping reads
5623
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5624
typedef khash_t(olap_hash) olap_hash_t;
5625
5626
struct bam_plp_s {
5627
    mempool_t *mp;
5628
    lbnode_t *head, *tail;
5629
    int32_t tid, max_tid;
5630
    hts_pos_t pos, max_pos;
5631
    int is_eof, max_plp, error, maxcnt;
5632
    uint64_t id;
5633
    bam_pileup1_t *plp;
5634
    // for the "auto" interface only
5635
    bam1_t *b;
5636
    bam_plp_auto_f func;
5637
    void *data;
5638
    olap_hash_t *overlaps;
5639
5640
    // For notification of creation and destruction events
5641
    // and associated client-owned pointer.
5642
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5643
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5644
};
5645
5646
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5647
0
{
5648
0
    bam_plp_t iter;
5649
0
    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5650
0
    iter->mp = mp_init();
5651
0
    iter->head = iter->tail = mp_alloc(iter->mp);
5652
0
    iter->max_tid = iter->max_pos = -1;
5653
0
    iter->maxcnt = 8000;
5654
0
    if (func) {
5655
0
        iter->func = func;
5656
0
        iter->data = data;
5657
0
        iter->b = bam_init1();
5658
0
    }
5659
0
    return iter;
5660
0
}
5661
5662
int bam_plp_init_overlaps(bam_plp_t iter)
5663
0
{
5664
0
    iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5665
0
    return iter->overlaps ? 0 : -1;
5666
0
}
5667
5668
void bam_plp_destroy(bam_plp_t iter)
5669
0
{
5670
0
    lbnode_t *p, *pnext;
5671
0
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5672
0
    for (p = iter->head; p != NULL; p = pnext) {
5673
0
        if (iter->plp_destruct && p != iter->tail)
5674
0
            iter->plp_destruct(iter->data, &p->b, &p->cd);
5675
0
        pnext = p->next;
5676
0
        mp_free(iter->mp, p);
5677
0
    }
5678
0
    mp_destroy(iter->mp);
5679
0
    if (iter->b) bam_destroy1(iter->b);
5680
0
    free(iter->plp);
5681
0
    free(iter);
5682
0
}
5683
5684
void bam_plp_constructor(bam_plp_t plp,
5685
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5686
0
    plp->plp_construct = func;
5687
0
}
5688
5689
void bam_plp_destructor(bam_plp_t plp,
5690
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5691
0
    plp->plp_destruct = func;
5692
0
}
5693
5694
//---------------------------------
5695
//---  Tweak overlapping reads
5696
//---------------------------------
5697
5698
/**
5699
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5700
 *  cigar_iref2iseq_next() - get the next CMATCH base
5701
 *  @cigar:       pointer to current cigar block (rw)
5702
 *  @cigar_max:   pointer just beyond the last cigar block
5703
 *  @icig:        position within the current cigar block (rw)
5704
 *  @iseq:        position in the sequence (rw)
5705
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5706
 *
5707
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5708
 *  or -2 on error.
5709
 */
5710
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5711
                                      const uint32_t *cigar_max,
5712
                                      hts_pos_t *icig,
5713
                                      hts_pos_t *iseq,
5714
                                      hts_pos_t *iref)
5715
0
{
5716
0
    hts_pos_t pos = *iref;
5717
0
    if ( pos < 0 ) return -1;
5718
0
    *icig = 0;
5719
0
    *iseq = 0;
5720
0
    *iref = 0;
5721
0
    while ( *cigar<cigar_max )
5722
0
    {
5723
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5724
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5725
5726
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5727
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5728
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5729
0
        {
5730
0
            pos -= ncig;
5731
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5732
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5733
0
            continue;
5734
0
        }
5735
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5736
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5737
0
        {
5738
0
            pos -= ncig;
5739
0
            if ( pos<0 ) pos = 0;
5740
0
            (*cigar)++; *icig = 0; *iref += ncig;
5741
0
            continue;
5742
0
        }
5743
0
        hts_log_error("Unexpected cigar %d", cig);
5744
0
        return -2;
5745
0
    }
5746
0
    *iseq = -1;
5747
0
    return -1;
5748
0
}
5749
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5750
                                       const uint32_t *cigar_max,
5751
                                       hts_pos_t *icig,
5752
                                       hts_pos_t *iseq,
5753
                                       hts_pos_t *iref)
5754
0
{
5755
0
    while ( *cigar < cigar_max )
5756
0
    {
5757
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5758
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5759
5760
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5761
0
        {
5762
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5763
0
            (*iseq)++; (*icig)++; (*iref)++;
5764
0
            return BAM_CMATCH;
5765
0
        }
5766
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5767
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5768
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5769
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5770
0
        hts_log_error("Unexpected cigar %d", cig);
5771
0
        return -2;
5772
0
    }
5773
0
    *iseq = -1;
5774
0
    *iref = -1;
5775
0
    return -1;
5776
0
}
5777
5778
// Given overlapping read 'a' (left) and 'b' (right) on the same
5779
// template, adjust quality values to zero for either a or b.
5780
// Note versions 1.12 and earlier always removed quality from 'b' for
5781
// matching bases.  Now we select a or b semi-randomly based on name hash.
5782
// Returns 0 on success,
5783
//        -1 on failure
5784
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5785
0
{
5786
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5787
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5788
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5789
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5790
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5791
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5792
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5793
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5794
5795
0
    hts_pos_t iref   = b->core.pos;
5796
0
    hts_pos_t a_iref = iref - a->core.pos;
5797
0
    hts_pos_t b_iref = iref - b->core.pos;
5798
5799
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5800
0
                                    &a_icig, &a_iseq, &a_iref);
5801
0
    if ( a_ret<0 )
5802
        // no overlap or error
5803
0
        return a_ret<-1 ? -1:0;
5804
5805
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5806
0
                                    &b_icig, &b_iseq, &b_iref);
5807
0
    if ( b_ret<0 )
5808
        // no overlap or error
5809
0
        return b_ret<-1 ? -1:0;
5810
5811
    // Determine which seq is the one getting modified qualities.
5812
0
    uint8_t amul, bmul;
5813
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5814
0
        amul = 1;
5815
0
        bmul = 0;
5816
0
    } else {
5817
0
        amul = 0;
5818
0
        bmul = 1;
5819
0
    }
5820
5821
    // Loop over the overlapping region nulling qualities in either
5822
    // seq a or b.
5823
0
    int err = 0;
5824
0
    while ( 1 ) {
5825
        // Step to next matching reference position in a and b
5826
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5827
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5828
0
                                         &a_icig, &a_iseq, &a_iref);
5829
0
        if ( a_ret<0 ) { // done
5830
0
            err = a_ret<-1?-1:0;
5831
0
            break;
5832
0
        }
5833
5834
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5835
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5836
0
                                         &b_iseq, &b_iref);
5837
0
        if ( b_ret<0 ) { // done
5838
0
            err = b_ret<-1?-1:0;
5839
0
            break;
5840
0
        }
5841
5842
0
        if ( iref < a_iref + a->core.pos )
5843
0
            iref = a_iref + a->core.pos;
5844
5845
0
        if ( iref < b_iref + b->core.pos )
5846
0
            iref = b_iref + b->core.pos;
5847
5848
0
        iref++;
5849
5850
        // If A or B has a deletion then we catch up the other to this point.
5851
        // We also amend quality values using the same rules for mismatch.
5852
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5853
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5854
0
                && b_cigar > bam_get_cigar(b)
5855
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5856
                // Del in B means it's moved on further than A
5857
0
                do {
5858
0
                    a_qual[a_iseq] = amul
5859
0
                        ? a_qual[a_iseq]*0.8
5860
0
                        : 0;
5861
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5862
0
                                                 &a_icig, &a_iseq, &a_iref);
5863
0
                    if (a_ret < 0)
5864
0
                        return -(a_ret<-1); // 0 or -1
5865
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5866
0
            } else if (a_cigar > bam_get_cigar(a)
5867
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5868
                // Del in A means it's moved on further than B
5869
0
                do {
5870
0
                    b_qual[b_iseq] = bmul
5871
0
                        ? b_qual[b_iseq]*0.8
5872
0
                        : 0;
5873
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5874
0
                                                 &b_icig, &b_iseq, &b_iref);
5875
0
                    if (b_ret < 0)
5876
0
                        return -(b_ret<-1); // 0 or -1
5877
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5878
0
            } else {
5879
                // Anything else, eg ref-skip, we don't support here
5880
0
                continue;
5881
0
            }
5882
0
        }
5883
5884
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5885
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5886
        //         a_cigar-bam_get_cigar(a), a_icig,
5887
        //         b_cigar-bam_get_cigar(b), b_icig,
5888
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5889
        //         a_iseq, b_iseq);
5890
5891
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5892
            // Fell off end of sequence, bad CIGAR?
5893
0
            return -1;
5894
5895
        // We're finally at the same ref base in both a and b.
5896
        // Check if the bases match (confident) or mismatch
5897
        // (not so confident).
5898
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5899
            // We are very confident about this base.  Use sum of quals
5900
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5901
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5902
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5903
0
        } else {
5904
            // Not so confident about anymore given the mismatch.
5905
            // Reduce qual for lowest quality base.
5906
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5907
                // A highest qual base; keep
5908
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5909
0
                b_qual[b_iseq] = 0;
5910
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5911
                // B highest qual base; keep
5912
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5913
0
                a_qual[a_iseq] = 0;
5914
0
            } else {
5915
                // Both equal, so pick randomly
5916
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5917
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5918
0
            }
5919
0
        }
5920
0
    }
5921
5922
0
    return err;
5923
0
}
5924
5925
// Fix overlapping reads. Simple soft-clipping did not give good results.
5926
// Lowering qualities of unwanted bases is more selective and works better.
5927
//
5928
// Returns 0 on success, -1 on failure
5929
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5930
0
{
5931
0
    if ( !iter->overlaps ) return 0;
5932
5933
    // mapped mates and paired reads only
5934
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5935
5936
    // no overlap possible, unless some wild cigar
5937
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5938
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5939
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5940
0
       ) return 0;
5941
5942
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5943
0
    if ( kitr==kh_end(iter->overlaps) )
5944
0
    {
5945
        // Only add reads where the mate is still to arrive
5946
0
        if (node->b.core.mpos >= node->b.core.pos ||
5947
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5948
0
            int ret;
5949
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5950
0
            if (ret < 0) return -1;
5951
0
            kh_value(iter->overlaps, kitr) = node;
5952
0
        }
5953
0
    }
5954
0
    else
5955
0
    {
5956
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5957
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5958
0
        kh_del(olap_hash, iter->overlaps, kitr);
5959
0
        assert(a->end-1 == a->s.end);
5960
0
        return err;
5961
0
    }
5962
0
    return 0;
5963
0
}
5964
5965
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5966
0
{
5967
0
    if ( !iter->overlaps ) return;
5968
5969
0
    khiter_t kitr;
5970
0
    if ( b )
5971
0
    {
5972
0
        if ( b->core.flag&BAM_FUNMAP || !(b->core.flag&BAM_FPROPER_PAIR) ) //no need
5973
0
            return;
5974
5975
0
        kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5976
0
        if ( kitr!=kh_end(iter->overlaps) )
5977
0
            kh_del(olap_hash, iter->overlaps, kitr);
5978
0
    }
5979
0
    else
5980
0
    {
5981
        // remove all
5982
0
        for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
5983
0
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
5984
0
    }
5985
0
}
5986
5987
5988
5989
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5990
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
5991
// buffer yet (the current position is still the maximum position across all buffered reads).
5992
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
5993
0
{
5994
0
    if (iter->error) { *_n_plp = -1; return NULL; }
5995
0
    *_n_plp = 0;
5996
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
5997
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
5998
0
        int n_plp = 0;
5999
        // write iter->plp at iter->pos
6000
0
        lbnode_t **pptr = &iter->head;
6001
0
        while (*pptr != iter->tail) {
6002
0
            if ((*pptr)->next)
6003
0
                hts_prefetch((*pptr)->next);
6004
0
            lbnode_t *p = *pptr;
6005
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6006
0
                overlap_remove(iter, &p->b);
6007
0
                if (iter->plp_destruct)
6008
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6009
0
                *pptr = p->next; mp_free(iter->mp, p);
6010
0
            }
6011
0
            else {
6012
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6013
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6014
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6015
0
                        iter->plp = hts_realloc_p(iter->plp, sizeof(bam_pileup1_t), iter->max_plp);
6016
0
                    }
6017
0
                    iter->plp[n_plp].b = &p->b;
6018
0
                    iter->plp[n_plp].cd = p->cd;
6019
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6020
0
                }
6021
0
                pptr = &(*pptr)->next;
6022
0
            }
6023
0
        }
6024
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6025
        // update iter->tid and iter->pos
6026
0
        if (iter->head != iter->tail) {
6027
0
            if (iter->tid > iter->head->b.core.tid) {
6028
0
                hts_log_error("Unsorted input. Pileup aborts");
6029
0
                iter->error = 1;
6030
0
                *_n_plp = -1;
6031
0
                return NULL;
6032
0
            }
6033
0
        }
6034
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6035
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6036
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6037
0
            iter->pos = iter->head->beg; // jump to the next position
6038
0
        } else ++iter->pos; // scan contiguously
6039
        // return
6040
0
        if (n_plp) return iter->plp;
6041
0
        if (iter->is_eof && iter->head == iter->tail) break;
6042
0
    }
6043
0
    return NULL;
6044
0
}
6045
6046
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6047
0
{
6048
0
    hts_pos_t pos64 = 0;
6049
0
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
6050
0
    if (pos64 < INT_MAX) {
6051
0
        *_pos = pos64;
6052
0
    } else {
6053
0
        hts_log_error("Position %"PRId64" too large", pos64);
6054
0
        *_pos = INT_MAX;
6055
0
        iter->error = 1;
6056
0
        *_n_plp = -1;
6057
0
        return NULL;
6058
0
    }
6059
0
    return p;
6060
0
}
6061
6062
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
6063
0
{
6064
0
    if (iter->error) return -1;
6065
0
    if (b) {
6066
0
        if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
6067
        // Skip only unmapped reads here, any additional filtering must be done in iter->func
6068
0
        if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
6069
0
        if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
6070
0
        {
6071
0
            overlap_remove(iter, b);
6072
0
            return 0;
6073
0
        }
6074
0
        if (bam_copy1(&iter->tail->b, b) == NULL)
6075
0
            return -1;
6076
0
        iter->tail->b.id = iter->id++;
6077
0
        iter->tail->beg = b->core.pos;
6078
        // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
6079
0
        iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
6080
0
        iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
6081
0
        if (b->core.tid < iter->max_tid) {
6082
0
            hts_log_error("The input is not sorted (chromosomes out of order)");
6083
0
            iter->error = 1;
6084
0
            return -1;
6085
0
        }
6086
0
        if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
6087
0
            hts_log_error("The input is not sorted (reads out of order)");
6088
0
            iter->error = 1;
6089
0
            return -1;
6090
0
        }
6091
0
        iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
6092
0
        if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
6093
0
            lbnode_t *next = mp_alloc(iter->mp);
6094
0
            if (!next) {
6095
0
                iter->error = 1;
6096
0
                return -1;
6097
0
            }
6098
0
            if (iter->plp_construct) {
6099
0
                if (iter->plp_construct(iter->data, &iter->tail->b,
6100
0
                                        &iter->tail->cd) < 0) {
6101
0
                    mp_free(iter->mp, next);
6102
0
                    iter->error = 1;
6103
0
                    return -1;
6104
0
                }
6105
0
            }
6106
0
            if (overlap_push(iter, iter->tail) < 0) {
6107
0
                mp_free(iter->mp, next);
6108
0
                iter->error = 1;
6109
0
                return -1;
6110
0
            }
6111
0
            iter->tail->next = next;
6112
0
            iter->tail = iter->tail->next;
6113
0
        }
6114
0
    } else iter->is_eof = 1;
6115
0
    return 0;
6116
0
}
6117
6118
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6119
0
{
6120
0
    const bam_pileup1_t *plp;
6121
0
    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
6122
0
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6123
0
    else { // no pileup line can be obtained; read alignments
6124
0
        *_n_plp = 0;
6125
0
        if (iter->is_eof) return 0;
6126
0
        int ret;
6127
0
        while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
6128
0
            if (bam_plp_push(iter, iter->b) < 0) {
6129
0
                *_n_plp = -1;
6130
0
                return 0;
6131
0
            }
6132
0
            if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6133
            // otherwise no pileup line can be returned; read the next alignment.
6134
0
        }
6135
0
        if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
6136
0
        if (bam_plp_push(iter, 0) < 0) {
6137
0
            *_n_plp = -1;
6138
0
            return 0;
6139
0
        }
6140
0
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6141
0
        return 0;
6142
0
    }
6143
0
}
6144
6145
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6146
0
{
6147
0
    hts_pos_t pos64 = 0;
6148
0
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
6149
0
    if (pos64 < INT_MAX) {
6150
0
        *_pos = pos64;
6151
0
    } else {
6152
0
        hts_log_error("Position %"PRId64" too large", pos64);
6153
0
        *_pos = INT_MAX;
6154
0
        iter->error = 1;
6155
0
        *_n_plp = -1;
6156
0
        return NULL;
6157
0
    }
6158
0
    return p;
6159
0
}
6160
6161
void bam_plp_reset(bam_plp_t iter)
6162
0
{
6163
0
    overlap_remove(iter, NULL);
6164
0
    iter->max_tid = iter->max_pos = -1;
6165
0
    iter->tid = iter->pos = 0;
6166
0
    iter->is_eof = 0;
6167
0
    while (iter->head != iter->tail) {
6168
0
        lbnode_t *p = iter->head;
6169
0
        iter->head = p->next;
6170
0
        mp_free(iter->mp, p);
6171
0
    }
6172
0
}
6173
6174
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
6175
0
{
6176
0
    iter->maxcnt = maxcnt;
6177
0
}
6178
6179
/************************
6180
 *** Mpileup iterator ***
6181
 ************************/
6182
6183
struct bam_mplp_s {
6184
    int n;
6185
    int32_t min_tid, *tid;
6186
    hts_pos_t min_pos, *pos;
6187
    bam_plp_t *iter;
6188
    int *n_plp;
6189
    const bam_pileup1_t **plp;
6190
};
6191
6192
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
6193
0
{
6194
0
    int i;
6195
0
    bam_mplp_t iter;
6196
0
    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
6197
0
    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
6198
0
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
6199
0
    iter->n_plp = (int*)calloc(n, sizeof(int));
6200
0
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
6201
0
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
6202
0
    iter->n = n;
6203
0
    iter->min_pos = HTS_POS_MAX;
6204
0
    iter->min_tid = (uint32_t)-1;
6205
0
    for (i = 0; i < n; ++i) {
6206
0
        iter->iter[i] = bam_plp_init(func, data[i]);
6207
0
        iter->pos[i] = iter->min_pos;
6208
0
        iter->tid[i] = iter->min_tid;
6209
0
    }
6210
0
    return iter;
6211
0
}
6212
6213
int bam_mplp_init_overlaps(bam_mplp_t iter)
6214
0
{
6215
0
    int i, r = 0;
6216
0
    for (i = 0; i < iter->n; ++i)
6217
0
        r |= bam_plp_init_overlaps(iter->iter[i]);
6218
0
    return r == 0 ? 0 : -1;
6219
0
}
6220
6221
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
6222
0
{
6223
0
    int i;
6224
0
    for (i = 0; i < iter->n; ++i)
6225
0
        iter->iter[i]->maxcnt = maxcnt;
6226
0
}
6227
6228
void bam_mplp_destroy(bam_mplp_t iter)
6229
0
{
6230
0
    int i;
6231
0
    for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
6232
0
    free(iter->iter); free(iter->pos); free(iter->tid);
6233
0
    free(iter->n_plp); free(iter->plp);
6234
0
    free(iter);
6235
0
}
6236
6237
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
6238
0
{
6239
0
    int i, ret = 0;
6240
0
    hts_pos_t new_min_pos = HTS_POS_MAX;
6241
0
    uint32_t new_min_tid = (uint32_t)-1;
6242
0
    for (i = 0; i < iter->n; ++i) {
6243
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6244
0
            int tid;
6245
0
            hts_pos_t pos;
6246
0
            iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
6247
0
            if ( iter->iter[i]->error ) return -1;
6248
0
            if (iter->plp[i]) {
6249
0
                iter->tid[i] = tid;
6250
0
                iter->pos[i] = pos;
6251
0
            } else {
6252
0
                iter->tid[i] = 0;
6253
0
                iter->pos[i] = 0;
6254
0
            }
6255
0
        }
6256
0
        if (iter->plp[i]) {
6257
0
            if (iter->tid[i] < new_min_tid) {
6258
0
                new_min_tid = iter->tid[i];
6259
0
                new_min_pos = iter->pos[i];
6260
0
            } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
6261
0
                new_min_pos = iter->pos[i];
6262
0
            }
6263
0
        }
6264
0
    }
6265
0
    iter->min_pos = new_min_pos;
6266
0
    iter->min_tid = new_min_tid;
6267
0
    if (new_min_pos == HTS_POS_MAX) return 0;
6268
0
    *_tid = new_min_tid; *_pos = new_min_pos;
6269
0
    for (i = 0; i < iter->n; ++i) {
6270
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6271
0
            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
6272
0
            ++ret;
6273
0
        } else n_plp[i] = 0, plp[i] = 0;
6274
0
    }
6275
0
    return ret;
6276
0
}
6277
6278
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
6279
0
{
6280
0
    hts_pos_t pos64 = 0;
6281
0
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
6282
0
    if (ret >= 0) {
6283
0
        if (pos64 < INT_MAX) {
6284
0
            *_pos = pos64;
6285
0
        } else {
6286
0
            hts_log_error("Position %"PRId64" too large", pos64);
6287
0
            *_pos = INT_MAX;
6288
0
            return -1;
6289
0
        }
6290
0
    }
6291
0
    return ret;
6292
0
}
6293
6294
void bam_mplp_reset(bam_mplp_t iter)
6295
0
{
6296
0
    int i;
6297
0
    iter->min_pos = HTS_POS_MAX;
6298
0
    iter->min_tid = (uint32_t)-1;
6299
0
    for (i = 0; i < iter->n; ++i) {
6300
0
        bam_plp_reset(iter->iter[i]);
6301
0
        iter->pos[i] = HTS_POS_MAX;
6302
0
        iter->tid[i] = (uint32_t)-1;
6303
0
        iter->n_plp[i] = 0;
6304
0
        iter->plp[i] = NULL;
6305
0
    }
6306
0
}
6307
6308
void bam_mplp_constructor(bam_mplp_t iter,
6309
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6310
0
    int i;
6311
0
    for (i = 0; i < iter->n; ++i)
6312
0
        bam_plp_constructor(iter->iter[i], func);
6313
0
}
6314
6315
void bam_mplp_destructor(bam_mplp_t iter,
6316
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6317
0
    int i;
6318
0
    for (i = 0; i < iter->n; ++i)
6319
0
        bam_plp_destructor(iter->iter[i], func);
6320
0
}
6321
6322
#endif // ~!defined(BAM_NO_PILEUP)