Coverage Report

Created: 2026-05-16 07:02

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/htslib/sam.c
Line
Count
Source
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
#include <regex.h>
40
41
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
42
#include "fuzz_settings.h"
43
#endif
44
45
// Suppress deprecation message for cigar_tab, which we initialise
46
#include "htslib/hts_defs.h"
47
#undef HTS_DEPRECATED
48
#define HTS_DEPRECATED(message)
49
50
#include "htslib/sam.h"
51
#include "htslib/bgzf.h"
52
#include "cram/cram.h"
53
#include "hts_internal.h"
54
#include "sam_internal.h"
55
#include "htslib/hfile.h"
56
#include "htslib/hts_endian.h"
57
#include "htslib/hts_expr.h"
58
#include "header.h"
59
60
#include "htslib/khash.h"
61
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
62
KHASH_SET_INIT_INT(tag)
63
64
#ifndef EFTYPE
65
0
#define EFTYPE ENOEXEC
66
#endif
67
#ifndef EOVERFLOW
68
#define EOVERFLOW ERANGE
69
#endif
70
71
/**********************
72
 *** BAM header I/O ***
73
 **********************/
74
75
HTSLIB_EXPORT
76
const int8_t bam_cigar_table[256] = {
77
    // 0 .. 47
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
81
82
    // 48 .. 63  (including =)
83
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
84
85
    // 64 .. 79  (including MIDNHB)
86
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
87
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
88
89
    // 80 .. 95  (including SPX)
90
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
91
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
92
93
    // 96 .. 127
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96
97
    // 128 .. 255
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
105
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
106
};
107
108
sam_hdr_t *sam_hdr_init(void)
109
57.1k
{
110
57.1k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
111
57.1k
    if (bh == NULL) return NULL;
112
113
57.1k
    bh->cigar_tab = bam_cigar_table;
114
57.1k
    return bh;
115
57.1k
}
116
117
void sam_hdr_destroy(sam_hdr_t *bh)
118
135k
{
119
135k
    int32_t i;
120
121
135k
    if (bh == NULL) return;
122
123
69.1k
    if (bh->ref_count > 0) {
124
12.0k
        --bh->ref_count;
125
12.0k
        return;
126
12.0k
    }
127
128
57.1k
    if (bh->target_name) {
129
7.40k
        for (i = 0; i < bh->n_targets; ++i)
130
4.75k
            free(bh->target_name[i]);
131
2.64k
        free(bh->target_name);
132
2.64k
        free(bh->target_len);
133
2.64k
    }
134
57.1k
    free(bh->text);
135
57.1k
    if (bh->hrecs)
136
39.5k
        sam_hrecs_free(bh->hrecs);
137
57.1k
    if (bh->sdict)
138
260
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
139
57.1k
    free(bh);
140
57.1k
}
141
142
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
143
// references before sam_hdr_t::hrecs is populated
144
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
145
0
{
146
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
147
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
148
0
    int i;
149
0
    if (!dest_long_refs) return -1;
150
151
0
    for (i = 0; i < h->n_targets; i++) {
152
0
        int ret;
153
0
        khiter_t ksrc, kdest;
154
0
        if (h->target_len[i] < UINT32_MAX) continue;
155
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
156
0
        if (ksrc == kh_end(src_long_refs)) continue;
157
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
158
0
        if (ret < 0) {
159
0
            kh_destroy(s2i, dest_long_refs);
160
0
            return -1;
161
0
        }
162
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
163
0
    }
164
165
0
    h->sdict = dest_long_refs;
166
0
    return 0;
167
0
}
168
169
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
170
28.2k
{
171
28.2k
    if (h0 == NULL) return NULL;
172
28.2k
    sam_hdr_t *h;
173
28.2k
    if ((h = sam_hdr_init()) == NULL) return NULL;
174
    // copy the simple data
175
28.2k
    h->n_targets = 0;
176
28.2k
    h->ignore_sam_err = h0->ignore_sam_err;
177
28.2k
    h->l_text = 0;
178
179
    // Then the pointery stuff
180
181
28.2k
    if (!h0->hrecs) {
182
177
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
183
177
        if (!h->target_len) goto fail;
184
177
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
185
177
        if (!h->target_name) goto fail;
186
187
177
        int i;
188
372
        for (i = 0; i < h0->n_targets; ++i) {
189
195
            h->target_len[i] = h0->target_len[i];
190
195
            h->target_name[i] = strdup(h0->target_name[i]);
191
195
            if (!h->target_name[i]) break;
192
195
        }
193
177
        h->n_targets = i;
194
177
        if (i < h0->n_targets) goto fail;
195
196
177
        if (h0->sdict) {
197
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
198
0
        }
199
177
    }
200
201
28.2k
    if (h0->hrecs) {
202
28.0k
        kstring_t tmp = { 0, 0, NULL };
203
28.0k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
204
0
            free(ks_release(&tmp));
205
0
            goto fail;
206
0
        }
207
208
28.0k
        h->l_text = tmp.l;
209
28.0k
        h->text   = ks_release(&tmp);
210
211
28.0k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
212
0
            goto fail;
213
28.0k
    } else {
214
177
        h->l_text = h0->text ? h0->l_text : 0;
215
177
        h->text = malloc(h->l_text + 1);
216
177
        if (!h->text) goto fail;
217
177
        if (h0->text)
218
177
            memcpy(h->text, h0->text, h->l_text);
219
177
        h->text[h->l_text] = '\0';
220
177
    }
221
222
28.2k
    return h;
223
224
0
 fail:
225
0
    sam_hdr_destroy(h);
226
0
    return NULL;
227
28.2k
}
228
229
sam_hdr_t *bam_hdr_read(BGZF *fp)
230
2.13k
{
231
2.13k
    sam_hdr_t *h;
232
2.13k
    uint8_t buf[4];
233
2.13k
    int magic_len, has_EOF;
234
2.13k
    int32_t i, name_len, num_names = 0;
235
2.13k
    size_t bufsize;
236
2.13k
    ssize_t bytes;
237
    // check EOF
238
2.13k
    has_EOF = bgzf_check_EOF(fp);
239
2.13k
    if (has_EOF < 0) {
240
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
241
2.13k
    } else if (has_EOF == 0) {
242
2.13k
        hts_log_warning("EOF marker is absent. The input is probably truncated");
243
2.13k
    }
244
    // read "BAM1"
245
2.13k
    magic_len = bgzf_read(fp, buf, 4);
246
2.13k
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
247
3
        hts_log_error("Invalid BAM binary header");
248
3
        return 0;
249
3
    }
250
2.13k
    h = sam_hdr_init();
251
2.13k
    if (!h) goto nomem;
252
253
    // read plain text and the number of reference sequences
254
2.13k
    bytes = bgzf_read(fp, buf, 4);
255
2.13k
    if (bytes != 4) goto read_err;
256
2.12k
    h->l_text = le_to_u32(buf);
257
258
2.12k
    bufsize = h->l_text + 1;
259
2.12k
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
260
2.12k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
261
2.12k
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
262
2.12k
#endif
263
2.12k
    h->text = (char*)malloc(bufsize);
264
2.12k
    if (!h->text) goto nomem;
265
2.12k
    h->text[h->l_text] = 0; // make sure it is NULL terminated
266
2.12k
    bytes = bgzf_read(fp, h->text, h->l_text);
267
2.12k
    if (bytes != h->l_text) goto read_err;
268
269
1.95k
    bytes = bgzf_read(fp, &h->n_targets, 4);
270
1.95k
    if (bytes != 4) goto read_err;
271
1.94k
    if (fp->is_be) ed_swap_4p(&h->n_targets);
272
273
1.94k
    if (h->n_targets < 0) goto invalid;
274
275
    // read reference sequence names and lengths
276
1.89k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
277
1.89k
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
278
12
        goto nomem;
279
1.88k
#endif
280
1.88k
    if (h->n_targets > 0) {
281
840
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
282
840
        if (!h->target_name) goto nomem;
283
840
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
284
840
        if (!h->target_len) goto nomem;
285
840
    }
286
1.04k
    else {
287
1.04k
        h->target_name = NULL;
288
1.04k
        h->target_len = NULL;
289
1.04k
    }
290
291
3.26k
    for (i = 0; i != h->n_targets; ++i) {
292
1.56k
        bytes = bgzf_read(fp, &name_len, 4);
293
1.56k
        if (bytes != 4) goto read_err;
294
1.50k
        if (fp->is_be) ed_swap_4p(&name_len);
295
1.50k
        if (name_len <= 0) goto invalid;
296
297
1.46k
        h->target_name[i] = (char*)malloc(name_len);
298
1.46k
        if (!h->target_name[i]) goto nomem;
299
1.46k
        num_names++;
300
301
1.46k
        bytes = bgzf_read(fp, h->target_name[i], name_len);
302
1.46k
        if (bytes != name_len) goto read_err;
303
304
1.39k
        if (h->target_name[i][name_len - 1] != '\0') {
305
            /* Fix missing NUL-termination.  Is this being too nice?
306
               We could alternatively bail out with an error. */
307
996
            char *new_name;
308
996
            if (name_len == INT32_MAX) goto invalid;
309
996
            new_name = realloc(h->target_name[i], name_len + 1);
310
996
            if (new_name == NULL) goto nomem;
311
996
            h->target_name[i] = new_name;
312
996
            h->target_name[i][name_len] = '\0';
313
996
        }
314
315
1.39k
        bytes = bgzf_read(fp, &h->target_len[i], 4);
316
1.39k
        if (bytes != 4) goto read_err;
317
1.37k
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
318
1.37k
    }
319
1.69k
    return h;
320
321
18
 nomem:
322
18
    hts_log_error("Out of memory");
323
18
    goto clean;
324
325
327
 read_err:
326
327
    if (bytes < 0) {
327
21
        hts_log_error("Error reading BGZF stream");
328
306
    } else {
329
306
        hts_log_error("Truncated BAM header");
330
306
    }
331
327
    goto clean;
332
333
93
 invalid:
334
93
    hts_log_error("Invalid BAM binary header");
335
336
438
 clean:
337
438
    if (h != NULL) {
338
438
        h->n_targets = num_names; // ensure we free only allocated target_names
339
438
        sam_hdr_destroy(h);
340
438
    }
341
438
    return NULL;
342
93
}
343
344
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
345
5.63k
{
346
5.63k
    int32_t i, name_len, x;
347
5.63k
    kstring_t hdr_ks = { 0, 0, NULL };
348
5.63k
    char *text;
349
5.63k
    uint32_t l_text;
350
351
5.63k
    if (!h) return -1;
352
353
5.63k
    if (h->hrecs) {
354
5.57k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
355
5.57k
        if (hdr_ks.l > UINT32_MAX) {
356
0
            hts_log_error("Header too long for BAM format");
357
0
            free(hdr_ks.s);
358
0
            return -1;
359
5.57k
        } else if (hdr_ks.l > INT32_MAX) {
360
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
361
0
            hts_log_warning("Output file may not be portable");
362
0
        }
363
5.57k
        text = hdr_ks.s;
364
5.57k
        l_text = hdr_ks.l;
365
5.57k
    } else {
366
59
        if (h->l_text > UINT32_MAX) {
367
0
            hts_log_error("Header too long for BAM format");
368
0
            return -1;
369
59
        } else if (h->l_text > INT32_MAX) {
370
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
371
0
            hts_log_warning("Output file may not be portable");
372
0
        }
373
59
        text = h->text;
374
59
        l_text = h->l_text;
375
59
    }
376
    // write "BAM1"
377
5.63k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
378
    // write plain text and the number of reference sequences
379
5.63k
    if (fp->is_be) {
380
0
        x = ed_swap_4(l_text);
381
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
382
0
        if (l_text) {
383
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
384
0
        }
385
0
        x = ed_swap_4(h->n_targets);
386
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
387
5.63k
    } else {
388
5.63k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
389
5.63k
        if (l_text) {
390
1.76k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
391
1.76k
        }
392
5.63k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
393
5.63k
    }
394
5.63k
    free(hdr_ks.s);
395
    // write sequence names and lengths
396
6.19k
    for (i = 0; i != h->n_targets; ++i) {
397
565
        char *p = h->target_name[i];
398
565
        name_len = strlen(p) + 1;
399
565
        if (fp->is_be) {
400
0
            x = ed_swap_4(name_len);
401
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
402
565
        } else {
403
565
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
404
565
        }
405
565
        if (bgzf_write(fp, p, name_len) < 0) return -1;
406
565
        if (fp->is_be) {
407
0
            x = ed_swap_4(h->target_len[i]);
408
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
409
565
        } else {
410
565
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
411
565
        }
412
565
    }
413
5.63k
    if (bgzf_flush(fp) < 0) return -1;
414
5.63k
    return 0;
415
5.63k
}
416
417
// Wrap around bam_name2id() to get the right signature for hts_name2id_f
418
0
static int bam_name2id_wrapper(void *vhdr, const char *ref) {
419
0
    return bam_name2id((sam_hdr_t *) vhdr, ref);
420
0
}
421
422
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
423
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
424
0
    return hts_parse_region(s, tid, beg, end, bam_name2id_wrapper, h, flags);
425
0
}
426
427
/*************************
428
 *** BAM alignment I/O ***
429
 *************************/
430
431
bam1_t *bam_init1(void)
432
1.10M
{
433
1.10M
    return (bam1_t*)calloc(1, sizeof(bam1_t));
434
1.10M
}
435
436
int sam_realloc_bam_data(bam1_t *b, size_t desired)
437
1.12M
{
438
1.12M
    uint32_t new_m_data;
439
1.12M
    uint8_t *new_data;
440
1.12M
    new_m_data = desired;
441
1.12M
    kroundup32(new_m_data); // next power of 2
442
1.12M
    new_m_data += 32; // reduces malloc arena migrations?
443
1.12M
    if (new_m_data < desired) {
444
0
        errno = ENOMEM; // Not strictly true but we can't store the size
445
0
        return -1;
446
0
    }
447
1.12M
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
448
1.12M
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
449
39
        errno = ENOMEM;
450
39
        return -1;
451
39
    }
452
1.12M
#endif
453
1.12M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
454
1.12M
        new_data = realloc(b->data, new_m_data);
455
1.12M
    } else {
456
0
        if ((new_data = malloc(new_m_data)) != NULL) {
457
0
            if (b->l_data > 0)
458
0
                memcpy(new_data, b->data,
459
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
460
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
461
0
        }
462
0
    }
463
1.12M
    if (!new_data) return -1;
464
1.12M
    b->data = new_data;
465
1.12M
    b->m_data = new_m_data;
466
1.12M
    return 0;
467
1.12M
}
468
469
void bam_destroy1(bam1_t *b)
470
26.2M
{
471
26.2M
    if (b == 0) return;
472
1.10M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
473
1.10M
        free(b->data);
474
1.10M
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
475
            // In case of reuse
476
0
            b->data = NULL;
477
0
            b->m_data = 0;
478
0
            b->l_data = 0;
479
0
        }
480
1.10M
    }
481
482
1.10M
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
483
1.10M
        free(b);
484
1.10M
}
485
486
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
487
5.50M
{
488
5.50M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
489
5.50M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
490
5.50M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
491
5.50M
    bdst->l_data = bsrc->l_data;
492
5.50M
    bdst->id = bsrc->id;
493
5.50M
    return bdst;
494
5.50M
}
495
496
bam1_t *bam_dup1(const bam1_t *bsrc)
497
1.08M
{
498
1.08M
    if (bsrc == NULL) return NULL;
499
1.08M
    bam1_t *bdst = bam_init1();
500
1.08M
    if (bdst == NULL) return NULL;
501
1.08M
    if (bam_copy1(bdst, bsrc) == NULL) {
502
0
        bam_destroy1(bdst);
503
0
        return NULL;
504
0
    }
505
1.08M
    return bdst;
506
1.08M
}
507
508
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
509
                             hts_pos_t *rlen, hts_pos_t *qlen)
510
1.51k
{
511
1.51k
    int k;
512
1.51k
    *rlen = *qlen = 0;
513
45.4k
    for (k = 0; k < n_cigar; ++k) {
514
43.8k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
515
43.8k
        int len = bam_cigar_oplen(cigar[k]);
516
43.8k
        if (type & 1) *qlen += len;
517
43.8k
        if (type & 2) *rlen += len;
518
43.8k
    }
519
1.51k
}
520
521
static int subtract_check_underflow(size_t length, size_t *limit)
522
82.5M
{
523
82.5M
    if (length <= *limit) {
524
82.5M
        *limit -= length;
525
82.5M
        return 0;
526
82.5M
    }
527
528
0
    return -1;
529
82.5M
}
530
531
int bam_set1(bam1_t *bam,
532
             size_t l_qname, const char *qname,
533
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
534
             size_t n_cigar, const uint32_t *cigar,
535
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
536
             size_t l_seq, const char *seq, const char *qual,
537
             size_t l_aux)
538
16.5M
{
539
    // use a default qname "*" if none is provided
540
16.5M
    if (l_qname == 0) {
541
14.9M
        l_qname = 1;
542
14.9M
        qname = "*";
543
14.9M
    }
544
545
    // note: the qname is stored nul terminated and padded as described in the
546
    // documentation for the bam1_t struct.
547
16.5M
    size_t qname_nuls = 4 - l_qname % 4;
548
549
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
550
    // can't use bam_endpos() directly as some fields not yet set up.
551
16.5M
    hts_pos_t rlen = 0, qlen = 0;
552
16.5M
    if (!(flag & BAM_FUNMAP)) {
553
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
554
0
    }
555
16.5M
    if (rlen == 0) {
556
16.5M
        rlen = 1;
557
16.5M
    }
558
559
    // validate parameters
560
16.5M
    if (l_qname > 254) {
561
114
        hts_log_error("Query name too long");
562
114
        errno = EINVAL;
563
114
        return -1;
564
114
    }
565
16.5M
    if (HTS_POS_MAX - rlen <= pos) {
566
0
        hts_log_error("Read ends beyond highest supported position");
567
0
        errno = EINVAL;
568
0
        return -1;
569
0
    }
570
16.5M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
571
0
        hts_log_error("Mapped query must have a CIGAR");
572
0
        errno = EINVAL;
573
0
        return -1;
574
0
    }
575
16.5M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
576
0
        hts_log_error("CIGAR and query sequence are of different length");
577
0
        errno = EINVAL;
578
0
        return -1;
579
0
    }
580
581
16.5M
    size_t limit = INT32_MAX;
582
16.5M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
583
16.5M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
584
16.5M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
585
16.5M
    u    += subtract_check_underflow(l_seq, &limit);
586
16.5M
    u    += subtract_check_underflow(l_aux, &limit);
587
16.5M
    if (u != 0) {
588
0
        hts_log_error("Size overflow");
589
0
        errno = EINVAL;
590
0
        return -1;
591
0
    }
592
593
    // re-allocate the data buffer as needed.
594
16.5M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
595
16.5M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
596
0
        return -1;
597
0
    }
598
599
16.5M
    bam->l_data = (int)data_len;
600
16.5M
    bam->core.pos = pos;
601
16.5M
    bam->core.tid = tid;
602
16.5M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
603
16.5M
    bam->core.qual = mapq;
604
16.5M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
605
16.5M
    bam->core.flag = flag;
606
16.5M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
607
16.5M
    bam->core.n_cigar = (uint32_t)n_cigar;
608
16.5M
    bam->core.l_qseq = (int32_t)l_seq;
609
16.5M
    bam->core.mtid = mtid;
610
16.5M
    bam->core.mpos = mpos;
611
16.5M
    bam->core.isize = isize;
612
613
16.5M
    uint8_t *cp = bam->data;
614
16.5M
    strncpy((char *)cp, qname, l_qname);
615
16.5M
    int i;
616
65.5M
    for (i = 0; i < qname_nuls; i++) {
617
49.0M
        cp[l_qname + i] = '\0';
618
49.0M
    }
619
16.5M
    cp += l_qname + qname_nuls;
620
621
16.5M
    if (n_cigar > 0) {
622
0
        memcpy(cp, cigar, n_cigar * 4);
623
0
    }
624
16.5M
    cp += n_cigar * 4;
625
626
641M
#define NN 16
627
16.5M
    const uint8_t *useq = (uint8_t *)seq;
628
68.6M
    for (i = 0; i + NN < l_seq; i += NN) {
629
52.1M
        int j;
630
52.1M
        const uint8_t *u2 = useq+i;
631
469M
        for (j = 0; j < NN/2; j++)
632
416M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
633
52.1M
        cp += NN/2;
634
52.1M
    }
635
17.9M
    for (; i + 1 < l_seq; i += 2) {
636
1.43M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
637
1.43M
    }
638
639
16.7M
    for (; i < l_seq; i++) {
640
235k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
641
235k
    }
642
643
16.5M
    if (qual) {
644
513
        memcpy(cp, qual, l_seq);
645
513
    }
646
16.5M
    else {
647
16.5M
        memset(cp, '\xff', l_seq);
648
16.5M
    }
649
650
16.5M
    return (int)data_len;
651
16.5M
}
652
653
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
654
5.35M
{
655
5.35M
    int k;
656
5.35M
    hts_pos_t l;
657
5.36M
    for (k = l = 0; k < n_cigar; ++k)
658
9.29k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
659
4.66k
            l += bam_cigar_oplen(cigar[k]);
660
5.35M
    return l;
661
5.35M
}
662
663
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
664
756
{
665
756
    int k;
666
756
    hts_pos_t l;
667
4.34k
    for (k = l = 0; k < n_cigar; ++k)
668
3.59k
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
669
2.77k
            l += bam_cigar_oplen(cigar[k]);
670
756
    return l;
671
756
}
672
673
hts_pos_t bam_endpos(const bam1_t *b)
674
6
{
675
6
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
676
6
    if (rlen == 0) rlen = 1;
677
6
    return b->core.pos + rlen;
678
6
}
679
680
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
681
8.04k
{
682
8.04k
    bam1_core_t *c = &b->core;
683
684
    // Bail out as fast as possible for the easy case
685
8.04k
    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
686
8.04k
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
687
6.48k
        return 0;
688
689
    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
690
    // but this is much less likely so do as a secondary check.
691
1.55k
    if (c->tid < 0 || c->pos < 0)
692
1.43k
        return 0;
693
694
    // Do we have a CG tag?
695
122
    uint8_t *CG = bam_aux_get(b, "CG");
696
122
    int saved_errno = errno;
697
122
    if (!CG) {
698
103
        if (errno != ENOENT) return -1;  // Bad aux data
699
103
        errno = saved_errno; // restore errno on expected no-CG-tag case
700
78
        return 0;
701
103
    }
702
703
    // Now we start with the serious work migrating CG to CIGAR
704
19
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
705
19
        *cigar0, CG_len, fake_bytes;
706
19
    cigar0 = bam_get_cigar(b);
707
19
    fake_bytes = c->n_cigar * 4;
708
19
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
709
13
        return 0; // not of type B,I
710
6
    CG_len = le_to_u32(CG + 2);
711
    // don't move if the real CIGAR length is shorter than the fake cigar length
712
6
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
713
714
    // move from the CG tag to the right position
715
6
    cigar_st = (uint8_t*)cigar0 - b->data;
716
6
    c->n_cigar = CG_len;
717
6
    n_cigar4 = c->n_cigar * 4;
718
6
    CG_st = CG - b->data - 2;
719
6
    CG_en = CG_st + 8 + n_cigar4;
720
6
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
721
    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
722
6
    b->l_data = b->l_data - fake_bytes + n_cigar4;
723
    // insert c->n_cigar-fake_bytes empty space to make room
724
6
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
725
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
726
6
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
727
6
    if (ori_len > CG_en) // move data after the CG tag
728
0
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
729
6
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
730
6
    if (recal_bin)
731
6
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
732
6
    if (give_warning)
733
6
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
734
6
    return 1;
735
6
}
736
737
static inline int aux_type2size(uint8_t type)
738
655k
{
739
655k
    switch (type) {
740
427k
    case 'A': case 'c': case 'C':
741
427k
        return 1;
742
90.0k
    case 's': case 'S':
743
90.0k
        return 2;
744
112k
    case 'i': case 'I': case 'f':
745
112k
        return 4;
746
13.6k
    case 'd':
747
13.6k
        return 8;
748
11.7k
    case 'Z': case 'H': case 'B':
749
11.7k
        return type;
750
476
    default:
751
476
        return 0;
752
655k
    }
753
655k
}
754
755
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
756
0
{
757
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
758
0
    uint32_t i;
759
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
760
0
}
761
762
// Fix bad records where qname is not terminated correctly.
763
1.30k
static int fixup_missing_qname_nul(bam1_t *b) {
764
1.30k
    bam1_core_t *c = &b->core;
765
766
    // Note this is called before c->l_extranul is added to c->l_qname
767
1.30k
    if (c->l_extranul > 0) {
768
864
        b->data[c->l_qname++] = '\0';
769
864
        c->l_extranul--;
770
864
    } else {
771
438
        if (b->l_data > INT_MAX - 4) return -1;
772
438
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
773
438
        b->l_data += 4;
774
438
        b->data[c->l_qname++] = '\0';
775
438
        c->l_extranul = 3;
776
438
    }
777
1.30k
    return 0;
778
1.30k
}
779
780
/*
781
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
782
 * in multi-threaded handling.  This may be worth considering for htslib2.
783
 */
784
int bam_read1(BGZF *fp, bam1_t *b)
785
3.05k
{
786
3.05k
    bam1_core_t *c = &b->core;
787
3.05k
    int32_t block_len, ret, i;
788
3.05k
    uint32_t new_l_data;
789
3.05k
    uint8_t tmp[32], *x;
790
791
3.05k
    b->l_data = 0;
792
793
3.05k
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
794
202
        if (ret == 0) return -1; // normal end-of-file
795
90
        else return -2; // truncated
796
202
    }
797
2.85k
    if (fp->is_be)
798
0
        ed_swap_4p(&block_len);
799
2.85k
    if (block_len < 32) return -4;  // block_len includes core data
800
2.63k
    if (fp->block_length - fp->block_offset > 32) {
801
        // Avoid bgzf_read and a temporary copy to a local buffer
802
2.30k
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
803
2.30k
        fp->block_offset += 32;
804
2.30k
    } else {
805
326
        x = tmp;
806
326
        if (bgzf_read(fp, x, 32) != 32) return -3;
807
326
    }
808
809
2.36k
    c->tid        = le_to_u32(x);
810
2.36k
    c->pos        = le_to_i32(x+4);
811
2.36k
    uint32_t x2   = le_to_u32(x+8);
812
2.36k
    c->bin        = x2>>16;
813
2.36k
    c->qual       = x2>>8&0xff;
814
2.36k
    c->l_qname    = x2&0xff;
815
2.36k
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
816
2.36k
    uint32_t x3   = le_to_u32(x+12);
817
2.36k
    c->flag       = x3>>16;
818
2.36k
    c->n_cigar    = x3&0xffff;
819
2.36k
    c->l_qseq     = le_to_u32(x+16);
820
2.36k
    c->mtid       = le_to_u32(x+20);
821
2.36k
    c->mpos       = le_to_i32(x+24);
822
2.36k
    c->isize      = le_to_i32(x+28);
823
824
2.36k
    new_l_data = block_len - 32 + c->l_extranul;
825
2.36k
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
826
2.28k
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
827
2.28k
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
828
122
        return -4;
829
2.16k
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
830
2.12k
    b->l_data = new_l_data;
831
832
2.12k
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
833
2.07k
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
834
1.30k
        if (fixup_missing_qname_nul(b) < 0) return -4;
835
1.30k
    }
836
5.14k
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
837
2.07k
    c->l_qname += c->l_extranul;
838
2.07k
    if (b->l_data < c->l_qname ||
839
2.07k
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
840
147
        return -4;
841
1.93k
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
842
1.93k
    if (bam_tag2cigar(b, 0, 0) < 0)
843
25
        return -4;
844
845
    // TODO: consider making this conditional
846
1.90k
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
847
1.51k
        hts_pos_t rlen, qlen;
848
1.51k
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
849
1.51k
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
850
1.51k
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
851
        // Sanity check for broken CIGAR alignments
852
1.51k
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
853
48
            hts_log_error("CIGAR and query sequence lengths differ for %s",
854
48
                    bam_get_qname(b));
855
48
            return -4;
856
48
        }
857
1.51k
    }
858
859
1.85k
    return 4 + block_len;
860
1.90k
}
861
862
int bam_write1(BGZF *fp, const bam1_t *b)
863
5.50M
{
864
5.50M
    const bam1_core_t *c = &b->core;
865
5.50M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
866
5.50M
    int i, ok;
867
5.50M
    if (c->l_qname - c->l_extranul > 255) {
868
2
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
869
2
        errno = EOVERFLOW;
870
2
        return -1;
871
2
    }
872
5.50M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
873
5.50M
    if (c->pos > INT_MAX ||
874
5.50M
        c->mpos > INT_MAX ||
875
5.50M
        c->isize < INT_MIN || c->isize > INT_MAX) {
876
101
        hts_log_error("Positional data is too large for BAM format");
877
101
        return -1;
878
101
    }
879
5.50M
    x[0] = c->tid;
880
5.50M
    x[1] = c->pos;
881
5.50M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
882
5.50M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
883
5.50M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
884
5.50M
    x[4] = c->l_qseq;
885
5.50M
    x[5] = c->mtid;
886
5.50M
    x[6] = c->mpos;
887
5.50M
    x[7] = c->isize;
888
5.50M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
889
5.50M
    if (fp->is_be) {
890
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
891
0
        y = block_len;
892
0
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
893
0
        swap_data(c, b->l_data, b->data, 1);
894
5.50M
    } else {
895
5.50M
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
896
5.50M
    }
897
5.50M
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
898
5.50M
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
899
5.50M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
900
5.50M
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
901
5.50M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
902
0
        uint8_t buf[8];
903
0
        uint32_t cigar_st, cigar_en, cigar[2];
904
0
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
905
0
        if (cigreflen >= (1<<28)) {
906
            // Length of reference covered is greater than the biggest
907
            // CIGAR operation currently allowed.
908
0
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
909
0
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
910
0
                          bam_get_qname(b), c->n_cigar, cigreflen);
911
0
            return -1;
912
0
        }
913
0
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
914
0
        cigar_en = cigar_st + c->n_cigar * 4;
915
0
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
916
0
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
917
0
        u32_to_le(cigar[0], buf);
918
0
        u32_to_le(cigar[1], buf + 4);
919
0
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
920
0
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
921
0
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
922
0
        u32_to_le(c->n_cigar, buf);
923
0
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
924
0
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
925
0
    }
926
5.50M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
927
5.50M
    return ok? 4 + block_len : -1;
928
5.50M
}
929
930
/*
931
 * Write a BAM file and append to the in-memory index simultaneously.
932
 */
933
5.50M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
934
5.50M
    BGZF *bfp = fp->fp.bgzf;
935
936
5.50M
    if (!fp->idx)
937
5.50M
        return bam_write1(bfp, b);
938
939
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
940
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
941
0
        return -1;
942
0
    if (!bfp->mt)
943
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
944
945
0
    int ret = bam_write1(bfp, b);
946
0
    if (ret < 0)
947
0
        return -1;
948
949
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
950
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
951
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
952
0
        ret = -1;
953
0
    }
954
955
0
    return ret;
956
0
}
957
958
/*
959
 * Set the qname in a BAM record
960
 */
961
int bam_set_qname(bam1_t *rec, const char *qname)
962
0
{
963
0
    if (!rec) return -1;
964
0
    if (!qname || !*qname) return -1;
965
966
0
    size_t old_len = rec->core.l_qname;
967
0
    size_t new_len = strlen(qname) + 1;
968
0
    if (new_len < 1 || new_len > 255) return -1;
969
970
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
971
972
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
973
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
974
975
    // Make room
976
0
    if (new_len + extranul != rec->core.l_qname)
977
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
978
    // Copy in new name and pad if needed
979
0
    memcpy(rec->data, qname, new_len);
980
0
    int n;
981
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
982
983
0
    rec->l_data = new_data_len;
984
0
    rec->core.l_qname = new_len + extranul;
985
0
    rec->core.l_extranul = extranul;
986
987
0
    return 0;
988
0
}
989
990
/********************
991
 *** BAM indexing ***
992
 ********************/
993
994
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
995
0
{
996
0
    int n_lvls, i, fmt, ret;
997
0
    bam1_t *b;
998
0
    hts_idx_t *idx;
999
0
    sam_hdr_t *h;
1000
0
    h = sam_hdr_read(fp);
1001
0
    if (h == NULL) return NULL;
1002
0
    if (min_shift > 0) {
1003
0
        hts_pos_t max_len = 0;
1004
0
        for (i = 0; i < h->n_targets; ++i) {
1005
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
1006
0
            if (max_len < len) max_len = len;
1007
0
        }
1008
0
        n_lvls = 0;
1009
0
        hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1010
0
        fmt = HTS_FMT_CSI;
1011
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1012
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1013
0
    b = bam_init1();
1014
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
1015
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
1016
0
        if (ret < 0) { // unsorted or doesn't fit
1017
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
1018
0
            goto err;
1019
0
        }
1020
0
    }
1021
0
    if (ret < -1) goto err; // corrupted BAM file
1022
1023
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
1024
0
    sam_hdr_destroy(h);
1025
0
    bam_destroy1(b);
1026
0
    return idx;
1027
1028
0
err:
1029
0
    bam_destroy1(b);
1030
0
    hts_idx_destroy(idx);
1031
0
    return NULL;
1032
0
}
1033
1034
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1035
0
{
1036
0
    hts_idx_t *idx;
1037
0
    htsFile *fp;
1038
0
    int ret = 0;
1039
1040
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1041
0
    if (nthreads)
1042
0
        hts_set_threads(fp, nthreads);
1043
1044
0
    switch (fp->format.format) {
1045
0
    case cram:
1046
1047
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1048
0
        break;
1049
1050
0
    case bam:
1051
0
    case sam:
1052
0
        if (fp->format.compression != bgzf) {
1053
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1054
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1055
0
            ret = -1;
1056
0
            break;
1057
0
        }
1058
0
        idx = sam_index(fp, min_shift);
1059
0
        if (idx) {
1060
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1061
0
            if (ret < 0) ret = -4;
1062
0
            hts_idx_destroy(idx);
1063
0
        }
1064
0
        else ret = -1;
1065
0
        break;
1066
1067
0
    default:
1068
0
        ret = -3;
1069
0
        break;
1070
0
    }
1071
0
    hts_close(fp);
1072
1073
0
    return ret;
1074
0
}
1075
1076
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1077
0
{
1078
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1079
0
}
1080
1081
int sam_index_build(const char *fn, int min_shift)
1082
0
{
1083
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1084
0
}
1085
1086
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1087
#undef bam_index_build
1088
int bam_index_build(const char *fn, int min_shift)
1089
0
{
1090
0
    return sam_index_build2(fn, NULL, min_shift);
1091
0
}
1092
1093
// Initialise fp->idx for the current format type.
1094
// This must be called after the header has been written but no other data.
1095
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1096
0
    fp->fnidx = fnidx;
1097
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1098
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1099
0
        int n_lvls, fmt = HTS_FMT_CSI;
1100
0
        if (min_shift > 0) {
1101
0
            int64_t max_len = 0;
1102
0
            int i;
1103
0
            for (i = 0; i < h->n_targets; ++i)
1104
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1105
0
            n_lvls = 0;
1106
0
            hts_adjust_csi_settings(max_len, &min_shift, &n_lvls);
1107
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1108
1109
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1110
0
        return fp->idx ? 0 : -1;
1111
0
    }
1112
1113
0
    if (fp->format.format == cram) {
1114
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1115
0
        return fp->fp.cram->idxfp ? 0 : -1;
1116
0
    }
1117
1118
0
    return -1;
1119
0
}
1120
1121
// Finishes an index. Call after the last record has been written.
1122
// Returns 0 on success, <0 on failure.
1123
0
int sam_idx_save(htsFile *fp) {
1124
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1125
0
        fp->format.format == vcf || fp->format.format == sam) {
1126
0
        int ret;
1127
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1128
0
            errno = -ret;
1129
0
            return -1;
1130
0
        }
1131
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1132
0
            return -1;
1133
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1134
1135
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1136
0
            return -1;
1137
1138
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1139
1140
0
    } else if (fp->format.format == cram) {
1141
        // flushed and closed by cram_close
1142
0
    }
1143
1144
0
    return 0;
1145
0
}
1146
1147
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1148
0
{
1149
0
    htsFile *fp = (htsFile *)fpv;
1150
0
    bam1_t *b = bv;
1151
0
    fp->line.l = 0;
1152
0
    int ret = sam_read1(fp, fp->bam_header, b);
1153
0
    if (ret >= 0) {
1154
0
        *tid = b->core.tid;
1155
0
        *beg = b->core.pos;
1156
0
        *end = bam_endpos(b);
1157
0
    }
1158
0
    return ret;
1159
0
}
1160
1161
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1162
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1163
0
{
1164
0
    htsFile *fp = (htsFile *)fpv;
1165
0
    bam1_t *b = bv;
1166
0
    fp->line.l = 0;
1167
0
    int ret = sam_read1(fp, fp->bam_header, b);
1168
0
    return ret;
1169
0
}
1170
1171
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1172
// samtools/bam.c.
1173
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1174
0
{
1175
0
    const char *rg;
1176
0
    kstring_t lib = { 0, 0, NULL };
1177
0
    rg = (char *)bam_aux_get(b, "RG");
1178
1179
0
    if (!rg)
1180
0
        return NULL;
1181
0
    else
1182
0
        rg++;
1183
1184
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1185
0
        return NULL;
1186
1187
0
    static char LB_text[1024];
1188
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1189
1190
0
    memcpy(LB_text, lib.s, len);
1191
0
    LB_text[len] = 0;
1192
1193
0
    free(lib.s);
1194
1195
0
    return LB_text;
1196
0
}
1197
1198
1199
// Bam record pointer and SAM header combined
1200
typedef struct {
1201
    const sam_hdr_t *h;
1202
    const bam1_t *b;
1203
} hb_pair;
1204
1205
// Looks up variable names in str and replaces them with their value.
1206
// Also supports aux tags.
1207
//
1208
// Note the expression parser deliberately overallocates str size so it
1209
// is safe to use memcmp over strcmp.
1210
static int bam_sym_lookup(void *data, char *str, char **end,
1211
0
                          hts_expr_val_t *res) {
1212
0
    hb_pair *hb = (hb_pair *)data;
1213
0
    const bam1_t *b = hb->b;
1214
1215
0
    res->is_str = 0;
1216
0
    switch(*str) {
1217
0
    case 'c':
1218
0
        if (memcmp(str, "cigar", 5) == 0) {
1219
0
            *end = str+5;
1220
0
            res->is_str = 1;
1221
0
            ks_clear(&res->s);
1222
0
            uint32_t *cigar = bam_get_cigar(b);
1223
0
            int i, n = b->core.n_cigar, r = 0;
1224
0
            if (n) {
1225
0
                for (i = 0; i < n; i++) {
1226
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1227
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1228
0
                }
1229
0
                r |= kputs("", &res->s) < 0;
1230
0
            } else {
1231
0
                r |= kputs("*", &res->s) < 0;
1232
0
            }
1233
0
            return r ? -1 : 0;
1234
0
        }
1235
0
        break;
1236
1237
0
    case 'e':
1238
0
        if (memcmp(str, "endpos", 6) == 0) {
1239
0
            *end = str+6;
1240
0
            res->d = bam_endpos(b);
1241
0
            return 0;
1242
0
        }
1243
0
        break;
1244
1245
0
    case 'f':
1246
0
        if (memcmp(str, "flag", 4) == 0) {
1247
0
            str = *end = str+4;
1248
0
            if (*str != '.') {
1249
0
                res->d = b->core.flag;
1250
0
                return 0;
1251
0
            } else {
1252
0
                str++;
1253
0
                if (!memcmp(str, "paired", 6)) {
1254
0
                    *end = str+6;
1255
0
                    res->d = b->core.flag & BAM_FPAIRED;
1256
0
                    return 0;
1257
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1258
0
                    *end = str+11;
1259
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1260
0
                    return 0;
1261
0
                } else if (!memcmp(str, "unmap", 5)) {
1262
0
                    *end = str+5;
1263
0
                    res->d = b->core.flag & BAM_FUNMAP;
1264
0
                    return 0;
1265
0
                } else if (!memcmp(str, "munmap", 6)) {
1266
0
                    *end = str+6;
1267
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1268
0
                    return 0;
1269
0
                } else if (!memcmp(str, "reverse", 7)) {
1270
0
                    *end = str+7;
1271
0
                    res->d = b->core.flag & BAM_FREVERSE;
1272
0
                    return 0;
1273
0
                } else if (!memcmp(str, "mreverse", 8)) {
1274
0
                    *end = str+8;
1275
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1276
0
                    return 0;
1277
0
                } else if (!memcmp(str, "read1", 5)) {
1278
0
                    *end = str+5;
1279
0
                    res->d = b->core.flag & BAM_FREAD1;
1280
0
                    return 0;
1281
0
                } else if (!memcmp(str, "read2", 5)) {
1282
0
                    *end = str+5;
1283
0
                    res->d = b->core.flag & BAM_FREAD2;
1284
0
                    return 0;
1285
0
                } else if (!memcmp(str, "secondary", 9)) {
1286
0
                    *end = str+9;
1287
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1288
0
                    return 0;
1289
0
                } else if (!memcmp(str, "qcfail", 6)) {
1290
0
                    *end = str+6;
1291
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1292
0
                    return 0;
1293
0
                } else if (!memcmp(str, "dup", 3)) {
1294
0
                    *end = str+3;
1295
0
                    res->d = b->core.flag & BAM_FDUP;
1296
0
                    return 0;
1297
0
                } else if (!memcmp(str, "supplementary", 13)) {
1298
0
                    *end = str+13;
1299
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1300
0
                    return 0;
1301
0
                } else {
1302
0
                    hts_log_error("Unrecognised flag string");
1303
0
                    return -1;
1304
0
                }
1305
0
            }
1306
0
        }
1307
0
        break;
1308
1309
0
    case 'h':
1310
0
        if (memcmp(str, "hclen", 5) == 0) {
1311
0
            int hclen = 0;
1312
0
            uint32_t *cigar = bam_get_cigar(b);
1313
0
            uint32_t ncigar = b->core.n_cigar;
1314
1315
            // left
1316
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1317
0
                hclen = bam_cigar_oplen(cigar[0]);
1318
1319
            // right
1320
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1321
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1322
1323
0
            *end = str+5;
1324
0
            res->d = hclen;
1325
0
            return 0;
1326
0
        }
1327
0
        break;
1328
1329
0
    case 'l':
1330
0
        if (memcmp(str, "library", 7) == 0) {
1331
0
            *end = str+7;
1332
0
            res->is_str = 1;
1333
0
            const char *lib = bam_get_library(hb->h, b);
1334
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1335
0
            return 0;
1336
0
        }
1337
0
        break;
1338
1339
0
    case 'm':
1340
0
        if (memcmp(str, "mapq", 4) == 0) {
1341
0
            *end = str+4;
1342
0
            res->d = b->core.qual;
1343
0
            return 0;
1344
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1345
0
            *end = str+4;
1346
0
            res->d = b->core.mpos+1;
1347
0
            return 0;
1348
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1349
0
            *end = str+6;
1350
0
            res->is_str = 1;
1351
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1352
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1353
0
            return 0;
1354
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1355
0
            *end = str+6;
1356
0
            res->d = b->core.mtid;
1357
0
            return 0;
1358
0
        }
1359
0
        break;
1360
1361
0
    case 'n':
1362
0
        if (memcmp(str, "ncigar", 6) == 0) {
1363
0
            *end = str+6;
1364
0
            res->d = b->core.n_cigar;
1365
0
            return 0;
1366
0
        }
1367
0
        break;
1368
1369
0
    case 'p':
1370
0
        if (memcmp(str, "pos", 3) == 0) {
1371
0
            *end = str+3;
1372
0
            res->d = b->core.pos+1;
1373
0
            return 0;
1374
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1375
0
            *end = str+5;
1376
0
            res->d = b->core.mpos+1;
1377
0
            return 0;
1378
0
        }
1379
0
        break;
1380
1381
0
    case 'q':
1382
0
        if (memcmp(str, "qlen", 4) == 0) {
1383
0
            *end = str+4;
1384
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1385
0
            return 0;
1386
0
        } else if (memcmp(str, "qname", 5) == 0) {
1387
0
            *end = str+5;
1388
0
            res->is_str = 1;
1389
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1390
0
            return 0;
1391
0
        } else if (memcmp(str, "qual", 4) == 0) {
1392
0
            *end = str+4;
1393
0
            ks_clear(&res->s);
1394
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1395
0
                return -1;
1396
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1397
0
            res->s.l = b->core.l_qseq;
1398
0
            res->is_str = 1;
1399
0
            return 0;
1400
0
        }
1401
0
        break;
1402
1403
0
    case 'r':
1404
0
        if (memcmp(str, "rlen", 4) == 0) {
1405
0
            *end = str+4;
1406
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1407
0
            return 0;
1408
0
        } else if (memcmp(str, "rname", 5) == 0) {
1409
0
            *end = str+5;
1410
0
            res->is_str = 1;
1411
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1412
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1413
0
            return 0;
1414
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1415
0
            *end = str+5;
1416
0
            res->is_str = 1;
1417
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1418
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1419
0
            return 0;
1420
0
        } else if (memcmp(str, "refid", 5) == 0) {
1421
0
            *end = str+5;
1422
0
            res->d = b->core.tid;
1423
0
            return 0;
1424
0
        }
1425
0
        break;
1426
1427
0
    case 's':
1428
0
        if (memcmp(str, "seq", 3) == 0) {
1429
0
            *end = str+3;
1430
0
            ks_clear(&res->s);
1431
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1432
0
                return -1;
1433
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1434
0
            res->s.s[b->core.l_qseq] = 0;
1435
0
            res->s.l = b->core.l_qseq;
1436
0
            res->is_str = 1;
1437
0
            return 0;
1438
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1439
0
            int sclen = 0;
1440
0
            uint32_t *cigar = bam_get_cigar(b);
1441
0
            int ncigar = b->core.n_cigar;
1442
0
            int left = 0;
1443
1444
            // left
1445
0
            if (ncigar > 0
1446
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1447
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1448
0
            else if (ncigar > 1
1449
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1450
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1451
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1452
1453
            // right
1454
0
            if (ncigar-1 > left
1455
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1456
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1457
0
            else if (ncigar-2 > left
1458
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1459
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1460
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1461
1462
0
            *end = str+5;
1463
0
            res->d = sclen;
1464
0
            return 0;
1465
0
        }
1466
0
        break;
1467
1468
0
    case 't':
1469
0
        if (memcmp(str, "tlen", 4) == 0) {
1470
0
            *end = str+4;
1471
0
            res->d = b->core.isize;
1472
0
            return 0;
1473
0
        }
1474
0
        break;
1475
1476
0
    case '[':
1477
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1478
            /* aux tags */
1479
0
            *end = str+4;
1480
1481
0
            uint8_t *aux = bam_aux_get(b, str+1);
1482
0
            if (aux) {
1483
                // we define the truth of a tag to be its presence, even if 0.
1484
0
                res->is_true = 1;
1485
0
                switch (*aux) {
1486
0
                case 'Z':
1487
0
                case 'H':
1488
0
                    res->is_str = 1;
1489
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1490
0
                    break;
1491
1492
0
                case 'A':
1493
0
                    res->is_str = 1;
1494
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1495
0
                    break;
1496
1497
0
                case 'i': case 'I':
1498
0
                case 's': case 'S':
1499
0
                case 'c': case 'C':
1500
0
                    res->is_str = 0;
1501
0
                    res->d = bam_aux2i(aux);
1502
0
                    break;
1503
1504
0
                case 'f':
1505
0
                case 'd':
1506
0
                    res->is_str = 0;
1507
0
                    res->d = bam_aux2f(aux);
1508
0
                    break;
1509
1510
0
                default:
1511
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1512
0
                                  *aux);
1513
0
                    return -1;
1514
0
                }
1515
0
                return 0;
1516
1517
0
            } else {
1518
                // hence absent tags are always false (and strings)
1519
0
                res->is_str = 1;
1520
0
                res->s.l = 0;
1521
0
                res->d = 0;
1522
0
                res->is_true = 0;
1523
0
                return 0;
1524
0
            }
1525
0
        }
1526
0
        break;
1527
0
    }
1528
1529
    // All successful matches in switch should return 0.
1530
    // So if we didn't match, it's a parse error.
1531
0
    return -1;
1532
0
}
1533
1534
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1535
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1536
0
{
1537
0
    hb_pair hb = {h, b};
1538
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1539
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1540
0
        hts_log_error("Couldn't process filter expression");
1541
0
        hts_expr_val_free(&res);
1542
0
        return -1;
1543
0
    }
1544
1545
0
    int t = res.is_true;
1546
0
    hts_expr_val_free(&res);
1547
1548
0
    return t;
1549
0
}
1550
1551
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1552
0
{
1553
0
    htsFile *fp = fpv;
1554
0
    bam1_t *b = bv;
1555
0
    int pass_filter, ret;
1556
1557
0
    do {
1558
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1559
0
        if (ret < 0)
1560
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1561
1562
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1563
0
            return -2;
1564
1565
0
        *tid = b->core.tid;
1566
0
        *beg = b->core.pos;
1567
0
        *end = bam_endpos(b);
1568
1569
0
        if (fp->filter) {
1570
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1571
0
            if (pass_filter < 0)
1572
0
                return -2;
1573
0
        } else {
1574
0
            pass_filter = 1;
1575
0
        }
1576
0
    } while (pass_filter == 0);
1577
1578
0
    return ret;
1579
0
}
1580
1581
static int cram_pseek(void *fp, int64_t offset, int whence)
1582
0
{
1583
0
    cram_fd *fd =  (cram_fd *)fp;
1584
1585
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1586
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1587
0
        return -1;
1588
1589
0
    fd->curr_position = offset;
1590
1591
0
    if (fd->ctr) {
1592
0
        cram_free_container(fd->ctr);
1593
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1594
0
            cram_free_container(fd->ctr_mt);
1595
1596
0
        fd->ctr = NULL;
1597
0
        fd->ctr_mt = NULL;
1598
0
        fd->ooc = 0;
1599
0
    }
1600
1601
0
    return 0;
1602
0
}
1603
1604
/*
1605
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1606
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1607
 *   container previously fetched. It was designed like this to integrate with the functionality
1608
 *   of the iterator stepping logic.
1609
 */
1610
1611
static int64_t cram_ptell(void *fp)
1612
0
{
1613
0
    cram_fd *fd = (cram_fd *)fp;
1614
0
    cram_container *c;
1615
0
    cram_slice *s;
1616
0
    int64_t ret = -1L;
1617
1618
0
    if (fd) {
1619
0
        if ((c = fd->ctr) != NULL) {
1620
0
            if ((s = c->slice) != NULL && s->max_rec) {
1621
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1622
0
                    fd->curr_position += c->offset + c->length;
1623
0
            }
1624
0
        }
1625
0
        ret = fd->curr_position;
1626
0
    }
1627
1628
0
    return ret;
1629
0
}
1630
1631
static int bam_pseek(void *fp, int64_t offset, int whence)
1632
0
{
1633
0
    BGZF *fd = (BGZF *)fp;
1634
1635
0
    return bgzf_seek(fd, offset, whence);
1636
0
}
1637
1638
static int64_t bam_ptell(void *fp)
1639
0
{
1640
0
    BGZF *fd = (BGZF *)fp;
1641
0
    if (!fd)
1642
0
        return -1L;
1643
1644
0
    return bgzf_tell(fd);
1645
0
}
1646
1647
1648
1649
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1650
0
{
1651
0
    switch (fp->format.format) {
1652
0
    case bam:
1653
0
    case sam:
1654
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1655
1656
0
    case cram: {
1657
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1658
1659
        // Cons up a fake "index" just pointing at the associated cram_fd:
1660
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1661
0
        if (idx == NULL) return NULL;
1662
0
        idx->fmt = HTS_FMT_CRAI;
1663
0
        idx->cram = fp->fp.cram;
1664
0
        return (hts_idx_t *) idx;
1665
0
        }
1666
1667
0
    default:
1668
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1669
0
    }
1670
0
}
1671
1672
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1673
0
{
1674
0
    return index_load(fp, fn, fnidx, flags);
1675
0
}
1676
1677
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1678
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1679
0
}
1680
1681
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1682
0
{
1683
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1684
0
}
1685
1686
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1687
0
{
1688
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1689
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1690
0
    if (iter == NULL) return NULL;
1691
1692
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1693
    // the readrec function:
1694
0
    iter->is_cram = 1;
1695
0
    iter->read_rest = 1;
1696
0
    iter->off = NULL;
1697
0
    iter->bins.a = NULL;
1698
0
    iter->readrec = readrec;
1699
1700
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1701
0
        cram_range r = { tid, beg+1, end };
1702
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1703
1704
0
        iter->curr_off = 0;
1705
        // The following fields are not required by hts_itr_next(), but are
1706
        // filled in in case user code wants to look at them.
1707
0
        iter->tid = tid;
1708
0
        iter->beg = beg;
1709
0
        iter->end = end;
1710
1711
0
        switch (ret) {
1712
0
        case 0:
1713
0
            break;
1714
1715
0
        case -2:
1716
            // No data vs this ref, so mark iterator as completed.
1717
            // Same as HTS_IDX_NONE.
1718
0
            iter->finished = 1;
1719
0
            break;
1720
1721
0
        default:
1722
0
            free(iter);
1723
0
            return NULL;
1724
0
        }
1725
0
    }
1726
0
    else switch (tid) {
1727
0
    case HTS_IDX_REST:
1728
0
        iter->curr_off = 0;
1729
0
        break;
1730
0
    case HTS_IDX_NONE:
1731
0
        iter->curr_off = 0;
1732
0
        iter->finished = 1;
1733
0
        break;
1734
0
    default:
1735
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1736
0
        abort();
1737
0
        break;
1738
0
    }
1739
1740
0
    return iter;
1741
0
}
1742
1743
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1744
0
{
1745
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1746
0
    if (idx == NULL)
1747
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1748
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1749
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1750
0
    else
1751
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1752
0
}
1753
1754
static int cram_name2id(void *fdv, const char *ref)
1755
0
{
1756
0
    cram_fd *fd = (cram_fd *) fdv;
1757
0
    return sam_hdr_name2tid(fd->header, ref);
1758
0
}
1759
1760
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1761
0
{
1762
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1763
0
    return hts_itr_querys(idx, region, bam_name2id_wrapper, hdr,
1764
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1765
0
                          sam_readrec);
1766
0
}
1767
1768
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1769
0
{
1770
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1771
0
    hts_reglist_t *r_list = NULL;
1772
0
    int r_count = 0;
1773
1774
0
    if (!cidx || !hdr)
1775
0
        return NULL;
1776
1777
0
    hts_itr_t *itr = NULL;
1778
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1779
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1780
0
        if (!r_list)
1781
0
            return NULL;
1782
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1783
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1784
0
    } else {
1785
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, bam_name2id_wrapper);
1786
0
        if (!r_list)
1787
0
            return NULL;
1788
0
        itr = hts_itr_regions(idx, r_list, r_count, bam_name2id_wrapper, hdr,
1789
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1790
0
    }
1791
1792
0
    if (!itr)
1793
0
        hts_reglist_free(r_list, r_count);
1794
1795
0
    return itr;
1796
0
}
1797
1798
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1799
0
{
1800
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1801
1802
0
    if(!cidx || !hdr || !reglist)
1803
0
        return NULL;
1804
1805
0
    if (cidx->fmt == HTS_FMT_CRAI)
1806
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1807
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1808
0
    else
1809
0
        return hts_itr_regions(idx, reglist, regcount, bam_name2id_wrapper, hdr,
1810
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1811
0
}
1812
1813
/**********************
1814
 *** SAM header I/O ***
1815
 **********************/
1816
1817
#include "htslib/kseq.h"
1818
#include "htslib/kstring.h"
1819
1820
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1821
0
{
1822
0
    sam_hdr_t *bh = sam_hdr_init();
1823
0
    if (!bh) return NULL;
1824
1825
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1826
0
        sam_hdr_destroy(bh);
1827
0
        return NULL;
1828
0
    }
1829
1830
0
    return bh;
1831
0
}
1832
1833
// Minimal sanitisation of a header to ensure.
1834
// - null terminated string.
1835
// - all lines start with @ (also implies no blank lines).
1836
//
1837
// Much more could be done, but currently is not, including:
1838
// - checking header types are known (HD, SQ, etc).
1839
// - syntax (eg checking tab separated fields).
1840
// - validating n_targets matches @SQ records.
1841
// - validating target lengths against @SQ records.
1842
12.4k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1843
12.4k
    if (!h)
1844
441
        return NULL;
1845
1846
    // Special case for empty headers.
1847
12.0k
    if (h->l_text == 0)
1848
7.18k
        return h;
1849
1850
4.87k
    size_t i;
1851
4.87k
    unsigned int lnum = 0;
1852
4.87k
    char *cp = h->text, last = '\n';
1853
424k
    for (i = 0; i < h->l_text; i++) {
1854
        // NB: l_text excludes terminating nul.  This finds early ones.
1855
420k
        if (cp[i] == 0)
1856
1.01k
            break;
1857
1858
        // Error on \n[^@], including duplicate newlines
1859
419k
        if (last == '\n') {
1860
8.31k
            lnum++;
1861
8.31k
            if (cp[i] != '@') {
1862
3
                hts_log_error("Malformed SAM header at line %u", lnum);
1863
3
                sam_hdr_destroy(h);
1864
3
                return NULL;
1865
3
            }
1866
8.31k
        }
1867
1868
419k
        last = cp[i];
1869
419k
    }
1870
1871
4.86k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1872
1.01k
        size_t j = i;
1873
4.46k
        while (j < h->l_text && cp[j] == '\0') j++;
1874
1.01k
        if (j < h->l_text)
1875
864
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1876
1.01k
    }
1877
1878
    // Add trailing newline and/or trailing nul if required.
1879
4.86k
    if (last != '\n') {
1880
906
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1881
1882
906
        if (h->l_text < 2 || i >= h->l_text - 2) {
1883
450
            if (h->l_text >= SIZE_MAX - 2) {
1884
0
                hts_log_error("No room for extra newline");
1885
0
                sam_hdr_destroy(h);
1886
0
                return NULL;
1887
0
            }
1888
1889
450
            cp = realloc(h->text, (size_t) h->l_text+2);
1890
450
            if (!cp) {
1891
0
                sam_hdr_destroy(h);
1892
0
                return NULL;
1893
0
            }
1894
450
            h->text = cp;
1895
450
        }
1896
906
        cp[i++] = '\n';
1897
1898
        // l_text may be larger already due to multiple nul padding
1899
906
        if (h->l_text < i)
1900
39
            h->l_text = i;
1901
906
        cp[h->l_text] = '\0';
1902
906
    }
1903
1904
4.86k
    return h;
1905
4.86k
}
1906
1907
13.8k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1908
13.8k
    sam_hdr_t* h = sam_hdr_init();
1909
13.8k
    if (!h)
1910
0
        return NULL;
1911
1912
13.8k
    if (sam_hdr_build_from_sam_file(h, fp) != 0) {
1913
9.30k
        sam_hdr_destroy(h);
1914
9.30k
        return NULL;
1915
9.30k
    }
1916
1917
4.57k
    if (fp->bam_header)
1918
0
        sam_hdr_destroy(fp->bam_header);
1919
4.57k
    fp->bam_header = sam_hdr_sanitise(h);
1920
4.57k
    fp->bam_header->ref_count = 1;
1921
1922
4.57k
    return fp->bam_header;
1923
13.8k
}
1924
1925
sam_hdr_t *sam_hdr_read(htsFile *fp)
1926
26.6k
{
1927
26.6k
    sam_hdr_t *h = NULL;
1928
26.6k
    if (!fp) {
1929
0
        errno = EINVAL;
1930
0
        return NULL;
1931
0
    }
1932
1933
26.6k
    switch (fp->format.format) {
1934
2.13k
    case bam:
1935
2.13k
        h = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
1936
2.13k
        break;
1937
1938
5.78k
    case cram:
1939
5.78k
        h = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
1940
5.78k
        break;
1941
1942
13.8k
    case sam:
1943
13.8k
        h = sam_hdr_create(fp);
1944
13.8k
        break;
1945
1946
243
    case fastq_format:
1947
4.84k
    case fasta_format:
1948
4.84k
        return sam_hdr_init();
1949
1950
0
    case empty_format:
1951
0
        errno = EPIPE;
1952
0
        return NULL;
1953
1954
0
    default:
1955
0
        errno = EFTYPE;
1956
0
        return NULL;
1957
26.6k
    }
1958
    //only sam,bam and cram reaches here
1959
21.8k
    if (h && !fp->bam_header) { //set except for sam which already has it
1960
        //for cram, it is the o/p header as for rest and not the internal header
1961
7.47k
        fp->bam_header = h;
1962
7.47k
        sam_hdr_incr_ref(fp->bam_header);
1963
7.47k
    }
1964
21.8k
    return h;
1965
26.6k
}
1966
1967
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
1968
16.9k
{
1969
16.9k
    if (!fp || !h) {
1970
0
        errno = EINVAL;
1971
0
        return -1;
1972
0
    }
1973
1974
16.9k
    switch (fp->format.format) {
1975
5.63k
    case binary_format:
1976
5.63k
        fp->format.category = sequence_data;
1977
5.63k
        fp->format.format = bam;
1978
        /* fall-through */
1979
5.63k
    case bam:
1980
5.63k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
1981
5.63k
        break;
1982
1983
5.63k
    case cram: {
1984
5.63k
        cram_fd *fd = fp->fp.cram;
1985
5.63k
        if (cram_set_header2(fd, h) < 0) return -1;
1986
5.55k
        if (fp->fn_aux)
1987
0
            cram_load_reference(fd, fp->fn_aux);
1988
5.55k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
1989
5.55k
        }
1990
5.55k
        break;
1991
1992
5.63k
    case text_format:
1993
5.63k
        fp->format.category = sequence_data;
1994
5.63k
        fp->format.format = sam;
1995
        /* fall-through */
1996
5.63k
    case sam: {
1997
5.63k
        if (!h->hrecs && !h->text)
1998
0
            return 0;
1999
5.63k
        char *text;
2000
5.63k
        kstring_t hdr_ks = { 0, 0, NULL };
2001
5.63k
        size_t l_text;
2002
5.63k
        ssize_t bytes;
2003
5.63k
        int r = 0, no_sq = 0;
2004
2005
5.63k
        if (h->hrecs) {
2006
5.57k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2007
0
                return -1;
2008
5.57k
            text = hdr_ks.s;
2009
5.57k
            l_text = hdr_ks.l;
2010
5.57k
        } else {
2011
59
            const char *p = NULL;
2012
80
            do {
2013
80
                const char *q = p == NULL ? h->text : p + 4;
2014
80
                p = strstr(q, "@SQ\t");
2015
80
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2016
59
            no_sq = p == NULL;
2017
59
            text = h->text;
2018
59
            l_text = h->l_text;
2019
59
        }
2020
2021
5.63k
        if (fp->is_bgzf) {
2022
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2023
5.63k
        } else {
2024
5.63k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2025
5.63k
        }
2026
5.63k
        free(hdr_ks.s);
2027
5.63k
        if (bytes != l_text)
2028
0
            return -1;
2029
2030
5.63k
        if (no_sq) {
2031
52
            int i;
2032
117
            for (i = 0; i < h->n_targets; ++i) {
2033
65
                fp->line.l = 0;
2034
65
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2035
65
                r |= kputs(h->target_name[i], &fp->line) < 0;
2036
65
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2037
65
                r |= kputw(h->target_len[i], &fp->line) < 0;
2038
65
                r |= kputc('\n', &fp->line) < 0;
2039
65
                if (r != 0)
2040
0
                    return -1;
2041
2042
65
                if (fp->is_bgzf) {
2043
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2044
65
                } else {
2045
65
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2046
65
                }
2047
65
                if (bytes != fp->line.l)
2048
0
                    return -1;
2049
65
            }
2050
52
        }
2051
5.63k
        if (fp->is_bgzf) {
2052
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2053
5.63k
        } else {
2054
5.63k
            if (hflush(fp->fp.hfile) != 0) return -1;
2055
5.63k
        }
2056
5.63k
        }
2057
5.63k
        break;
2058
2059
5.63k
    case fastq_format:
2060
0
    case fasta_format:
2061
        // Nothing to output; FASTQ has no file headers.
2062
0
        return 0;
2063
0
        break;
2064
2065
0
    default:
2066
0
        errno = EBADF;
2067
0
        return -1;
2068
16.9k
    }
2069
    //only sam,bam and cram reaches here
2070
16.8k
    if (h) {    //the new header
2071
16.8k
        sam_hdr_t *tmp = fp->bam_header;
2072
16.8k
        fp->bam_header = sam_hdr_dup(h);
2073
16.8k
        sam_hdr_destroy(tmp);
2074
16.8k
        if (!fp->bam_header && h)
2075
0
            return -1;  //failed to duplicate
2076
16.8k
    }
2077
16.8k
    return 0;
2078
16.8k
}
2079
2080
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2081
0
{
2082
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2083
0
    size_t new_l_text;
2084
0
    if (!h || !key)
2085
0
        return -1;
2086
2087
0
    if (h->l_text > 3) {
2088
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2089
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2090
0
            *p = '\0'; // for strstr call
2091
2092
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2093
2094
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2095
0
                *p = '\n'; // change back
2096
2097
                // mark the key:val
2098
0
                beg = q;
2099
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2100
0
                end = q;
2101
2102
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2103
0
                    && strlen(val) == end - beg - 4)
2104
0
                     return 0; // val is the same, no need to change
2105
2106
0
            } else {
2107
0
                beg = end = p;
2108
0
                *p = '\n';
2109
0
            }
2110
0
        }
2111
0
    }
2112
0
    if (beg == NULL) { // no @HD
2113
0
        new_l_text = h->l_text;
2114
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2115
0
            return -1;
2116
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2117
0
        if (val) {
2118
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2119
0
                return -1;
2120
0
            new_l_text += strlen(val) + 4;
2121
0
        }
2122
0
        newtext = (char*)malloc(new_l_text + 1);
2123
0
        if (!newtext) return -1;
2124
2125
0
        if (val)
2126
0
            snprintf(newtext, new_l_text + 1,
2127
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2128
0
        else
2129
0
            snprintf(newtext, new_l_text + 1,
2130
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2131
0
    } else { // has @HD but different or no key
2132
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2133
0
        if (val) {
2134
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2135
0
                return -1;
2136
0
            new_l_text += strlen(val) + 4;
2137
0
        }
2138
0
        newtext = (char*)malloc(new_l_text + 1);
2139
0
        if (!newtext) return -1;
2140
2141
0
        if (val) {
2142
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2143
0
                    (int) (beg - h->text), h->text, key, val, end);
2144
0
        } else { //delete key
2145
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2146
0
                    (int) (beg - h->text), h->text, end);
2147
0
        }
2148
0
    }
2149
0
    free(h->text);
2150
0
    h->text = newtext;
2151
0
    h->l_text = new_l_text;
2152
0
    return 0;
2153
0
}
2154
2155
2156
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2157
0
{
2158
0
    if (!h || !key)
2159
0
        return -1;
2160
2161
0
    if (!h->hrecs)
2162
0
        return old_sam_hdr_change_HD(h, key, val);
2163
2164
0
    if (val) {
2165
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2166
0
            return -1;
2167
0
    } else {
2168
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2169
0
            return -1;
2170
0
    }
2171
0
    return sam_hdr_rebuild(h);
2172
0
}
2173
2174
/* releases existing header and sets new one; increments ref count if not
2175
duplicating */
2176
int sam_hdr_set(samFile *fp, sam_hdr_t *h, int duplicate)
2177
0
{
2178
0
    if (!fp)
2179
0
        return -1;
2180
2181
0
    if (duplicate) {
2182
0
        sam_hdr_t *tmp = fp->bam_header;
2183
0
        fp->bam_header = sam_hdr_dup(h);
2184
0
        sam_hdr_destroy(tmp);
2185
0
        if (!fp->bam_header && h)
2186
0
            return -1;  //duplicate failed
2187
0
    } else {
2188
0
        if (fp->bam_header != h) {  //if not the same
2189
0
            sam_hdr_destroy(fp->bam_header);
2190
0
            fp->bam_header = h;
2191
0
            sam_hdr_incr_ref(fp->bam_header);
2192
0
        }
2193
0
    }
2194
2195
0
    return 0;
2196
0
}
2197
2198
//return the bam_header, user has to use sam_hdr_incr_ref where ever required
2199
sam_hdr_t* sam_hdr_get(samFile* fp)
2200
0
{
2201
0
    if (!fp)
2202
0
        return NULL;
2203
0
    return fp->bam_header;
2204
0
}
2205
2206
/**********************
2207
 *** SAM record I/O ***
2208
 **********************/
2209
2210
// The speed of this code can vary considerably depending on minor code
2211
// changes elsewhere as some of the tight loops are particularly prone to
2212
// speed changes when the instruction blocks are split over a 32-byte
2213
// boundary.  To protect against this, we explicitly specify an alignment
2214
// for this function.  If this is insufficient, we may also wish to
2215
// consider alignment of blocks within this function via
2216
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2217
// However it's not very portable.
2218
// Instead we break into separate functions so we can explicitly specify
2219
// use __attribute__((aligned(32))) instead and force consistent loop
2220
// alignment.
2221
7.38k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2222
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2223
7.38k
    if (*n > INT32_MAX*0.666) {
2224
0
        errno = ENOMEM;
2225
0
        return -1;
2226
0
    }
2227
2228
7.38k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2229
7.38k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2230
0
        hts_log_error("Out of memory");
2231
0
        return -1;
2232
0
    }
2233
2234
7.38k
    (*n)+=*n>>1;
2235
7.38k
    return 0;
2236
7.38k
}
2237
2238
2239
// This ensures that q always ends up at the next comma after
2240
// reading a number even if it's followed by junk.  It
2241
// prevents the possibility of trying to read more than n items.
2242
1.20M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2243
2244
HTS_ALIGN32
2245
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2246
1.95k
                               uint32_t *nalloc, int *overflow) {
2247
109k
    while (*q == ',') {
2248
107k
        if ((*nused)++ >= (*nalloc)) {
2249
1.14k
            if (grow_B_array(b, nalloc, 1) < 0)
2250
0
                return NULL;
2251
1.14k
        }
2252
107k
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2253
107k
        b->l_data++;
2254
107k
    }
2255
1.95k
    return q;
2256
1.95k
}
2257
2258
HTS_ALIGN32
2259
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2260
2.90k
                               uint32_t *nalloc, int *overflow) {
2261
986k
    while (*q == ',') {
2262
983k
        if ((*nused)++ >= (*nalloc)) {
2263
3.97k
            if (grow_B_array(b, nalloc, 1) < 0)
2264
0
                return NULL;
2265
3.97k
        }
2266
983k
        if (q[1] != '-') {
2267
956k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2268
956k
            b->l_data++;
2269
956k
        } else {
2270
27.0k
            *overflow = 1;
2271
27.0k
            q++;
2272
27.0k
            skip_to_comma_(q);
2273
27.0k
        }
2274
983k
    }
2275
2.90k
    return q;
2276
2.90k
}
2277
2278
HTS_ALIGN32
2279
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2280
9.80k
                               uint32_t *nalloc, int *overflow) {
2281
30.2k
    while (*q == ',') {
2282
20.4k
        if ((*nused)++ >= (*nalloc)) {
2283
1.42k
            if (grow_B_array(b, nalloc, 2) < 0)
2284
0
                return NULL;
2285
1.42k
        }
2286
20.4k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2287
20.4k
                  b->data + b->l_data);
2288
20.4k
        b->l_data += 2;
2289
20.4k
    }
2290
9.80k
    return q;
2291
9.80k
}
2292
2293
HTS_ALIGN32
2294
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2295
4.70k
                               uint32_t *nalloc, int *overflow) {
2296
17.6k
    while (*q == ',') {
2297
12.9k
        if ((*nused)++ >= (*nalloc)) {
2298
21
            if (grow_B_array(b, nalloc, 2) < 0)
2299
0
                return NULL;
2300
21
        }
2301
12.9k
        if (q[1] != '-') {
2302
12.7k
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2303
12.7k
                      b->data + b->l_data);
2304
12.7k
            b->l_data += 2;
2305
12.7k
        } else {
2306
192
            *overflow = 1;
2307
192
            q++;
2308
192
            skip_to_comma_(q);
2309
192
        }
2310
12.9k
    }
2311
4.70k
    return q;
2312
4.70k
}
2313
2314
HTS_ALIGN32
2315
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2316
1.21k
                               uint32_t *nalloc, int *overflow) {
2317
544k
    while (*q == ',') {
2318
543k
        if ((*nused)++ >= (*nalloc)) {
2319
417
            if (grow_B_array(b, nalloc, 4) < 0)
2320
0
                return NULL;
2321
417
        }
2322
543k
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2323
543k
                  b->data + b->l_data);
2324
543k
        b->l_data += 4;
2325
543k
    }
2326
1.21k
    return q;
2327
1.21k
}
2328
2329
HTS_ALIGN32
2330
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2331
3.73k
                               uint32_t *nalloc, int *overflow) {
2332
9.42k
    while (*q == ',') {
2333
5.68k
        if ((*nused)++ >= (*nalloc)) {
2334
60
            if (grow_B_array(b, nalloc, 4) < 0)
2335
0
                return NULL;
2336
60
        }
2337
5.68k
        if (q[1] != '-') {
2338
5.58k
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2339
5.58k
                      b->data + b->l_data);
2340
5.58k
            b->l_data += 4;
2341
5.58k
        } else {
2342
105
            *overflow = 1;
2343
105
            q++;
2344
105
            skip_to_comma_(q);
2345
105
        }
2346
5.68k
    }
2347
3.73k
    return q;
2348
3.73k
}
2349
2350
HTS_ALIGN32
2351
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2352
1.89k
                               uint32_t *nalloc, int *overflow) {
2353
9.17k
    while (*q == ',') {
2354
7.27k
        if ((*nused)++ >= (*nalloc)) {
2355
342
            if (grow_B_array(b, nalloc, 4) < 0)
2356
0
                return NULL;
2357
342
        }
2358
7.27k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2359
7.27k
        b->l_data += 4;
2360
7.27k
    }
2361
1.89k
    return q;
2362
1.89k
}
2363
2364
HTS_ALIGN32
2365
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2366
                              char **end, bam1_t *b,
2367
26.5k
                              int *ctr) {
2368
    // Protect against infinite recursion when dealing with invalid input.
2369
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2370
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2371
    //
2372
    // Loop detection is the safest solution incase there are other
2373
    // strange corner cases with malformed inputs.
2374
26.5k
    if (++(*ctr) > 2) {
2375
75
        hts_log_error("Malformed data in B:%c array", type);
2376
75
        return -1;
2377
75
    }
2378
2379
26.5k
    int orig_l = b->l_data;
2380
26.5k
    char *q = in;
2381
26.5k
    int32_t size;
2382
26.5k
    size_t bytes;
2383
26.5k
    int overflow = 0;
2384
2385
26.5k
    size = aux_type2size(type);
2386
26.5k
    if (size <= 0 || size > 4) {
2387
23
        hts_log_error("Unrecognized type B:%c", type);
2388
23
        return -1;
2389
23
    }
2390
2391
    // Ensure space for type + values.
2392
    // The first pass through here we don't know the number of entries and
2393
    // nalloc == 0.  We start with a small working set and then parse the
2394
    // data, growing as needed.
2395
    //
2396
    // If we have a second pass through we do know the number of entries
2397
    // and nalloc is already known.  We have no need to expand the bam data.
2398
26.4k
    if (!nalloc)
2399
22.7k
         nalloc=7;
2400
2401
    // Ensure allocated memory is big enough (for current nalloc estimate)
2402
26.4k
    bytes = (size_t) nalloc * (size_t) size;
2403
26.4k
    if (bytes / size != nalloc
2404
26.4k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2405
0
        hts_log_error("Out of memory");
2406
0
        return -1;
2407
0
    }
2408
2409
26.4k
    uint32_t nused = 0;
2410
2411
26.4k
    b->data[b->l_data++] = 'B';
2412
26.4k
    b->data[b->l_data++] = type;
2413
    // 32-bit B-array length is inserted later once we know it.
2414
26.4k
    int b_len_idx = b->l_data;
2415
26.4k
    b->l_data += sizeof(uint32_t);
2416
2417
26.4k
    if (type == 'c') {
2418
1.95k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2419
0
            return -1;
2420
24.5k
    } else if (type == 'C') {
2421
2.90k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2422
0
            return -1;
2423
21.6k
    } else if (type == 's') {
2424
9.80k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2425
0
            return -1;
2426
11.8k
    } else if (type == 'S') {
2427
4.70k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2428
0
            return -1;
2429
7.12k
    } else if (type == 'i') {
2430
1.21k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2431
0
            return -1;
2432
5.90k
    } else if (type == 'I') {
2433
3.73k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2434
0
            return -1;
2435
3.73k
    } else if (type == 'f') {
2436
1.89k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2437
0
            return -1;
2438
1.89k
    }
2439
26.4k
    if (*q != '\t' && *q != '\0') {
2440
        // Unknown B array type or junk in the numbers
2441
152
        hts_log_error("Malformed B:%c", type);
2442
152
        return -1;
2443
152
    }
2444
26.3k
    i32_to_le(nused, b->data + b_len_idx);
2445
2446
26.3k
    if (!overflow) {
2447
22.1k
        *end = q;
2448
22.1k
        return 0;
2449
22.1k
    } else {
2450
4.21k
        int64_t max = 0, min = 0, val;
2451
        // Given type was incorrect.  Try to rescue the situation.
2452
4.21k
        char *r = q;
2453
4.21k
        q = in;
2454
4.21k
        overflow = 0;
2455
4.21k
        b->l_data = orig_l;
2456
        // Find out what range of values is present
2457
1.02M
        while (q < r) {
2458
1.02M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2459
1.02M
            if (max < val) max = val;
2460
1.02M
            if (min > val) min = val;
2461
1.02M
            skip_to_comma_(q);
2462
1.02M
        }
2463
        // Retry with appropriate type
2464
4.21k
        if (!overflow) {
2465
4.10k
            if (min < 0) {
2466
2.20k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2467
156
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2468
2.04k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2469
792
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2470
1.25k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2471
1.05k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2472
1.05k
                }
2473
2.20k
            } else {
2474
1.90k
                if (max < UINT8_MAX) {
2475
258
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2476
1.64k
                } else if (max <= UINT16_MAX) {
2477
524
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2478
1.12k
                } else if (max <= UINT32_MAX) {
2479
1.01k
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2480
1.01k
                }
2481
1.90k
            }
2482
4.10k
        }
2483
        // If here then at least one of the values is too big to store
2484
408
        hts_log_error("Numeric value in B array out of allowed range");
2485
408
        return -1;
2486
4.21k
    }
2487
26.3k
#undef skip_to_comma_
2488
26.3k
}
2489
2490
HTS_ALIGN32
2491
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2492
22.7k
{
2493
22.7k
    int ctr = 0;
2494
22.7k
    uint32_t nalloc = 0;
2495
22.7k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2496
22.7k
}
2497
2498
8.06k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2499
8.06k
    if (*v >= '1' && *v <= '9') {
2500
253
        return hts_str2uint(v, rv, 16, overflow);
2501
253
    }
2502
7.80k
    else if (*v == '0') {
2503
        // handle single-digit "0" directly; otherwise it's hex or octal
2504
832
        if (v[1] == '\t') { *rv = v+1; return 0; }
2505
74
        else {
2506
74
            unsigned long val = strtoul(v, rv, 0);
2507
74
            if (val > 65535) { *overflow = 1; return 65535; }
2508
15
            return val;
2509
74
        }
2510
832
    }
2511
6.97k
    else {
2512
        // TODO implement symbolic flag letters
2513
6.97k
        *rv = v;
2514
6.97k
        return 0;
2515
6.97k
    }
2516
8.06k
}
2517
2518
// Parse tag line and append to bam object b.
2519
// Shared by both SAM and FASTQ parsers.
2520
//
2521
// The difference between the two is how lenient we are to recognising
2522
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2523
// non-SAM looking strings.
2524
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2525
7.04k
                            khash_t(tag) *tag_whitelist) {
2526
7.04k
    int overflow = 0;
2527
7.04k
    int checkpoint;
2528
7.04k
    char logbuf[40];
2529
7.04k
    char *q = start, *p = end;
2530
2531
7.04k
#define _parse_err(cond, ...)                   \
2532
3.75M
    do {                                        \
2533
7.58M
        if (cond) {                             \
2534
239
            if (lenient) {                      \
2535
0
                while (q < p && !isspace_c(*q))   \
2536
0
                    q++;                        \
2537
0
                while (q < p && isspace_c(*q))    \
2538
0
                    q++;                        \
2539
0
                b->l_data = checkpoint;         \
2540
0
                goto loop;                      \
2541
239
            } else {                            \
2542
239
                hts_log_error(__VA_ARGS__);     \
2543
239
                goto err_ret;                   \
2544
239
            }                                   \
2545
239
        }                                       \
2546
3.75M
    } while (0)
2547
2548
3.72M
    while (q < p) loop: {
2549
3.72M
        char type;
2550
3.72M
        checkpoint = b->l_data;
2551
3.72M
        if (p - q < 5) {
2552
36
            if (lenient) {
2553
0
                break;
2554
36
            } else {
2555
36
                hts_log_error("Incomplete aux field");
2556
36
                goto err_ret;
2557
36
            }
2558
36
        }
2559
1.86M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2560
2561
1.86M
        if (lenient && (q[2] | q[4]) != ':') {
2562
0
            while (q < p && !isspace_c(*q))
2563
0
                q++;
2564
0
            while (q < p && isspace_c(*q))
2565
0
                q++;
2566
0
            continue;
2567
0
        }
2568
2569
1.86M
        if (tag_whitelist) {
2570
0
            int tt = q[0]*256 + q[1];
2571
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2572
0
                while (q < p && *q != '\t')
2573
0
                    q++;
2574
0
                continue;
2575
0
            }
2576
0
        }
2577
2578
        // Copy over id
2579
1.86M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2580
1.86M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2581
1.86M
        q += 3; type = *q++; ++q; // q points to value
2582
1.86M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2583
1.85M
            _parse_err(*q <= '\t', "incomplete aux field");
2584
2585
        // Ensure enough space for a double + type allocated.
2586
1.86M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2587
2588
1.86M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2589
784k
            b->data[b->l_data++] = 'A';
2590
784k
            b->data[b->l_data++] = *q++;
2591
1.07M
        } else if (type == 'i' || type == 'I') {
2592
991k
            if (*q == '-') {
2593
829k
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2594
829k
                if (x >= INT8_MIN) {
2595
393k
                    b->data[b->l_data++] = 'c';
2596
393k
                    b->data[b->l_data++] = x;
2597
436k
                } else if (x >= INT16_MIN) {
2598
139k
                    b->data[b->l_data++] = 's';
2599
139k
                    i16_to_le(x, b->data + b->l_data);
2600
139k
                    b->l_data += 2;
2601
297k
                } else {
2602
297k
                    b->data[b->l_data++] = 'i';
2603
297k
                    i32_to_le(x, b->data + b->l_data);
2604
297k
                    b->l_data += 4;
2605
297k
                }
2606
829k
            } else {
2607
162k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2608
162k
                if (x <= UINT8_MAX) {
2609
86.3k
                    b->data[b->l_data++] = 'C';
2610
86.3k
                    b->data[b->l_data++] = x;
2611
86.3k
                } else if (x <= UINT16_MAX) {
2612
74.4k
                    b->data[b->l_data++] = 'S';
2613
74.4k
                    u16_to_le(x, b->data + b->l_data);
2614
74.4k
                    b->l_data += 2;
2615
74.4k
                } else {
2616
1.42k
                    b->data[b->l_data++] = 'I';
2617
1.42k
                    u32_to_le(x, b->data + b->l_data);
2618
1.42k
                    b->l_data += 4;
2619
1.42k
                }
2620
162k
            }
2621
991k
        } else if (type == 'f') {
2622
12.6k
            b->data[b->l_data++] = 'f';
2623
12.6k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2624
12.6k
            b->l_data += sizeof(float);
2625
72.9k
        } else if (type == 'd') {
2626
40.9k
            b->data[b->l_data++] = 'd';
2627
40.9k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2628
40.9k
            b->l_data += sizeof(double);
2629
40.9k
        } else if (type == 'Z' || type == 'H') {
2630
9.20k
            char *end = strchr(q, '\t');
2631
9.20k
            if (!end) end = q + strlen(q);
2632
9.20k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2633
9.20k
                       "hex field does not have an even number of digits");
2634
9.19k
            b->data[b->l_data++] = type;
2635
9.19k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2636
9.19k
            memcpy(b->data + b->l_data, q, end - q);
2637
9.19k
            b->l_data += end - q;
2638
9.19k
            b->data[b->l_data++] = '\0';
2639
9.19k
            q = end;
2640
22.8k
        } else if (type == 'B') {
2641
22.7k
            type = *q++; // q points to the first ',' following the typing byte
2642
22.7k
            _parse_err(*q && *q != ',' && *q != '\t',
2643
22.7k
                       "B aux field type not followed by ','");
2644
2645
22.7k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2646
658
                goto err_ret;
2647
22.7k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2648
2649
11.9M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2650
1.86M
        q++;
2651
1.86M
    }
2652
2653
6.18k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2654
6.11k
#undef _parse_err
2655
2656
6.11k
    return 0;
2657
2658
933
err_ret:
2659
933
    return -2;
2660
6.18k
}
2661
2662
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2663
8.20k
{
2664
35.7k
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2665
2666
8.20k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2667
2668
// Macro that operates on 64-bits at a time.
2669
8.20k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2670
8.20k
    do {                                                        \
2671
6.38k
        uint64_u *from8 = (uint64_u *)(from);                   \
2672
6.38k
        uint64_u *to8 = (uint64_u *)(to);                       \
2673
6.38k
        uint64_t uflow = 0;                                     \
2674
6.38k
        size_t l8 = (l)>>3, i;                                  \
2675
6.67k
        for (i = 0; i < l8; i++) {                              \
2676
285
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2677
285
            uflow |= to8[i];                                    \
2678
285
        }                                                       \
2679
6.87k
        for (i<<=3; i < (l); ++i) {                             \
2680
484
            to[i] = from[i] - (n);                              \
2681
484
            uflow |= to[i];                                     \
2682
484
        }                                                       \
2683
6.38k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2684
6.38k
    } while (0)
2685
2686
#else
2687
2688
// Basic version which operates a byte at a time
2689
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2690
        uint8_t uflow = 0;                                   \
2691
        for (i = 0; i < (l); ++i) {                          \
2692
            (to)[i] = (from)[i] - (n);                       \
2693
            uflow |= (uint8_t) (to)[i];                      \
2694
        }                                                    \
2695
        failed = (uflow & 0x80) > 0;                         \
2696
    } while (0)
2697
2698
#endif
2699
2700
14.1k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2701
90.7k
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2702
23.6k
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2703
2704
8.20k
    uint8_t *t;
2705
2706
8.20k
    char *p = s->s, *q;
2707
8.20k
    int i, overflow = 0;
2708
8.20k
    char logbuf[40];
2709
8.20k
    hts_pos_t cigreflen;
2710
8.20k
    bam1_core_t *c = &b->core;
2711
2712
8.20k
    b->l_data = 0;
2713
8.20k
    memset(c, 0, 32);
2714
2715
    // qname
2716
8.20k
    q = _read_token(p);
2717
2718
8.06k
    _parse_warn(p - q <= 1, "empty query name");
2719
8.06k
    _parse_err(p - q > 255, "query name too long");
2720
    // resize large enough for name + extranul
2721
8.06k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2722
8.06k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2723
2724
8.06k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2725
8.06k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2726
8.06k
    b->l_data += c->l_extranul;
2727
2728
8.06k
    c->l_qname = p - q + c->l_extranul;
2729
2730
    // flag
2731
8.06k
    c->flag = parse_sam_flag(p, &p, &overflow);
2732
8.06k
    if (*p++ != '\t') goto err_ret; // malformated flag
2733
2734
    // chr
2735
7.82k
    q = _read_token(p);
2736
7.80k
    if (strcmp(q, "*")) {
2737
2.40k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2738
2.36k
        c->tid = bam_name2id(h, q);
2739
2.36k
        _parse_err(c->tid < -1, "failed to parse header");
2740
2.36k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2741
5.39k
    } else c->tid = -1;
2742
2743
    // pos
2744
7.75k
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
2745
7.75k
    if (*p++ != '\t') goto err_ret;
2746
7.65k
    if (c->pos < 0 && c->tid >= 0) {
2747
36
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2748
36
        c->tid = -1;
2749
36
    }
2750
7.65k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2751
2752
    // mapq
2753
7.65k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2754
7.65k
    if (*p++ != '\t') goto err_ret;
2755
    // cigar
2756
7.59k
    if (*p != '*') {
2757
2.42k
        uint32_t *cigar = NULL;
2758
2.42k
        int old_l_data = b->l_data;
2759
2.42k
        int n_cigar = bam_parse_cigar(p, &p, b);
2760
2.42k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2761
2.29k
        cigar = (uint32_t *)(b->data + old_l_data);
2762
2763
        // can't use bam_endpos() directly as some fields not yet set up
2764
2.29k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2765
2.29k
        if (cigreflen == 0) cigreflen = 1;
2766
5.16k
    } else {
2767
5.16k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2768
5.16k
        c->flag |= BAM_FUNMAP;
2769
5.16k
        q = _read_token(p);
2770
5.14k
        cigreflen = 1;
2771
5.14k
    }
2772
7.44k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2773
7.44k
               "read ends beyond highest supported position");
2774
7.44k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2775
    // mate chr
2776
7.44k
    q = _read_token(p);
2777
7.40k
    if (strcmp(q, "=") == 0) {
2778
129
        c->mtid = c->tid;
2779
7.27k
    } else if (strcmp(q, "*") == 0) {
2780
21
        c->mtid = -1;
2781
7.25k
    } else {
2782
7.25k
        c->mtid = bam_name2id(h, q);
2783
7.25k
        _parse_err(c->mtid < -1, "failed to parse header");
2784
7.25k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2785
7.25k
    }
2786
    // mpos
2787
7.40k
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
2788
7.40k
    if (*p++ != '\t') goto err_ret;
2789
7.32k
    if (c->mpos < 0 && c->mtid >= 0) {
2790
758
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
2791
758
        c->mtid = -1;
2792
758
    }
2793
    // tlen
2794
7.32k
    c->isize = hts_str2int(p, &p, 63, &overflow);
2795
7.32k
    if (*p++ != '\t') goto err_ret;
2796
7.17k
    _parse_err(overflow, "number outside allowed range");
2797
    // seq
2798
7.14k
    q = _read_token(p);
2799
7.12k
    if (strcmp(q, "*")) {
2800
7.01k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
2801
7.01k
        c->l_qseq = p - q - 1;
2802
7.01k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
2803
7.01k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
2804
7.00k
        i = (c->l_qseq + 1) >> 1;
2805
7.00k
        _get_mem(uint8_t, &t, b, i);
2806
2807
7.00k
        unsigned int lqs2 = c->l_qseq&~1, i;
2808
8.57k
        for (i = 0; i < lqs2; i+=2)
2809
1.56k
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
2810
7.78k
        for (; i < c->l_qseq; ++i)
2811
778
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
2812
7.00k
    } else c->l_qseq = 0;
2813
    // qual
2814
14.2k
    _get_mem(uint8_t, &t, b, c->l_qseq);
2815
14.2k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
2816
692
        memset(t, 0xff, c->l_qseq);
2817
692
        p += 2;
2818
6.42k
    } else {
2819
6.42k
        int failed = 0;
2820
6.42k
        _parse_err(s->l - (p - s->s) < c->l_qseq
2821
6.42k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
2822
6.42k
                   "SEQ and QUAL are of different length");
2823
6.38k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
2824
6.38k
        _parse_err(failed, "invalid QUAL character");
2825
6.35k
        p += c->l_qseq + 1;
2826
6.35k
    }
2827
2828
    // aux
2829
7.04k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
2830
933
        goto err_ret;
2831
2832
6.11k
    if (bam_tag2cigar(b, 1, 1) < 0)
2833
0
        return -2;
2834
6.11k
    return 0;
2835
2836
0
#undef _parse_warn
2837
0
#undef _parse_err
2838
0
#undef _get_mem
2839
0
#undef _read_token
2840
2.09k
err_ret:
2841
2.09k
    return -2;
2842
6.11k
}
2843
2844
2.42k
static uint32_t read_ncigar(const char *q) {
2845
2.42k
    uint32_t n_cigar = 0;
2846
37.9k
    for (; *q && *q != '\t'; ++q)
2847
35.4k
        if (!isdigit_c(*q)) ++n_cigar;
2848
2.42k
    if (!n_cigar) {
2849
18
        hts_log_error("No CIGAR operations");
2850
18
        return 0;
2851
18
    }
2852
2.40k
    if (n_cigar >= 2147483647) {
2853
0
        hts_log_error("Too many CIGAR operations");
2854
0
        return 0;
2855
0
    }
2856
2857
2.40k
    return n_cigar;
2858
2.40k
}
2859
2860
/*! @function
2861
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2862
 @param  in      [in]  pointer to the source string
2863
 @param  a_cigar [out]  address of the destination uint32_t buffer
2864
 @return         number of processed input characters; 0 on error
2865
 */
2866
2.40k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2867
2.40k
    int i, overflow = 0;
2868
2.40k
    const char *p = in;
2869
8.57k
    for (i = 0; i < n_cigar; i++) {
2870
6.26k
        uint32_t len;
2871
6.26k
        int op;
2872
6.26k
        char *q;
2873
6.26k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2874
6.26k
        if (q == p) {
2875
31
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2876
31
            return 0;
2877
31
        }
2878
6.23k
        if (overflow) {
2879
32
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2880
32
            return 0;
2881
32
        }
2882
6.20k
        p = q;
2883
6.20k
        op = bam_cigar_table[(unsigned char)*p++];
2884
6.20k
        if (op < 0) {
2885
32
            hts_log_error("Unrecognized CIGAR operator");
2886
32
            return 0;
2887
32
        }
2888
6.17k
        a_cigar[i] = len;
2889
6.17k
        a_cigar[i] |= op;
2890
6.17k
    }
2891
2892
2.30k
    return p-in;
2893
2.40k
}
2894
2895
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
2896
0
    size_t n_cigar = 0;
2897
0
    int diff;
2898
2899
0
    if (!in || !a_cigar || !a_mem) {
2900
0
        hts_log_error("NULL pointer arguments");
2901
0
        return -1;
2902
0
    }
2903
0
    if (end) *end = (char *)in;
2904
2905
0
    if (*in == '*') {
2906
0
        if (end) (*end)++;
2907
0
        return 0;
2908
0
    }
2909
0
    n_cigar = read_ncigar(in);
2910
0
    if (!n_cigar) return 0;
2911
0
    if (n_cigar > *a_mem) {
2912
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
2913
0
        if (a_tmp) {
2914
0
            *a_cigar = a_tmp;
2915
0
            *a_mem = n_cigar;
2916
0
        } else {
2917
0
            hts_log_error("Memory allocation error");
2918
0
            return -1;
2919
0
        }
2920
0
    }
2921
2922
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
2923
0
    if (end) *end = (char *)in+diff;
2924
2925
0
    return n_cigar;
2926
0
}
2927
2928
2.42k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
2929
2.42k
    size_t n_cigar = 0;
2930
2.42k
    int diff;
2931
2932
2.42k
    if (!in || !b) {
2933
0
        hts_log_error("NULL pointer arguments");
2934
0
        return -1;
2935
0
    }
2936
2.42k
    if (end) *end = (char *)in;
2937
2938
2.42k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
2939
2.42k
    if (!n_cigar && b->core.n_cigar == 0) {
2940
18
        if (end) *end = (char *)in+1;
2941
18
        return 0;
2942
18
    }
2943
2944
2.40k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
2945
2.40k
    if (cig_diff > 0 &&
2946
2.40k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
2947
0
        hts_log_error("Memory allocation error");
2948
0
        return -1;
2949
0
    }
2950
2951
2.40k
    uint32_t *cig = bam_get_cigar(b);
2952
2.40k
    if ((uint8_t *)cig != b->data + b->l_data) {
2953
        // Modifying an BAM existing BAM record
2954
0
        uint8_t  *seq = bam_get_seq(b);
2955
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
2956
0
    }
2957
2958
2.40k
    if (n_cigar) {
2959
2.40k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
2960
95
            return -1;
2961
2.40k
    } else {
2962
0
        diff = 1; // handle "*"
2963
0
    }
2964
2965
2.30k
    b->l_data += cig_diff * sizeof(uint32_t);
2966
2.30k
    b->core.n_cigar = n_cigar;
2967
2.30k
    if (end) *end = (char *)in + diff;
2968
2969
2.30k
    return n_cigar;
2970
2.40k
}
2971
2972
/*
2973
 * -----------------------------------------------------------------------------
2974
 * SAM threading
2975
 */
2976
// Size of SAM text block (reading)
2977
0
#define SAM_NBYTES 240000
2978
2979
// Number of BAM records (writing, up to NB_mem in size)
2980
0
#define SAM_NBAM 1000
2981
2982
struct SAM_state;
2983
2984
// Output job - a block of BAM records
2985
typedef struct sp_bams {
2986
    struct sp_bams *next;
2987
    int serial;
2988
2989
    bam1_t *bams;
2990
    int nbams, abams; // used and alloc for bams[] array
2991
    size_t bam_mem;   // very approximate total size
2992
2993
    struct SAM_state *fd;
2994
} sp_bams;
2995
2996
// Input job - a block of SAM text
2997
typedef struct sp_lines {
2998
    struct sp_lines *next;
2999
    int serial;
3000
3001
    char *data;
3002
    int data_size;
3003
    int alloc;
3004
3005
    struct SAM_state *fd;
3006
    sp_bams *bams;
3007
} sp_lines;
3008
3009
enum sam_cmd {
3010
    SAM_NONE = 0,
3011
    SAM_CLOSE,
3012
    SAM_CLOSE_DONE,
3013
    SAM_AT_EOF,
3014
};
3015
3016
typedef struct SAM_state {
3017
    sam_hdr_t *h;
3018
3019
    hts_tpool *p;
3020
    int own_pool;
3021
    pthread_mutex_t lines_m;
3022
    hts_tpool_process *q;
3023
    pthread_t dispatcher;
3024
    int dispatcher_set;
3025
3026
    sp_lines *lines;
3027
    sp_bams *bams;
3028
3029
    sp_bams *curr_bam;
3030
    int curr_idx;
3031
    int serial;
3032
3033
    // Be warned: moving these mutexes around in this struct can reduce
3034
    // threading performance by up to 70%!
3035
    pthread_mutex_t command_m;
3036
    pthread_cond_t command_c;
3037
    enum sam_cmd command;
3038
3039
    // One of the E* errno codes
3040
    int errcode;
3041
3042
    htsFile *fp;
3043
} SAM_state;
3044
3045
// Returns a SAM_state struct from a generic hFILE.
3046
//
3047
// Returns NULL on failure.
3048
0
static SAM_state *sam_state_create(htsFile *fp) {
3049
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3050
    // be a redirect call with an additional 'S' mode.  This in turn would
3051
    // correctly set the designed format to sam instead of a generic
3052
    // text_format.
3053
0
    if (fp->format.format != sam && fp->format.format != text_format)
3054
0
        return NULL;
3055
3056
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3057
0
    if (!fd)
3058
0
        return NULL;
3059
3060
0
    fp->state = fd;
3061
0
    fd->fp = fp;
3062
3063
0
    return fd;
3064
0
}
3065
3066
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3067
static void *sam_format_worker(void *arg);
3068
3069
0
static void sam_state_err(SAM_state *fd, int errcode) {
3070
0
    pthread_mutex_lock(&fd->command_m);
3071
0
    if (!fd->errcode)
3072
0
        fd->errcode = errcode;
3073
0
    pthread_mutex_unlock(&fd->command_m);
3074
0
}
3075
3076
0
static void sam_free_sp_bams(sp_bams *b) {
3077
0
    if (!b)
3078
0
        return;
3079
3080
0
    if (b->bams) {
3081
0
        int i;
3082
0
        for (i = 0; i < b->abams; i++) {
3083
0
            if (b->bams[i].data)
3084
0
                free(b->bams[i].data);
3085
0
        }
3086
0
        free(b->bams);
3087
0
    }
3088
0
    free(b);
3089
0
}
3090
3091
// Destroys the state produce by sam_state_create.
3092
24.1k
int sam_state_destroy(htsFile *fp) {
3093
24.1k
    int ret = 0;
3094
3095
24.1k
    if (!fp->state)
3096
24.1k
        return 0;
3097
3098
0
    SAM_state *fd = fp->state;
3099
0
    if (fd->p) {
3100
0
        if (fd->h) {
3101
            // Notify sam_dispatcher we're closing
3102
0
            pthread_mutex_lock(&fd->command_m);
3103
0
            if (fd->command != SAM_CLOSE_DONE)
3104
0
                fd->command = SAM_CLOSE;
3105
0
            pthread_cond_signal(&fd->command_c);
3106
0
            ret = -fd->errcode;
3107
0
            if (fd->q)
3108
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3109
3110
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3111
0
                for (;;) {
3112
                    // Avoid deadlocks with dispatcher
3113
0
                    if (fd->command == SAM_CLOSE_DONE)
3114
0
                        break;
3115
0
                    hts_tpool_wake_dispatch(fd->q);
3116
0
                    pthread_mutex_unlock(&fd->command_m);
3117
0
                    hts_usleep(10000);
3118
0
                    pthread_mutex_lock(&fd->command_m);
3119
0
                }
3120
0
            }
3121
0
            pthread_mutex_unlock(&fd->command_m);
3122
3123
0
            if (fp->is_write) {
3124
                // Dispatch the last partial block.
3125
0
                sp_bams *gb = fd->curr_bam;
3126
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3127
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3128
3129
                // Flush and drain output
3130
0
                if (fd->q)
3131
0
                    hts_tpool_process_flush(fd->q);
3132
0
                pthread_mutex_lock(&fd->command_m);
3133
0
                if (!ret) ret = -fd->errcode;
3134
0
                pthread_mutex_unlock(&fd->command_m);
3135
3136
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3137
0
                    hts_usleep(10000);
3138
0
                    pthread_mutex_lock(&fd->command_m);
3139
0
                    ret = -fd->errcode;
3140
                    // not empty but shutdown implies error
3141
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3142
0
                        ret = EIO;
3143
0
                    pthread_mutex_unlock(&fd->command_m);
3144
0
                }
3145
0
                if (fd->q)
3146
0
                    hts_tpool_process_shutdown(fd->q);
3147
0
            }
3148
3149
            // Wait for it to acknowledge
3150
0
            if (fd->dispatcher_set)
3151
0
                pthread_join(fd->dispatcher, NULL);
3152
0
            if (!ret) ret = -fd->errcode;
3153
0
        }
3154
3155
        // Tidy up memory
3156
0
        if (fd->q)
3157
0
            hts_tpool_process_destroy(fd->q);
3158
3159
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3160
0
            hts_tpool_destroy(fd->p);
3161
0
            fd->p = NULL;
3162
0
        }
3163
0
        pthread_mutex_destroy(&fd->lines_m);
3164
0
        pthread_mutex_destroy(&fd->command_m);
3165
0
        pthread_cond_destroy(&fd->command_c);
3166
3167
0
        sp_lines *l = fd->lines;
3168
0
        while (l) {
3169
0
            sp_lines *n = l->next;
3170
0
            free(l->data);
3171
0
            free(l);
3172
0
            l = n;
3173
0
        }
3174
3175
0
        sp_bams *b = fd->bams;
3176
0
        while (b) {
3177
0
            if (fd->curr_bam == b)
3178
0
                fd->curr_bam = NULL;
3179
0
            sp_bams *n = b->next;
3180
0
            sam_free_sp_bams(b);
3181
0
            b = n;
3182
0
        }
3183
3184
0
        if (fd->curr_bam)
3185
0
            sam_free_sp_bams(fd->curr_bam);
3186
3187
        // Decrement counter by one, maybe destroying too.
3188
        // This is to permit the caller using bam_hdr_destroy
3189
        // before sam_close without triggering decode errors
3190
        // in the background threads.
3191
0
        bam_hdr_destroy(fd->h);
3192
0
    }
3193
3194
0
    free(fp->state);
3195
0
    fp->state = NULL;
3196
0
    return ret;
3197
24.1k
}
3198
3199
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3200
0
static void cleanup_sp_lines(void *arg) {
3201
0
    sp_lines *gl = (sp_lines *)arg;
3202
0
    if (!gl) return;
3203
3204
    // Should always be true for lines passed to / from thread workers.
3205
0
    assert(gl->next == NULL);
3206
3207
0
    free(gl->data);
3208
0
    sam_free_sp_bams(gl->bams);
3209
0
    free(gl);
3210
0
}
3211
3212
// Run from one of the worker threads.
3213
// Convert a passed in array of lines to array of BAMs, returning
3214
// the result back to the thread queue.
3215
0
static void *sam_parse_worker(void *arg) {
3216
0
    sp_lines *gl = (sp_lines *)arg;
3217
0
    sp_bams *gb = NULL;
3218
0
    char *lines = gl->data;
3219
0
    int i;
3220
0
    bam1_t *b;
3221
0
    SAM_state *fd = gl->fd;
3222
3223
    // Use a block of BAM structs we had earlier if available.
3224
0
    pthread_mutex_lock(&fd->lines_m);
3225
0
    if (fd->bams) {
3226
0
        gb = fd->bams;
3227
0
        fd->bams = gb->next;
3228
0
    }
3229
0
    pthread_mutex_unlock(&fd->lines_m);
3230
3231
0
    if (gb == NULL) {
3232
0
        gb = calloc(1, sizeof(*gb));
3233
0
        if (!gb) {
3234
0
            return NULL;
3235
0
        }
3236
0
        gb->abams = 100;
3237
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3238
0
        if (!gb->bams) {
3239
0
            sam_state_err(fd, ENOMEM);
3240
0
            goto err;
3241
0
        }
3242
0
        gb->nbams = 0;
3243
0
        gb->bam_mem = 0;
3244
0
    }
3245
0
    gb->serial = gl->serial;
3246
0
    gb->next = NULL;
3247
3248
0
    b = (bam1_t *)gb->bams;
3249
0
    if (!b) {
3250
0
        sam_state_err(fd, ENOMEM);
3251
0
        goto err;
3252
0
    }
3253
3254
0
    i = 0;
3255
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3256
0
    while (cp < cp_end) {
3257
0
        if (i >= gb->abams) {
3258
0
            int old_abams = gb->abams;
3259
0
            gb->abams *= 2;
3260
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3261
0
            if (!b) {
3262
0
                gb->abams /= 2;
3263
0
                sam_state_err(fd, ENOMEM);
3264
0
                goto err;
3265
0
            }
3266
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3267
0
            gb->bams = b;
3268
0
        }
3269
3270
        // Ideally we'd get sam_parse1 to return the number of
3271
        // bytes decoded and to be able to stop on newline as
3272
        // well as \0.
3273
        //
3274
        // We can then avoid the additional strchr loop.
3275
        // It's around 6% of our CPU cost, albeit threadable.
3276
        //
3277
        // However this is an API change so for now we copy.
3278
3279
0
        char *nl = strchr(cp, '\n');
3280
0
        char *line_end;
3281
0
        if (nl) {
3282
0
            line_end = nl;
3283
0
            if (line_end > cp && *(line_end - 1) == '\r')
3284
0
                line_end--;
3285
0
            nl++;
3286
0
        } else {
3287
0
            nl = line_end = cp_end;
3288
0
        }
3289
0
        *line_end = '\0';
3290
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3291
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3292
0
            sam_state_err(fd, errno ? errno : EIO);
3293
0
            cleanup_sp_lines(gl);
3294
0
            goto err;
3295
0
        }
3296
3297
0
        cp = nl;
3298
0
        i++;
3299
0
    }
3300
0
    gb->nbams = i;
3301
3302
0
    pthread_mutex_lock(&fd->lines_m);
3303
0
    gl->next = fd->lines;
3304
0
    fd->lines = gl;
3305
0
    pthread_mutex_unlock(&fd->lines_m);
3306
0
    return gb;
3307
3308
0
 err:
3309
0
    sam_free_sp_bams(gb);
3310
0
    return NULL;
3311
0
}
3312
3313
0
static void *sam_parse_eof(void *arg) {
3314
0
    return NULL;
3315
0
}
3316
3317
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3318
0
static void cleanup_sp_bams(void *arg) {
3319
0
    sam_free_sp_bams((sp_bams *) arg);
3320
0
}
3321
3322
// Runs in its own thread.
3323
// Reads a block of text (SAM) and sends a new job to the thread queue to
3324
// translate this to BAM.
3325
0
static void *sam_dispatcher_read(void *vp) {
3326
0
    htsFile *fp = vp;
3327
0
    kstring_t line = {0};
3328
0
    int line_frag = 0;
3329
0
    SAM_state *fd = fp->state;
3330
0
    sp_lines *l = NULL;
3331
3332
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3333
    // matter as it will grow if necessary).
3334
0
    if (ks_resize(&line, 1000) < 0)
3335
0
        goto err;
3336
3337
0
    for (;;) {
3338
        // Check for command
3339
0
        pthread_mutex_lock(&fd->command_m);
3340
0
        switch (fd->command) {
3341
3342
0
        case SAM_CLOSE:
3343
0
            pthread_cond_signal(&fd->command_c);
3344
0
            pthread_mutex_unlock(&fd->command_m);
3345
0
            hts_tpool_process_shutdown(fd->q);
3346
0
            goto tidyup;
3347
3348
0
        default:
3349
0
            break;
3350
0
        }
3351
0
        pthread_mutex_unlock(&fd->command_m);
3352
3353
0
        pthread_mutex_lock(&fd->lines_m);
3354
0
        if (fd->lines) {
3355
            // reuse existing line buffer
3356
0
            l = fd->lines;
3357
0
            fd->lines = l->next;
3358
0
        }
3359
0
        pthread_mutex_unlock(&fd->lines_m);
3360
3361
0
        if (l == NULL) {
3362
            // none to reuse, to create a new one
3363
0
            l = calloc(1, sizeof(*l));
3364
0
            if (!l)
3365
0
                goto err;
3366
0
            l->alloc = SAM_NBYTES;
3367
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3368
0
            if (!l->data) {
3369
0
                free(l);
3370
0
                l = NULL;
3371
0
                goto err;
3372
0
            }
3373
0
            l->fd = fd;
3374
0
        }
3375
0
        l->next = NULL;
3376
3377
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3378
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3379
0
            if (!rp)
3380
0
                goto err;
3381
0
            l->alloc = line_frag+SAM_NBYTES/2;
3382
0
            l->data = rp;
3383
0
        }
3384
0
        memcpy(l->data, line.s, line_frag);
3385
3386
0
        l->data_size = line_frag;
3387
0
        ssize_t nbytes;
3388
0
    longer_line:
3389
0
        if (fp->is_bgzf)
3390
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3391
0
        else
3392
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3393
0
        if (nbytes < 0) {
3394
0
            sam_state_err(fd, errno ? errno : EIO);
3395
0
            goto err;
3396
0
        } else if (nbytes == 0)
3397
0
            break; // EOF
3398
0
        l->data_size += nbytes;
3399
3400
        // trim to last \n. Maybe \r\n, but that's still fine
3401
0
        if (nbytes == l->alloc - line_frag) {
3402
0
            char *cp_end = l->data + l->data_size;
3403
0
            char *cp = cp_end-1;
3404
3405
0
            while (cp > (char *)l->data && *cp != '\n')
3406
0
                cp--;
3407
3408
            // entire buffer is part of a single line
3409
0
            if (cp == l->data) {
3410
0
                line_frag = l->data_size;
3411
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3412
0
                if (!rp)
3413
0
                    goto err;
3414
0
                l->alloc *= 2;
3415
0
                l->data = rp;
3416
0
                assert(l->alloc >= l->data_size);
3417
0
                assert(l->alloc >= line_frag);
3418
0
                assert(l->alloc >= l->alloc - line_frag);
3419
0
                goto longer_line;
3420
0
            }
3421
0
            cp++;
3422
3423
            // line holds the remainder of our line.
3424
0
            if (ks_resize(&line, cp_end - cp) < 0)
3425
0
                goto err;
3426
0
            memcpy(line.s, cp, cp_end - cp);
3427
0
            line_frag = cp_end - cp;
3428
0
            l->data_size = l->alloc - line_frag;
3429
0
        } else {
3430
            // out of buffer
3431
0
            line_frag = 0;
3432
0
        }
3433
3434
0
        l->serial = fd->serial++;
3435
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3436
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3437
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3438
0
            goto err;
3439
0
        pthread_mutex_lock(&fd->command_m);
3440
0
        if (fd->command == SAM_CLOSE) {
3441
0
            pthread_mutex_unlock(&fd->command_m);
3442
0
            l = NULL;
3443
0
            goto tidyup;
3444
0
        }
3445
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3446
0
        pthread_mutex_unlock(&fd->command_m);
3447
0
    }
3448
3449
    // Submit a NULL sp_bams entry to act as an EOF marker
3450
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3451
0
        goto err;
3452
3453
    // At EOF, wait for close request.
3454
    // (In future if we add support for seek, this is where we need to catch it.)
3455
0
    for (;;) {
3456
0
        pthread_mutex_lock(&fd->command_m);
3457
0
        if (fd->command == SAM_NONE)
3458
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3459
0
        switch (fd->command) {
3460
0
        case SAM_CLOSE:
3461
0
            pthread_cond_signal(&fd->command_c);
3462
0
            pthread_mutex_unlock(&fd->command_m);
3463
0
            hts_tpool_process_shutdown(fd->q);
3464
0
            goto tidyup;
3465
3466
0
        default:
3467
0
            pthread_mutex_unlock(&fd->command_m);
3468
0
            break;
3469
0
        }
3470
0
    }
3471
3472
0
 tidyup:
3473
0
    pthread_mutex_lock(&fd->command_m);
3474
0
    fd->command = SAM_CLOSE_DONE;
3475
0
    pthread_cond_signal(&fd->command_c);
3476
0
    pthread_mutex_unlock(&fd->command_m);
3477
3478
0
    if (l) {
3479
0
        pthread_mutex_lock(&fd->lines_m);
3480
0
        l->next = fd->lines;
3481
0
        fd->lines = l;
3482
0
        pthread_mutex_unlock(&fd->lines_m);
3483
0
    }
3484
0
    free(line.s);
3485
3486
0
    return NULL;
3487
3488
0
 err:
3489
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3490
0
    hts_tpool_process_shutdown(fd->q);
3491
0
    goto tidyup;
3492
0
}
3493
3494
// Runs in its own thread.
3495
// Takes encoded blocks of SAM off the thread results queue and writes them
3496
// to our output stream.
3497
0
static void *sam_dispatcher_write(void *vp) {
3498
0
    htsFile *fp = vp;
3499
0
    SAM_state *fd = fp->state;
3500
0
    hts_tpool_result *r;
3501
3502
    // Iterates until result queue is shutdown, where it returns NULL.
3503
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3504
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3505
0
        if (!gl) {
3506
0
            sam_state_err(fd, ENOMEM);
3507
0
            goto err;
3508
0
        }
3509
3510
0
        if (fp->idx) {
3511
0
            sp_bams *gb = gl->bams;
3512
0
            int i = 0, count = 0;
3513
0
            while (i < gl->data_size) {
3514
0
                int j = i;
3515
0
                while (i < gl->data_size && gl->data[i] != '\n')
3516
0
                    i++;
3517
0
                if (i < gl->data_size)
3518
0
                    i++;
3519
3520
0
                if (fp->is_bgzf) {
3521
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3522
0
                        goto err;
3523
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3524
0
                        goto err;
3525
0
                } else {
3526
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3527
0
                        goto err;
3528
0
                }
3529
3530
0
                bam1_t *b = &gb->bams[count++];
3531
0
                if (fp->format.compression == bgzf) {
3532
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3533
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3534
0
                                      bgzf_tell(fp->fp.bgzf),
3535
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3536
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3537
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3538
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3539
0
                        goto err;
3540
0
                    }
3541
0
                } else {
3542
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3543
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3544
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3545
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3546
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3547
0
                        goto err;
3548
0
                    }
3549
0
                }
3550
0
            }
3551
3552
0
            assert(count == gb->nbams);
3553
3554
            // Add bam array to free-list
3555
0
            pthread_mutex_lock(&fd->lines_m);
3556
0
            gb->next = fd->bams;
3557
0
            fd->bams = gl->bams;
3558
0
            gl->bams = NULL;
3559
0
            pthread_mutex_unlock(&fd->lines_m);
3560
0
        } else {
3561
0
            if (fp->is_bgzf) {
3562
                // We keep track of how much in the current block we have
3563
                // remaining => R.  We look for the last newline in input
3564
                // [i] to [i+R], backwards => position N.
3565
                //
3566
                // If we find a newline, we write out bytes i to N.
3567
                // We know we cannot fit the next record in this bgzf block,
3568
                // so we flush what we have and copy input N to i+R into
3569
                // the start of a new block, and recompute a new R for that.
3570
                //
3571
                // If we don't find a newline (i==N) then we cannot extend
3572
                // the current block at all, so flush whatever is in it now
3573
                // if it ends on a newline.
3574
                // We still copy i(==N) to i+R to the next block and
3575
                // continue as before with a new R.
3576
                //
3577
                // The only exception on the flush is when we run out of
3578
                // data in the input.  In that case we skip it as we don't
3579
                // yet know if the next record will fit.
3580
                //
3581
                // Both conditions share the same code here:
3582
                // - Look for newline (pos N)
3583
                // - Write i to N (which maybe 0)
3584
                // - Flush if block ends on newline and not end of input
3585
                // - write N to i+R
3586
3587
0
                int i = 0;
3588
0
                BGZF *fb = fp->fp.bgzf;
3589
0
                while (i < gl->data_size) {
3590
                    // remaining space in block
3591
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3592
0
                    int eod = 0;
3593
0
                    if (R > gl->data_size-i)
3594
0
                        R = gl->data_size-i, eod = 1;
3595
3596
                    // Find last newline in input data
3597
0
                    int N = i + R;
3598
0
                    while (--N > i) {
3599
0
                        if (gl->data[N] == '\n')
3600
0
                            break;
3601
0
                    }
3602
3603
0
                    if (N != i) {
3604
                        // Found a newline
3605
0
                        N++;
3606
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3607
0
                            goto err;
3608
0
                    }
3609
3610
                    // Flush bgzf block
3611
0
                    int b_off = fb->block_offset;
3612
0
                    if (!eod && b_off &&
3613
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3614
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3615
0
                            goto err;
3616
3617
                    // Copy from N onwards into next block
3618
0
                    if (i+R > N)
3619
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3620
0
                            != i+R - N)
3621
0
                            goto err;
3622
3623
0
                    i = i+R;
3624
0
                }
3625
0
            } else {
3626
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3627
0
                    goto err;
3628
0
            }
3629
0
        }
3630
3631
0
        hts_tpool_delete_result(r, 0);
3632
3633
        // Also updated by main thread
3634
0
        pthread_mutex_lock(&fd->lines_m);
3635
0
        gl->next = fd->lines;
3636
0
        fd->lines = gl;
3637
0
        pthread_mutex_unlock(&fd->lines_m);
3638
0
    }
3639
3640
0
    sam_state_err(fd, 0); // success
3641
0
    hts_tpool_process_shutdown(fd->q);
3642
0
    return NULL;
3643
3644
0
 err:
3645
0
    sam_state_err(fd, errno ? errno : EIO);
3646
0
    return (void *)-1;
3647
0
}
3648
3649
// Run from one of the worker threads.
3650
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3651
// of text SAM records (sp_lines).
3652
0
static void *sam_format_worker(void *arg) {
3653
0
    sp_bams *gb = (sp_bams *)arg;
3654
0
    sp_lines *gl = NULL;
3655
0
    int i;
3656
0
    SAM_state *fd = gb->fd;
3657
0
    htsFile *fp = fd->fp;
3658
3659
    // Use a block of SAM strings we had earlier if available.
3660
0
    pthread_mutex_lock(&fd->lines_m);
3661
0
    if (fd->lines) {
3662
0
        gl = fd->lines;
3663
0
        fd->lines = gl->next;
3664
0
    }
3665
0
    pthread_mutex_unlock(&fd->lines_m);
3666
3667
0
    if (gl == NULL) {
3668
0
        gl = calloc(1, sizeof(*gl));
3669
0
        if (!gl) {
3670
0
            sam_state_err(fd, ENOMEM);
3671
0
            return NULL;
3672
0
        }
3673
0
        gl->alloc = gl->data_size = 0;
3674
0
        gl->data = NULL;
3675
0
    }
3676
0
    gl->serial = gb->serial;
3677
0
    gl->next = NULL;
3678
3679
0
    kstring_t ks = {0, gl->alloc, gl->data};
3680
3681
0
    for (i = 0; i < gb->nbams; i++) {
3682
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3683
0
            sam_state_err(fd, errno ? errno : EIO);
3684
0
            goto err;
3685
0
        }
3686
0
        kputc('\n', &ks);
3687
0
    }
3688
3689
0
    pthread_mutex_lock(&fd->lines_m);
3690
0
    gl->data_size = ks.l;
3691
0
    gl->alloc = ks.m;
3692
0
    gl->data = ks.s;
3693
3694
0
    if (fp->idx) {
3695
        // Keep hold of the bam array a little longer as
3696
        // sam_dispatcher_write needs to use them for building the index.
3697
0
        gl->bams = gb;
3698
0
    } else {
3699
        // Add bam array to free-list
3700
0
        gb->next = fd->bams;
3701
0
        fd->bams = gb;
3702
0
    }
3703
0
    pthread_mutex_unlock(&fd->lines_m);
3704
3705
0
    return gl;
3706
3707
0
 err:
3708
    // Possible race between this and fd->curr_bam.
3709
    // Easier to not free and leave it on the input list so it
3710
    // gets freed there instead?
3711
    // sam_free_sp_bams(gb);
3712
0
    if (gl) {
3713
0
        free(gl->data);
3714
0
        free(gl);
3715
0
    }
3716
0
    return NULL;
3717
0
}
3718
3719
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3720
0
    if (fp->state)
3721
0
        return -2;   //already exists!
3722
3723
0
    if (!(fp->state = sam_state_create(fp)))
3724
0
        return -1;
3725
0
    SAM_state *fd = (SAM_state *)fp->state;
3726
3727
0
    pthread_mutex_init(&fd->lines_m, NULL);
3728
0
    pthread_mutex_init(&fd->command_m, NULL);
3729
0
    pthread_cond_init(&fd->command_c, NULL);
3730
0
    fd->p = p->pool;
3731
0
    int qsize = p->qsize;
3732
0
    if (!qsize)
3733
0
        qsize = 2*hts_tpool_size(fd->p);
3734
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3735
0
    if (!fd->q) {
3736
0
        sam_state_destroy(fp);
3737
0
        return -1;
3738
0
    }
3739
3740
0
    if (fp->format.compression == bgzf)
3741
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3742
3743
0
    return 0;
3744
0
}
3745
3746
0
int sam_set_threads(htsFile *fp, int nthreads) {
3747
0
    if (nthreads <= 0)
3748
0
        return 0;
3749
3750
0
    htsThreadPool p;
3751
0
    p.pool = hts_tpool_init(nthreads);
3752
0
    p.qsize = nthreads*2;
3753
3754
0
    int ret = sam_set_thread_pool(fp, &p);
3755
0
    if (ret < 0) {
3756
0
        if (p.pool)
3757
0
            hts_tpool_destroy(p.pool);
3758
0
        return ret;
3759
0
    }
3760
3761
0
    SAM_state *fd = (SAM_state *)fp->state;
3762
0
    fd->own_pool = 1;
3763
3764
0
    return 0;
3765
0
}
3766
3767
0
#define UMI_TAGS 5
3768
typedef struct {
3769
    kstring_t name;
3770
    kstring_t comment; // NB: pointer into name, do not free
3771
    kstring_t seq;
3772
    kstring_t qual;
3773
    int casava;
3774
    int aux;
3775
    int rnum;
3776
    char BC[3];         // aux tag ID for barcode
3777
    char UMI[UMI_TAGS][3]; // aux tag list for UMIs.
3778
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3779
    char nprefix;
3780
    int sra_names;
3781
    regex_t regex;
3782
} fastq_state;
3783
3784
// Initialise fastq state.
3785
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3786
4.84k
static fastq_state *fastq_state_init(int name_char) {
3787
4.84k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3788
4.84k
    if (!x)
3789
0
        return NULL;
3790
4.84k
    strcpy(x->BC, "BC");
3791
4.84k
    x->nprefix = name_char;
3792
    // Default Illumina naming convention
3793
4.84k
    char *re = "^[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:[^:]+:([^:#/]+)";
3794
4.84k
    if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3795
0
        free(x);
3796
0
        return NULL;
3797
0
    }
3798
3799
4.84k
    return x;
3800
4.84k
}
3801
3802
6.46k
void fastq_state_destroy(htsFile *fp) {
3803
6.46k
    if (fp->state) {
3804
4.84k
        fastq_state *x = (fastq_state *)fp->state;
3805
4.84k
        if (x->tags)
3806
0
            kh_destroy(tag, x->tags);
3807
4.84k
        ks_free(&x->name);
3808
4.84k
        ks_free(&x->seq);
3809
4.84k
        ks_free(&x->qual);
3810
4.84k
        regfree(&x->regex);
3811
4.84k
        free(fp->state);
3812
4.84k
    }
3813
6.46k
}
3814
3815
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
3816
0
    va_list args;
3817
3818
0
    if (!fp)
3819
0
        return -1;
3820
0
    if (!fp->state)
3821
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
3822
0
                                           ? '@' : '>')))
3823
0
            return -1;
3824
3825
0
    fastq_state *x = (fastq_state *)fp->state;
3826
3827
0
    switch (opt) {
3828
0
    case FASTQ_OPT_CASAVA:
3829
0
        x->casava = 1;
3830
0
        break;
3831
3832
0
    case FASTQ_OPT_NAME2:
3833
0
        x->sra_names = 1;
3834
0
        break;
3835
3836
0
    case FASTQ_OPT_AUX: {
3837
0
        va_start(args, opt);
3838
0
        x->aux = 1;
3839
0
        char *tag = va_arg(args, char *);
3840
0
        va_end(args);
3841
0
        if (tag && strcmp(tag, "1") != 0) {
3842
0
            if (!x->tags)
3843
0
                if (!(x->tags = kh_init(tag)))
3844
0
                    return -1;
3845
3846
0
            size_t i, tlen = strlen(tag);
3847
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
3848
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
3849
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
3850
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
3851
0
                    break;
3852
0
                }
3853
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
3854
0
                kh_put(tag, x->tags, tcode, &ret);
3855
0
                if (ret < 0)
3856
0
                    return -1;
3857
0
            }
3858
0
        }
3859
0
        break;
3860
0
    }
3861
3862
0
    case FASTQ_OPT_BARCODE: {
3863
0
        va_start(args, opt);
3864
0
        char *bc = va_arg(args, char *);
3865
0
        va_end(args);
3866
0
        strncpy(x->BC, bc, 2);
3867
0
        x->BC[2] = 0;
3868
0
        break;
3869
0
    }
3870
3871
0
    case FASTQ_OPT_UMI: {
3872
        // UMI tag: an empty string disables UMI by setting x->UMI[0] to \0\0\0
3873
0
        va_start(args, opt);
3874
0
        char *bc = va_arg(args, char *), *bc_orig = bc;
3875
0
        va_end(args);
3876
0
        if (!bc || strcmp(bc, "1") == 0)
3877
0
            bc = "RX";
3878
0
        int ntags = 0, err = 0;
3879
0
        for (ntags = 0; *bc && ntags < UMI_TAGS; ntags++) {
3880
0
            if (!isalpha(bc[0]) || !isalnum_c(bc[1])) {
3881
0
                err = 1;
3882
0
                break;
3883
0
            }
3884
3885
0
            strncpy(x->UMI[ntags], bc, 3);
3886
0
            bc += 2;
3887
0
            if (*bc && *bc != ',') {
3888
0
                err = 1;
3889
0
                break;
3890
0
            }
3891
0
            bc+=(*bc==',');
3892
0
            x->UMI[ntags][2] = 0;
3893
0
        }
3894
0
        for (; ntags < UMI_TAGS; ntags++)
3895
0
            x->UMI[ntags][0] = x->UMI[ntags][1] = x->UMI[ntags][2] = 0;
3896
3897
3898
0
        if (err)
3899
0
            hts_log_warning("Bad UMI tag list '%s'", bc_orig);
3900
3901
0
        break;
3902
0
    }
3903
3904
0
    case FASTQ_OPT_UMI_REGEX: {
3905
0
        va_start(args, opt);
3906
0
        char *re = va_arg(args, char *);
3907
0
        va_end(args);
3908
3909
0
        regfree(&x->regex);
3910
0
        if (regcomp(&x->regex, re, REG_EXTENDED) != 0) {
3911
0
            hts_log_error("Regular expression '%s' is not supported", re);
3912
0
            return -1;
3913
0
        }
3914
0
        break;
3915
0
    }
3916
3917
0
    case FASTQ_OPT_RNUM:
3918
0
        x->rnum = 1;
3919
0
        break;
3920
3921
0
    default:
3922
0
        break;
3923
0
    }
3924
0
    return 0;
3925
0
}
3926
3927
16.5M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
3928
16.5M
    fastq_state *x = (fastq_state *)fp->state;
3929
16.5M
    size_t i, l;
3930
16.5M
    int ret = 0;
3931
3932
16.5M
    if (fp->format.format == fasta_format && fp->line.s) {
3933
        // For FASTA we've already read the >name line; steal it
3934
        // Not the most efficient, but we don't optimise for fasta reading.
3935
16.5M
        if (fp->line.l == 0)
3936
3.60k
            return -1; // EOF
3937
3938
16.5M
        free(x->name.s);
3939
16.5M
        x->name = fp->line;
3940
16.5M
        fp->line.l = fp->line.m = 0;
3941
16.5M
        fp->line.s = NULL;
3942
16.5M
    } else {
3943
        // Read a FASTQ format entry.
3944
5.38k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
3945
5.38k
        if (ret == -1)
3946
72
            return -1;  // EOF
3947
5.31k
        else if (ret < -1)
3948
111
            return ret; // ERR
3949
5.38k
    }
3950
3951
    // Name
3952
16.5M
    if (*x->name.s != x->nprefix)
3953
66
        return -2;
3954
3955
    // Reverse the SRA strangeness of putting the run_name.number before
3956
    // the read name.
3957
16.5M
    i = 0;
3958
16.5M
    char *name = x->name.s+1;
3959
16.5M
    if (x->sra_names) {
3960
0
        char *cp = strpbrk(x->name.s, " \t");
3961
0
        if (cp) {
3962
0
            while (*cp == ' ' || *cp == '\t')
3963
0
                cp++;
3964
0
            *--cp = '@';
3965
0
            i = cp - x->name.s;
3966
0
            name = cp+1;
3967
0
        }
3968
0
    }
3969
3970
16.5M
    l = x->name.l;
3971
16.5M
    char *s = x->name.s;
3972
46.6M
    while (i < l && !isspace_c(s[i]))
3973
30.1M
        i++;
3974
16.5M
    if (i < l) {
3975
111k
        s[i] = 0;
3976
111k
        x->name.l = i++;
3977
111k
    }
3978
3979
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
3980
17.0M
    while (i < l && isspace_c(s[i]))
3981
567k
        i++;
3982
16.5M
    x->comment.s = s+i;
3983
16.5M
    x->comment.l = l - i;
3984
3985
    // Seq
3986
16.5M
    x->seq.l = 0;
3987
68.6M
    for (;;) {
3988
68.6M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
3989
4.48k
            if (fp->format.format == fastq_format || ret < -1)
3990
819
                return -2;
3991
68.6M
        if (ret == -1 ||
3992
68.6M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
3993
16.5M
            break;
3994
52.1M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
3995
0
            return -2;
3996
52.1M
    }
3997
3998
    // Qual
3999
16.5M
    if (fp->format.format == fastq_format) {
4000
570
        size_t remainder = x->seq.l;
4001
570
        x->qual.l = 0;
4002
58.8k
        do {
4003
58.8k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
4004
21
                return -2;
4005
58.8k
            if (fp->line.l > remainder)
4006
36
                return -2;
4007
58.7k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4008
0
                return -2;
4009
58.7k
            remainder -= fp->line.l;
4010
58.7k
        } while (remainder > 0);
4011
4012
        // Decr qual
4013
809k
        for (i = 0; i < x->qual.l; i++)
4014
808k
            x->qual.s[i] -= '!';
4015
513
    }
4016
4017
16.5M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4018
16.5M
    if (x->name.l > 2 &&
4019
1.26M
        x->name.s[x->name.l-2] == '/' &&
4020
68.9k
        isdigit_c(x->name.s[x->name.l-1])) {
4021
67.4k
        switch(x->name.s[x->name.l-1]) {
4022
6.17k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4023
46.9k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4024
14.3k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4025
67.4k
        }
4026
67.4k
        x->name.s[x->name.l-=2] = 0;
4027
67.4k
    }
4028
4029
    // Strip Illumina formatted UMI off read-name
4030
16.5M
    char UMI_seq[256]; // maximum length in spec
4031
16.5M
    size_t UMI_len = 0;
4032
16.5M
    if (x->UMI[0][0]) {
4033
0
        regmatch_t match[3];
4034
0
        if (regexec(&x->regex, x->name.s, 2, match, 0) == 0
4035
0
            && match[0].rm_so >= 0     // whole regex
4036
0
            && match[1].rm_so >= 0) {  // bracketted UMI component
4037
0
            UMI_len = match[1].rm_eo - match[1].rm_so;
4038
0
            if (UMI_len > 255) {
4039
0
                hts_log_error("SAM read name is too long");
4040
0
                return -2;
4041
0
            }
4042
4043
            // The SAMTags spec recommends (but not requires) separating
4044
            // barcodes with hyphen ('-').
4045
0
            size_t i;
4046
0
            for (i = 0; i < UMI_len; i++)
4047
0
                UMI_seq[i] = isalpha_c(x->name.s[i+match[1].rm_so])
4048
0
                    ? x->name.s[i+match[1].rm_so]
4049
0
                    : '-';
4050
4051
            // Move any trailing #num earlier in the name
4052
0
            if (UMI_len) {
4053
0
                UMI_seq[UMI_len++] = 0;
4054
4055
0
                x->name.l = match[1].rm_so;
4056
0
                if (x->name.l > 0 && x->name.s[x->name.l-1] == ':')
4057
0
                    x->name.l--; // remove colon too
4058
0
                char *cp = x->name.s + match[1].rm_eo;
4059
0
                while (*cp)
4060
0
                    x->name.s[x->name.l++] = *cp++;
4061
0
                x->name.s[x->name.l] = 0;
4062
0
            }
4063
0
        }
4064
0
    }
4065
4066
    // Convert to BAM
4067
16.5M
    ret = bam_set1(b,
4068
16.5M
                   x->name.s + x->name.l - name, name,
4069
16.5M
                   flag,
4070
16.5M
                   -1, -1, 0, // ref '*', pos, mapq,
4071
16.5M
                   0, NULL,     // no cigar,
4072
16.5M
                   -1, -1, 0,    // mate
4073
16.5M
                   x->seq.l, x->seq.s, x->qual.s,
4074
16.5M
                   0);
4075
16.5M
    if (ret < 0) return -2;
4076
4077
    // Add UMI tag if removed from read-name above
4078
16.5M
    if (UMI_len) {
4079
0
        if (bam_aux_append(b, x->UMI[0], 'Z', UMI_len, (uint8_t *)UMI_seq) < 0)
4080
0
            ret = -2;
4081
0
    }
4082
4083
    // Identify Illumina CASAVA strings.
4084
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4085
16.5M
    char *barcode = NULL;
4086
16.5M
    int barcode_len = 0;
4087
16.5M
    kstring_t *kc = &x->comment;
4088
16.5M
    char *endptr;
4089
16.5M
    if (x->casava &&
4090
        // \d:[YN]:\d+:[ACGTN]+
4091
0
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4092
0
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4093
0
        && *endptr == ':') {
4094
4095
        // read num
4096
0
        switch(kc->s[0]) {
4097
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4098
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4099
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4100
0
        }
4101
4102
0
        if (kc->s[2] == 'Y')
4103
0
            b->core.flag |= BAM_FQCFAIL;
4104
4105
        // Barcode, maybe numeric in which case we skip it
4106
0
        if (!isdigit_c(endptr[1])) {
4107
0
            barcode = endptr+1;
4108
0
            for (i = barcode - kc->s; i < kc->l; i++)
4109
0
                if (isspace_c(kc->s[i]))
4110
0
                    break;
4111
4112
0
            kc->s[i] = 0;
4113
0
            barcode_len = i+1-(barcode - kc->s);
4114
0
        }
4115
0
    }
4116
4117
16.5M
    if (ret >= 0 && barcode_len)
4118
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4119
0
            ret = -2;
4120
4121
16.5M
    if (!x->aux)
4122
16.5M
        return ret;
4123
4124
    // Identify any SAM style aux tags in comments too.
4125
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4126
0
        ret = -2;
4127
4128
0
    return ret;
4129
16.5M
}
4130
4131
// Internal component of sam_read1 below
4132
3.05k
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4133
3.05k
    int ret = bam_read1(fp->fp.bgzf, b);
4134
3.05k
    if (h && ret >= 0) {
4135
1.85k
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4136
1.79k
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4137
124
            errno = ERANGE;
4138
124
            return -3;
4139
124
        }
4140
1.85k
    }
4141
2.92k
    return ret;
4142
3.05k
}
4143
4144
// Internal component of sam_read1 below
4145
5.78k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4146
5.78k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4147
5.78k
    if (ret < 0)
4148
5.78k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4149
4150
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4151
0
        return -2;
4152
4153
0
    return ret;
4154
0
}
4155
4156
// Internal component of sam_read1 below
4157
10.5k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4158
10.5k
    int ret;
4159
4160
    // Consume 1st line after header parsing as it wasn't using peek
4161
10.5k
    if (fp->line.l != 0) {
4162
57
        ret = sam_parse1(&fp->line, h, b);
4163
57
        fp->line.l = 0;
4164
57
        return ret;
4165
57
    }
4166
4167
10.4k
    if (fp->state) {
4168
0
        SAM_state *fd = (SAM_state *)fp->state;
4169
4170
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4171
            // We don't support multi-threaded SAM parsing with seeks yet.
4172
0
            int ret;
4173
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4174
0
                errno = -ret;
4175
0
                return -2;
4176
0
            }
4177
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4178
0
                return -2;
4179
0
            fp->fp.bgzf->seeked = 0;
4180
0
            goto err_recover;
4181
0
        }
4182
4183
0
        if (!fd->h) {
4184
0
            fd->h = h;
4185
0
            fd->h->ref_count++;
4186
            // Ensure hrecs is initialised now as we don't want multiple
4187
            // threads trying to do this simultaneously.
4188
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4189
0
                return -2;
4190
4191
            // We can only do this once we've got a header
4192
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4193
0
                               fp) != 0)
4194
0
                return -2;
4195
0
            fd->dispatcher_set = 1;
4196
0
        }
4197
4198
0
        if (fd->h != h) {
4199
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4200
0
            return -2;
4201
0
        }
4202
4203
0
        sp_bams *gb = fd->curr_bam;
4204
0
        if (!gb) {
4205
0
            if (fd->errcode) {
4206
                // In case reader failed
4207
0
                errno = fd->errcode;
4208
0
                return -2;
4209
0
            }
4210
4211
0
            pthread_mutex_lock(&fd->command_m);
4212
0
            int cmd = fd->command;
4213
0
            pthread_mutex_unlock(&fd->command_m);
4214
0
            if (cmd == SAM_AT_EOF)
4215
0
                return -1;
4216
4217
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4218
0
            if (!r)
4219
0
                return -2;
4220
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4221
0
            hts_tpool_delete_result(r, 0);
4222
0
        }
4223
0
        if (!gb) {
4224
0
            pthread_mutex_lock(&fd->command_m);
4225
0
            fd->command = SAM_AT_EOF;
4226
0
            pthread_mutex_unlock(&fd->command_m);
4227
0
            return fd->errcode ? -2 : -1;
4228
0
        }
4229
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4230
0
        if (fd->curr_idx < gb->nbams)
4231
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4232
0
                return -2;
4233
0
        if (fd->curr_idx == gb->nbams) {
4234
0
            pthread_mutex_lock(&fd->lines_m);
4235
0
            gb->next = fd->bams;
4236
0
            fd->bams = gb;
4237
0
            pthread_mutex_unlock(&fd->lines_m);
4238
4239
0
            fd->curr_bam = NULL;
4240
0
            fd->curr_idx = 0;
4241
        // Consider prefetching next record?  I.e.
4242
        // } else {
4243
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4244
0
        }
4245
4246
0
        ret = 0;
4247
4248
10.4k
    } else  {
4249
10.4k
    err_recover:
4250
10.4k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4251
10.4k
        if (ret < 0) return ret;
4252
4253
8.14k
        ret = sam_parse1(&fp->line, h, b);
4254
8.14k
        fp->line.l = 0;
4255
8.14k
        if (ret < 0) {
4256
2.05k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4257
2.05k
            if (h && h->ignore_sam_err) goto err_recover;
4258
2.05k
        }
4259
8.14k
    }
4260
4261
8.14k
    return ret;
4262
10.4k
}
4263
4264
// Returns 0 on success,
4265
//        -1 on EOF,
4266
//       <-1 on error
4267
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4268
16.5M
{
4269
16.5M
    int ret, pass_filter;
4270
4271
16.5M
    do {
4272
16.5M
        switch (fp->format.format) {
4273
3.05k
        case bam:
4274
3.05k
            ret = sam_read1_bam(fp, h, b);
4275
3.05k
            break;
4276
4277
5.78k
        case cram:
4278
5.78k
            ret = sam_read1_cram(fp, h, &b);
4279
5.78k
            break;
4280
4281
10.5k
        case sam:
4282
10.5k
            ret = sam_read1_sam(fp, h, b);
4283
10.5k
            break;
4284
4285
16.5M
        case fasta_format:
4286
16.5M
        case fastq_format: {
4287
16.5M
            fastq_state *x = (fastq_state *)fp->state;
4288
16.5M
            if (!x) {
4289
4.84k
                if (!(fp->state = fastq_state_init(fp->format.format
4290
4.84k
                                                   == fastq_format ? '@' : '>')))
4291
0
                    return -2;
4292
4.84k
            }
4293
4294
16.5M
            return fastq_parse1(fp, b);
4295
16.5M
        }
4296
4297
0
        case empty_format:
4298
0
            errno = EPIPE;
4299
0
            return -3;
4300
4301
0
        default:
4302
0
            errno = EFTYPE;
4303
0
            return -3;
4304
16.5M
        }
4305
4306
19.3k
        pass_filter = (ret >= 0 && fp->filter)
4307
19.3k
            ? sam_passes_filter(h, b, fp->filter)
4308
19.3k
            : 1;
4309
19.3k
    } while (pass_filter == 0);
4310
4311
19.3k
    return pass_filter < 0 ? -2 : ret;
4312
16.5M
}
4313
4314
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4315
// this code isn't vectorised and runs far slower than is necessary (even
4316
// with the restrict keyword being used).
4317
static inline void HTS_OPT3
4318
221
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4319
221
    uint32_t i;
4320
270k
    for (i = 0; i < len; i++)
4321
270k
        a[i] = b[i]+33;
4322
221
}
4323
4324
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4325
5.50M
{
4326
5.50M
    int i, r = 0;
4327
5.50M
    uint8_t *s, *end;
4328
5.50M
    const bam1_core_t *c = &b->core;
4329
4330
5.50M
    if (c->l_qname == 0)
4331
0
        return -1;
4332
5.50M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4333
5.50M
    r |= kputc_('\t', str); // query name
4334
5.50M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4335
5.50M
    if (c->tid >= 0) { // chr
4336
364
        r |= kputs(h->target_name[c->tid] , str);
4337
364
        r |= kputc_('\t', str);
4338
5.50M
    } else r |= kputsn_("*\t", 2, str);
4339
5.50M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4340
5.50M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4341
5.50M
    if (c->n_cigar) { // cigar
4342
1.45k
        uint32_t *cigar = bam_get_cigar(b);
4343
7.59k
        for (i = 0; i < c->n_cigar; ++i) {
4344
6.14k
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4345
6.14k
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4346
6.14k
        }
4347
5.50M
    } else r |= kputc_('*', str);
4348
5.50M
    r |= kputc_('\t', str);
4349
5.50M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4350
125
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4351
4
    else {
4352
4
        r |= kputs(h->target_name[c->mtid], str);
4353
4
        r |= kputc_('\t', str);
4354
4
    }
4355
5.50M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4356
5.50M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4357
5.50M
    if (c->l_qseq) { // seq and qual
4358
156k
        uint8_t *s = bam_get_seq(b);
4359
156k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4360
156k
        char *cp = str->s + str->l;
4361
4362
        // Sequence, 2 bases at a time
4363
156k
        nibble2base(s, cp, c->l_qseq);
4364
156k
        cp[c->l_qseq] = '\t';
4365
156k
        cp += c->l_qseq+1;
4366
4367
        // Quality
4368
156k
        s = bam_get_qual(b);
4369
156k
        i = 0;
4370
156k
        if (s[0] == 0xff) {
4371
156k
            cp[i++] = '*';
4372
156k
        } else {
4373
221
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4374
221
            i = c->l_qseq;
4375
221
        }
4376
156k
        cp[i] = 0;
4377
156k
        cp += i;
4378
156k
        str->l = cp - str->s;
4379
5.35M
    } else r |= kputsn_("*\t*", 3, str);
4380
4381
5.50M
    s = bam_get_aux(b); // aux
4382
5.50M
    end = b->data + b->l_data;
4383
4384
6.12M
    while (end - s >= 4) {
4385
619k
        r |= kputc_('\t', str);
4386
619k
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4387
264
            goto bad_aux;
4388
619k
    }
4389
5.50M
    r |= kputsn("", 0, str); // nul terminate
4390
5.50M
    if (r < 0) goto mem_err;
4391
4392
5.50M
    return str->l;
4393
4394
264
 bad_aux:
4395
264
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4396
264
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4397
264
    errno = EINVAL;
4398
264
    return -1;
4399
4400
0
 mem_err:
4401
0
    hts_log_error("Out of memory");
4402
0
    errno = ENOMEM;
4403
0
    return -1;
4404
5.50M
}
4405
4406
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4407
5.50M
{
4408
5.50M
    str->l = 0;
4409
5.50M
    return sam_format1_append(h, b, str);
4410
5.50M
}
4411
4412
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4413
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4414
0
{
4415
0
    unsigned flag = b->core.flag;
4416
0
    int i, e = 0, len = b->core.l_qseq;
4417
0
    uint8_t *seq, *qual;
4418
4419
0
    str->l = 0;
4420
4421
    // Name
4422
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4423
0
        return -1;
4424
4425
    // UMI tag
4426
0
    if (x && *x->UMI[0]) {
4427
        // Temporary copy of '#num' if present
4428
0
        char plex[256];
4429
0
        size_t len = str->l;
4430
0
        while (len && str->s[len] != ':' && str->s[len] != '#')
4431
0
            len--;
4432
4433
0
        if (str->s[len] == '#' && str->l - len < 255) {
4434
0
            memcpy(plex, &str->s[len], str->l - len);
4435
0
            plex[str->l - len] = 0;
4436
0
            str->l = len;
4437
0
        } else {
4438
0
            *plex = 0;
4439
0
        }
4440
4441
0
        uint8_t *bc = NULL;
4442
0
        int n;
4443
0
        for (n = 0; !bc && n < UMI_TAGS; n++)
4444
0
            bc = bam_aux_get(b, x->UMI[n]);
4445
0
        if (bc && *bc == 'Z') {
4446
0
            int err = kputc(':', str) < 0;
4447
            // Replace any non-alpha with '+'
4448
0
            while (*++bc)
4449
0
                err |= kputc(isalpha_c(*bc) ? toupper_c(*bc) : '+', str) < 0;
4450
0
            if (err)
4451
0
                return -1;
4452
0
        }
4453
4454
0
        if (*plex && kputs(plex, str) < 0)
4455
0
            return -1;
4456
0
    }
4457
4458
    // /1 or /2 suffix
4459
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4460
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4461
0
        if (r12 == BAM_FREAD1) {
4462
0
            if (kputs("/1", str) == EOF)
4463
0
                return -1;
4464
0
        } else if (r12 == BAM_FREAD2) {
4465
0
            if (kputs("/2", str) == EOF)
4466
0
                return -1;
4467
0
        }
4468
0
    }
4469
4470
    // Illumina CASAVA tag.
4471
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4472
0
    if (x && x->casava) {
4473
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4474
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4475
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4476
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4477
0
                     bc ? (char *)bc+1 : "0") < 0)
4478
0
            return -1;
4479
4480
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4481
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4482
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4483
0
            str->s[str->l-1] = '0';
4484
0
            str->s[str->l] = 0;
4485
0
            bc = NULL;
4486
0
        }
4487
4488
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4489
0
        if (bc) {
4490
0
            int l = strlen((char *)bc+1);
4491
0
            char *c = (char *)str->s + str->l - l;
4492
0
            for (i = 0; i < l; i++) {
4493
0
                if (!isalpha_c(c[i]))
4494
0
                    c[i] = '+';
4495
0
                else if (islower_c(c[i]))
4496
0
                    c[i] = toupper_c(c[i]);
4497
0
            }
4498
0
        }
4499
0
    }
4500
4501
    // Aux tags
4502
0
    if (x && x->aux) {
4503
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4504
0
        while (s && end - s >= 4) {
4505
0
            int tt = s[0]*256 + s[1];
4506
0
            if (x->tags == NULL ||
4507
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4508
0
                e |= kputc_('\t', str) < 0;
4509
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4510
0
                    return -1;
4511
0
            } else {
4512
0
                s = skip_aux(s+2, end);
4513
0
            }
4514
0
        }
4515
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4516
0
    }
4517
4518
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4519
0
    e |= kputc_('\n', str) < 0;
4520
4521
    // Seq line
4522
0
    seq = bam_get_seq(b);
4523
0
    if (flag & BAM_FREVERSE)
4524
0
        for (i = len-1; i >= 0; i--)
4525
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4526
0
    else
4527
0
        for (i = 0; i < len; i++)
4528
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4529
4530
4531
    // Qual line
4532
0
    if (x->nprefix == '@') {
4533
0
        kputsn("\n+\n", 3, str);
4534
0
        qual = bam_get_qual(b);
4535
0
        if (qual[0] == 0xff)
4536
0
            for (i = 0; i < len; i++)
4537
0
                e |= kputc_('B', str) < 0;
4538
0
        else if (flag & BAM_FREVERSE)
4539
0
            for (i = len-1; i >= 0; i--)
4540
0
                e |= kputc_(33 + qual[i], str) < 0;
4541
0
        else
4542
0
            for (i = 0; i < len; i++)
4543
0
                e |= kputc_(33 + qual[i], str) < 0;
4544
4545
0
    }
4546
0
    e |= kputc('\n', str) < 0;
4547
4548
0
    return e ? -1 : str->l;
4549
0
}
4550
4551
// Sadly we need to be able to modify the bam_hdr here so we can
4552
// reference count the structure.
4553
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4554
16.5M
{
4555
16.5M
    switch (fp->format.format) {
4556
0
    case binary_format:
4557
0
        fp->format.category = sequence_data;
4558
0
        fp->format.format = bam;
4559
        /* fall-through */
4560
5.50M
    case bam:
4561
5.50M
        return bam_write_idx1(fp, h, b);
4562
4563
5.50M
    case cram:
4564
5.50M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4565
4566
0
    case text_format:
4567
0
        fp->format.category = sequence_data;
4568
0
        fp->format.format = sam;
4569
        /* fall-through */
4570
5.50M
    case sam:
4571
5.50M
        if (fp->state) {
4572
0
            SAM_state *fd = (SAM_state *)fp->state;
4573
4574
            // Threaded output
4575
0
            if (!fd->h) {
4576
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4577
                // just data pointed to by it (which is a bit weasely still),
4578
                // but out cached pointer must be non-const as we want to
4579
                // destroy it later on and sam_hdr_destroy takes non-const.
4580
                //
4581
                // We do this because some tools do sam_hdr_destroy; sam_close
4582
                // while others do sam_close; sam_hdr_destroy.  The former is
4583
                // an issue as we need the header still when flushing.
4584
0
                fd->h = (sam_hdr_t *)h;
4585
0
                fd->h->ref_count++;
4586
4587
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4588
0
                                   fp) != 0)
4589
0
                    return -2;
4590
0
                fd->dispatcher_set = 1;
4591
0
            }
4592
4593
0
            if (fd->h != h) {
4594
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4595
0
                return -2;
4596
0
            }
4597
4598
            // Find a suitable BAM array to copy to
4599
0
            sp_bams *gb = fd->curr_bam;
4600
0
            if (!gb) {
4601
0
                pthread_mutex_lock(&fd->lines_m);
4602
0
                if (fd->bams) {
4603
0
                    fd->curr_bam = gb = fd->bams;
4604
0
                    fd->bams = gb->next;
4605
0
                    gb->next = NULL;
4606
0
                    gb->nbams = 0;
4607
0
                    gb->bam_mem = 0;
4608
0
                    pthread_mutex_unlock(&fd->lines_m);
4609
0
                } else {
4610
0
                    pthread_mutex_unlock(&fd->lines_m);
4611
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4612
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4613
0
                        free(gb);
4614
0
                        return -1;
4615
0
                    }
4616
0
                    gb->nbams = 0;
4617
0
                    gb->abams = SAM_NBAM;
4618
0
                    gb->bam_mem = 0;
4619
0
                    gb->fd = fd;
4620
0
                    fd->curr_idx = 0;
4621
0
                    fd->curr_bam = gb;
4622
0
                }
4623
0
            }
4624
4625
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4626
0
                return -2;
4627
0
            gb->bam_mem += b->l_data + sizeof(*b);
4628
4629
            // Dispatch if full
4630
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4631
0
                gb->serial = fd->serial++;
4632
0
                pthread_mutex_lock(&fd->command_m);
4633
0
                if (fd->errcode != 0) {
4634
0
                    pthread_mutex_unlock(&fd->command_m);
4635
0
                    return -fd->errcode;
4636
0
                }
4637
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4638
0
                                        cleanup_sp_bams,
4639
0
                                        cleanup_sp_lines, 0) < 0) {
4640
0
                    pthread_mutex_unlock(&fd->command_m);
4641
0
                    return -1;
4642
0
                }
4643
0
                pthread_mutex_unlock(&fd->command_m);
4644
0
                fd->curr_bam = NULL;
4645
0
            }
4646
4647
            // Dummy value as we don't know how long it really is.
4648
            // We could track file sizes via a SAM_state field, but I don't think
4649
            // it is necessary.
4650
0
            return 1;
4651
5.50M
        } else {
4652
5.50M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4653
5.50M
            kputc('\n', &fp->line);
4654
5.50M
            if (fp->is_bgzf) {
4655
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4656
0
                    return -1;
4657
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4658
5.50M
            } else {
4659
5.50M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4660
5.50M
            }
4661
4662
5.50M
            if (fp->idx) {
4663
0
                if (fp->format.compression == bgzf) {
4664
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4665
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4666
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4667
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4668
0
                        return -1;
4669
0
                    }
4670
0
                } else {
4671
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4672
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4673
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4674
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4675
0
                        return -1;
4676
0
                    }
4677
0
                }
4678
0
            }
4679
4680
5.50M
            return fp->line.l;
4681
5.50M
        }
4682
4683
4684
0
    case fasta_format:
4685
0
    case fastq_format: {
4686
0
        fastq_state *x = (fastq_state *)fp->state;
4687
0
        if (!x) {
4688
0
            if (!(fp->state = fastq_state_init(fp->format.format
4689
0
                                               == fastq_format ? '@' : '>')))
4690
0
                return -2;
4691
0
        }
4692
4693
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4694
0
            return -1;
4695
0
        if (fp->is_bgzf) {
4696
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4697
0
                return -1;
4698
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4699
0
                return -1;
4700
0
        } else {
4701
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4702
0
                return -1;
4703
0
        }
4704
0
        return fp->line.l;
4705
0
    }
4706
4707
0
    default:
4708
0
        errno = EBADF;
4709
0
        return -1;
4710
16.5M
    }
4711
16.5M
}
4712
4713
/************************
4714
 *** Auxiliary fields ***
4715
 ************************/
4716
#ifndef HTS_LITTLE_ENDIAN
4717
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4718
    int tsz = aux_type2size(type);
4719
4720
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4721
4722
    switch (tsz) {
4723
        case 'H': case 'Z': case 1:  // Trivial
4724
            memcpy(out, in, len);
4725
            break;
4726
4727
#define aux_val_to_le(type_t, store_le) do {                            \
4728
        type_t v;                                                       \
4729
        size_t i;                                                       \
4730
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4731
            memcpy(&v, in + i, sizeof(type_t));                         \
4732
            store_le(v, out);                                           \
4733
        }                                                               \
4734
    } while (0)
4735
4736
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4737
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4738
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4739
4740
#undef aux_val_to_le
4741
4742
        case 'B': { // Recurse!
4743
            uint32_t n;
4744
            if (len < 5) return -1;
4745
            memcpy(&n, in + 1, 4);
4746
            out[0] = in[0];
4747
            u32_to_le(n, out + 1);
4748
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4749
        }
4750
4751
        default: // Unknown type code
4752
            return -1;
4753
    }
4754
4755
4756
4757
    return 0;
4758
}
4759
#endif
4760
4761
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4762
0
{
4763
0
    uint32_t new_len;
4764
4765
0
    assert(b->l_data >= 0);
4766
0
    new_len = b->l_data + 3 + len;
4767
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4768
4769
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4770
4771
0
    b->data[b->l_data] = tag[0];
4772
0
    b->data[b->l_data + 1] = tag[1];
4773
0
    b->data[b->l_data + 2] = type;
4774
4775
0
#ifdef HTS_LITTLE_ENDIAN
4776
0
    memcpy(b->data + b->l_data + 3, data, len);
4777
#else
4778
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4779
        errno = EINVAL;
4780
        return -1;
4781
    }
4782
#endif
4783
4784
0
    b->l_data = new_len;
4785
4786
0
    return 0;
4787
4788
0
 nomem:
4789
0
    errno = ENOMEM;
4790
0
    return -1;
4791
0
}
4792
4793
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4794
621k
{
4795
621k
    int size;
4796
621k
    uint32_t n;
4797
621k
    if (s >= end) return end;
4798
621k
    size = aux_type2size(*s); ++s; // skip type
4799
621k
    switch (size) {
4800
3.86k
    case 'Z':
4801
4.12k
    case 'H':
4802
4.12k
        s = memchr(s, 0, end-s);
4803
4.12k
        return s ? s+1 : end;
4804
7.59k
    case 'B':
4805
7.59k
        if (end - s < 5) return NULL;
4806
7.57k
        size = aux_type2size(*s); ++s;
4807
7.57k
        n = le_to_u32(s);
4808
7.57k
        s += 4;
4809
7.57k
        if (size == 0 || end - s < size * n) return NULL;
4810
7.54k
        return s + size * n;
4811
465
    case 0:
4812
465
        return NULL;
4813
609k
    default:
4814
609k
        if (end - s < size) return NULL;
4815
609k
        return s + size;
4816
621k
    }
4817
621k
}
4818
4819
uint8_t *bam_aux_first(const bam1_t *b)
4820
5.50M
{
4821
5.50M
    uint8_t *s = bam_get_aux(b);
4822
5.50M
    uint8_t *end = b->data + b->l_data;
4823
5.50M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4824
2.53k
    return s+2;
4825
5.50M
}
4826
4827
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4828
621k
{
4829
621k
    uint8_t *end = b->data + b->l_data;
4830
621k
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4831
621k
    if (next == NULL) goto bad_aux;
4832
620k
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4833
618k
    return next+2;
4834
4835
514
 bad_aux:
4836
514
    hts_log_error("Corrupted aux data for read %s flag %d",
4837
514
                  bam_get_qname(b), b->core.flag);
4838
514
    errno = EINVAL;
4839
514
    return NULL;
4840
620k
}
4841
4842
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4843
5.50M
{
4844
5.50M
    uint8_t *s;
4845
6.13M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4846
621k
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4847
            // Check the tag value is valid and complete
4848
74
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4849
74
            if (e == NULL) goto bad_aux;
4850
65
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4851
4852
60
            return s;
4853
65
        }
4854
4855
    // errno now as set by bam_aux_first()/bam_aux_next()
4856
5.50M
    return NULL;
4857
4858
14
 bad_aux:
4859
14
    hts_log_error("Corrupted aux data for read %s flag %d",
4860
14
                  bam_get_qname(b), b->core.flag);
4861
14
    errno = EINVAL;
4862
14
    return NULL;
4863
5.50M
}
4864
4865
int bam_aux_del(bam1_t *b, uint8_t *s)
4866
0
{
4867
0
    s = bam_aux_remove(b, s);
4868
0
    return (s || errno == ENOENT)? 0 : -1;
4869
0
}
4870
4871
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4872
0
{
4873
0
    uint8_t *end = b->data + b->l_data;
4874
0
    uint8_t *next = skip_aux(s, end);
4875
0
    if (next == NULL) goto bad_aux;
4876
4877
0
    b->l_data -= next - (s-2);
4878
0
    if (next >= end) { errno = ENOENT; return NULL; }
4879
4880
0
    memmove(s-2, next, end - next);
4881
0
    return s;
4882
4883
0
 bad_aux:
4884
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4885
0
                  bam_get_qname(b), b->core.flag);
4886
0
    errno = EINVAL;
4887
0
    return NULL;
4888
0
}
4889
4890
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4891
0
{
4892
    // FIXME: This is not at all efficient!
4893
0
    size_t ln = len >= 0 ? len : strlen(data) + 1;
4894
0
    size_t old_ln = 0;
4895
0
    int need_nul = ln == 0 || data[ln - 1] != '\0';
4896
0
    int save_errno = errno;
4897
0
    int new_tag = 0;
4898
0
    uint8_t *s = bam_aux_get(b,tag), *e;
4899
4900
0
    if (s) {  // Replacing existing tag
4901
0
        char type = *s;
4902
0
        if (type != 'Z') {
4903
0
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4904
0
            errno = EINVAL;
4905
0
            return -1;
4906
0
        }
4907
0
        s++;
4908
0
        e = memchr(s, '\0', b->data + b->l_data - s);
4909
0
        old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4910
0
        s -= 3;
4911
0
    } else {
4912
0
        if (errno != ENOENT) { // Invalid aux data, give up
4913
0
            return -1;
4914
0
        } else { // Tag doesn't exist - put it on the end
4915
0
            errno = save_errno;
4916
0
            s = b->data + b->l_data;
4917
0
            new_tag = 3;
4918
0
        }
4919
0
    }
4920
4921
0
    if (old_ln < ln + need_nul + new_tag) {
4922
0
        ptrdiff_t s_offset = s - b->data;
4923
0
        if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
4924
0
            return -1;
4925
0
        s = b->data + s_offset;
4926
0
    }
4927
0
    if (!new_tag) {
4928
0
        memmove(s + 3 + ln + need_nul,
4929
0
                s + 3 + old_ln,
4930
0
                b->l_data - (s + 3 - b->data) - old_ln);
4931
0
    }
4932
0
    b->l_data += new_tag + ln + need_nul - old_ln;
4933
4934
0
    s[0] = tag[0];
4935
0
    s[1] = tag[1];
4936
0
    s[2] = 'Z';
4937
0
    memmove(s+3,data,ln);
4938
0
    if (need_nul) s[3 + ln] = '\0';
4939
0
    return 0;
4940
0
}
4941
4942
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
4943
0
{
4944
0
    uint32_t sz, old_sz = 0, new = 0;
4945
0
    uint8_t *s, type;
4946
4947
0
    if (val < INT32_MIN || val > UINT32_MAX) {
4948
0
        errno = EOVERFLOW;
4949
0
        return -1;
4950
0
    }
4951
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
4952
0
    else if (val < INT8_MIN)   { type = 's'; sz = 2; }
4953
0
    else if (val < 0)          { type = 'c'; sz = 1; }
4954
0
    else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
4955
0
    else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
4956
0
    else                       { type = 'I'; sz = 4; }
4957
4958
0
    s = bam_aux_get(b, tag);
4959
0
    if (s) {  // Tag present - how big was the old one?
4960
0
        switch (*s) {
4961
0
            case 'c': case 'C': old_sz = 1; break;
4962
0
            case 's': case 'S': old_sz = 2; break;
4963
0
            case 'i': case 'I': old_sz = 4; break;
4964
0
            default: errno = EINVAL; return -1;  // Not an integer
4965
0
        }
4966
0
    } else {
4967
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
4968
0
            s = b->data + b->l_data;
4969
0
            new = 1;
4970
0
        }  else { // Invalid aux data, give up.
4971
0
            return -1;
4972
0
        }
4973
0
    }
4974
4975
0
    if (new || old_sz < sz) {
4976
        // Make room for new tag
4977
0
        ptrdiff_t s_offset = s - b->data;
4978
0
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
4979
0
            return -1;
4980
0
        s =  b->data + s_offset;
4981
0
        if (new) { // Add tag id
4982
0
            *s++ = tag[0];
4983
0
            *s++ = tag[1];
4984
0
        } else {   // Shift following data so we have space
4985
0
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
4986
0
        }
4987
0
    } else {
4988
        // Reuse old space.  Data value may be bigger than necessary but
4989
        // we avoid having to move everything else
4990
0
        sz = old_sz;
4991
0
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
4992
0
        assert(type > 0);
4993
0
    }
4994
0
    *s++ = type;
4995
0
#ifdef HTS_LITTLE_ENDIAN
4996
0
    memcpy(s, &val, sz);
4997
#else
4998
    switch (sz) {
4999
        case 4:  u32_to_le(val, s); break;
5000
        case 2:  u16_to_le(val, s); break;
5001
        default: *s = val; break;
5002
    }
5003
#endif
5004
0
    b->l_data += (new ? 3 : 0) + sz - old_sz;
5005
0
    return 0;
5006
0
}
5007
5008
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
5009
0
{
5010
0
    uint8_t *s = bam_aux_get(b, tag);
5011
0
    int shrink = 0, new = 0;
5012
5013
0
    if (s) { // Tag present - what was it?
5014
0
        switch (*s) {
5015
0
            case 'f': break;
5016
0
            case 'd': shrink = 1; break;
5017
0
            default: errno = EINVAL; return -1;  // Not a float
5018
0
        }
5019
0
    } else {
5020
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5021
0
            new = 1;
5022
0
        }  else { // Invalid aux data, give up.
5023
0
            return -1;
5024
0
        }
5025
0
    }
5026
5027
0
    if (new) { // Ensure there's room
5028
0
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
5029
0
            return -1;
5030
0
        s = b->data + b->l_data;
5031
0
        *s++ = tag[0];
5032
0
        *s++ = tag[1];
5033
0
    } else if (shrink) { // Convert non-standard double tag to float
5034
0
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
5035
0
        b->l_data -= 4;
5036
0
    }
5037
0
    *s++ = 'f';
5038
0
    float_to_le(val, s);
5039
0
    if (new) b->l_data += 7;
5040
5041
0
    return 0;
5042
0
}
5043
5044
int bam_aux_update_array(bam1_t *b, const char tag[2],
5045
                         uint8_t type, uint32_t items, void *data)
5046
0
{
5047
0
    uint8_t *s = bam_aux_get(b, tag);
5048
0
    size_t old_sz = 0, new_sz;
5049
0
    int new = 0;
5050
5051
0
    if (s) { // Tag present
5052
0
        if (*s != 'B') { errno = EINVAL; return -1; }
5053
0
        old_sz = aux_type2size(s[1]);
5054
0
        if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
5055
0
        old_sz *= le_to_u32(s + 2);
5056
0
    } else {
5057
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5058
0
            s = b->data + b->l_data;
5059
0
            new = 1;
5060
0
        }  else { // Invalid aux data, give up.
5061
0
            return -1;
5062
0
        }
5063
0
    }
5064
5065
0
    new_sz = aux_type2size(type);
5066
0
    if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
5067
0
    if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
5068
0
    new_sz *= items;
5069
5070
0
    if (new || old_sz < new_sz) {
5071
        // Make room for new tag
5072
0
        ptrdiff_t s_offset = s - b->data;
5073
0
        if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
5074
0
            return -1;
5075
0
        s =  b->data + s_offset;
5076
0
    }
5077
0
    if (new) { // Add tag id and type
5078
0
        *s++ = tag[0];
5079
0
        *s++ = tag[1];
5080
0
        *s = 'B';
5081
0
        b->l_data += 8 + new_sz;
5082
0
    } else if (old_sz != new_sz) { // shift following data if necessary
5083
0
        memmove(s + 6 + new_sz, s + 6 + old_sz,
5084
0
                b->l_data - ((s + 6 + old_sz) - b->data));
5085
0
        b->l_data -= old_sz;
5086
0
        b->l_data += new_sz;
5087
0
    }
5088
5089
0
    s[1] = type;
5090
0
    u32_to_le(items, s + 2);
5091
0
    if (new_sz > 0) {
5092
0
#ifdef HTS_LITTLE_ENDIAN
5093
0
        memcpy(s + 6, data, new_sz);
5094
#else
5095
        return aux_to_le(type, s + 6, data, new_sz);
5096
#endif
5097
0
    }
5098
0
    return 0;
5099
0
}
5100
5101
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
5102
                                      uint32_t idx)
5103
0
{
5104
0
    switch (type) {
5105
0
        case 'c': return le_to_i8(s + idx);
5106
0
        case 'C': return s[idx];
5107
0
        case 's': return le_to_i16(s + 2 * idx);
5108
0
        case 'S': return le_to_u16(s + 2 * idx);
5109
0
        case 'i': return le_to_i32(s + 4 * idx);
5110
0
        case 'I': return le_to_u32(s + 4 * idx);
5111
0
        default:
5112
0
            errno = EINVAL;
5113
0
            return 0;
5114
0
    }
5115
0
}
5116
5117
int64_t bam_aux2i(const uint8_t *s)
5118
0
{
5119
0
    int type;
5120
0
    type = *s++;
5121
0
    return get_int_aux_val(type, s, 0);
5122
0
}
5123
5124
double bam_aux2f(const uint8_t *s)
5125
0
{
5126
0
    int type;
5127
0
    type = *s++;
5128
0
    if (type == 'd') return le_to_double(s);
5129
0
    else if (type == 'f') return le_to_float(s);
5130
0
    else return get_int_aux_val(type, s, 0);
5131
0
}
5132
5133
char bam_aux2A(const uint8_t *s)
5134
0
{
5135
0
    int type;
5136
0
    type = *s++;
5137
0
    if (type == 'A') return *(char*)s;
5138
0
    errno = EINVAL;
5139
0
    return 0;
5140
0
}
5141
5142
char *bam_aux2Z(const uint8_t *s)
5143
0
{
5144
0
    int type;
5145
0
    type = *s++;
5146
0
    if (type == 'Z' || type == 'H') return (char*)s;
5147
0
    errno = EINVAL;
5148
0
    return 0;
5149
0
}
5150
5151
uint32_t bam_auxB_len(const uint8_t *s)
5152
0
{
5153
0
    if (s[0] != 'B') {
5154
0
        errno = EINVAL;
5155
0
        return 0;
5156
0
    }
5157
0
    return le_to_u32(s + 2);
5158
0
}
5159
5160
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
5161
0
{
5162
0
    uint32_t len = bam_auxB_len(s);
5163
0
    if (idx >= len) {
5164
0
        errno = ERANGE;
5165
0
        return 0;
5166
0
    }
5167
0
    return get_int_aux_val(s[1], s + 6, idx);
5168
0
}
5169
5170
double bam_auxB2f(const uint8_t *s, uint32_t idx)
5171
0
{
5172
0
    uint32_t len = bam_auxB_len(s);
5173
0
    if (idx >= len) {
5174
0
        errno = ERANGE;
5175
0
        return 0.0;
5176
0
    }
5177
0
    if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
5178
0
    else return get_int_aux_val(s[1], s + 6, idx);
5179
0
}
5180
5181
int sam_open_mode(char *mode, const char *fn, const char *format)
5182
0
{
5183
    // TODO Parse "bam5" etc for compression level
5184
0
    if (format == NULL) {
5185
        // Try to pick a format based on the filename extension
5186
0
        char extension[HTS_MAX_EXT_LEN];
5187
0
        if (find_file_extension(fn, extension) < 0) return -1;
5188
0
        return sam_open_mode(mode, fn, extension);
5189
0
    }
5190
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5191
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5192
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5193
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5194
0
    else if (strcasecmp(format, "fastq") == 0 ||
5195
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5196
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5197
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5198
0
    else if (strcasecmp(format, "fasta") == 0 ||
5199
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5200
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5201
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5202
0
    else return -1;
5203
5204
0
    return 0;
5205
0
}
5206
5207
// A version of sam_open_mode that can handle ,key=value options.
5208
// The format string is allocated and returned, to be freed by the caller.
5209
// Prefix should be "r" or "w",
5210
char *sam_open_mode_opts(const char *fn,
5211
                         const char *mode,
5212
                         const char *format)
5213
0
{
5214
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5215
0
                             (mode   ? strlen(mode)   : 1) + 12);
5216
0
    char *opts, *cp;
5217
0
    int format_len;
5218
5219
0
    if (!mode_opts)
5220
0
        return NULL;
5221
5222
0
    strcpy(mode_opts, mode ? mode : "r");
5223
0
    cp = mode_opts + strlen(mode_opts);
5224
5225
0
    if (format == NULL) {
5226
        // Try to pick a format based on the filename extension
5227
0
        char extension[HTS_MAX_EXT_LEN];
5228
0
        if (find_file_extension(fn, extension) < 0) {
5229
0
            free(mode_opts);
5230
0
            return NULL;
5231
0
        }
5232
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5233
0
            return mode_opts;
5234
0
        } else {
5235
0
            free(mode_opts);
5236
0
            return NULL;
5237
0
        }
5238
0
    }
5239
5240
0
    if ((opts = strchr(format, ','))) {
5241
0
        format_len = opts-format;
5242
0
    } else {
5243
0
        opts="";
5244
0
        format_len = strlen(format);
5245
0
    }
5246
5247
0
    if (strncmp(format, "bam", format_len) == 0) {
5248
0
        *cp++ = 'b';
5249
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5250
0
        *cp++ = 'c';
5251
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5252
0
        *cp++ = 'c';
5253
0
        strcpy(cp, ",VERSION=2.1");
5254
0
        cp += 12;
5255
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5256
0
        *cp++ = 'c';
5257
0
        strcpy(cp, ",VERSION=3.0");
5258
0
        cp += 12;
5259
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5260
0
        ; // format mode=""
5261
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5262
0
        *cp++ = 'z';
5263
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5264
0
               strncmp(format, "fq", format_len) == 0) {
5265
0
        *cp++ = 'f';
5266
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5267
0
               strncmp(format, "fq.gz", format_len) == 0) {
5268
0
        *cp++ = 'f';
5269
0
        *cp++ = 'z';
5270
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5271
0
               strncmp(format, "fa", format_len) == 0) {
5272
0
        *cp++ = 'F';
5273
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5274
0
               strncmp(format, "fa", format_len) == 0) {
5275
0
        *cp++ = 'F';
5276
0
        *cp++ = 'z';
5277
0
    } else {
5278
0
        free(mode_opts);
5279
0
        return NULL;
5280
0
    }
5281
5282
0
    strcpy(cp, opts);
5283
5284
0
    return mode_opts;
5285
0
}
5286
5287
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5288
int bam_str2flag(const char *str)
5289
0
{
5290
0
    char *end, *beg = (char*) str;
5291
0
    long int flag = strtol(str, &end, 0);
5292
0
    if ( end!=str ) return flag;    // the conversion was successful
5293
0
    flag = 0;
5294
0
    while ( *str )
5295
0
    {
5296
0
        end = beg;
5297
0
        while ( *end && *end!=',' ) end++;
5298
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5299
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5300
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5301
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5302
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5303
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5304
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5305
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5306
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5307
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5308
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5309
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5310
0
        else return -1;
5311
0
        if ( !*end ) break;
5312
0
        beg = end + 1;
5313
0
    }
5314
0
    return flag;
5315
0
}
5316
5317
char *bam_flag2str(int flag)
5318
0
{
5319
0
    kstring_t str = {0,0,0};
5320
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5321
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5322
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5323
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5324
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5325
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5326
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5327
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5328
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5329
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5330
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5331
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5332
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5333
0
    return str.s;
5334
0
}
5335
5336
5337
/**************************
5338
 *** Pileup and Mpileup ***
5339
 **************************/
5340
5341
#if !defined(BAM_NO_PILEUP)
5342
5343
#include <assert.h>
5344
5345
/*******************
5346
 *** Memory pool ***
5347
 *******************/
5348
5349
typedef struct {
5350
    int k, y;
5351
    hts_pos_t x, end;
5352
} cstate_t;
5353
5354
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5355
5356
typedef struct __linkbuf_t {
5357
    bam1_t b;
5358
    hts_pos_t beg, end;
5359
    cstate_t s;
5360
    struct __linkbuf_t *next;
5361
    bam_pileup_cd cd;
5362
} lbnode_t;
5363
5364
typedef struct {
5365
    int cnt, n, max;
5366
    lbnode_t **buf;
5367
} mempool_t;
5368
5369
static mempool_t *mp_init(void)
5370
0
{
5371
0
    mempool_t *mp;
5372
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5373
0
    return mp;
5374
0
}
5375
static void mp_destroy(mempool_t *mp)
5376
0
{
5377
0
    int k;
5378
0
    for (k = 0; k < mp->n; ++k) {
5379
0
        free(mp->buf[k]->b.data);
5380
0
        free(mp->buf[k]);
5381
0
    }
5382
0
    free(mp->buf);
5383
0
    free(mp);
5384
0
}
5385
static inline lbnode_t *mp_alloc(mempool_t *mp)
5386
0
{
5387
0
    ++mp->cnt;
5388
0
    if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5389
0
    else return mp->buf[--mp->n];
5390
0
}
5391
static inline void mp_free(mempool_t *mp, lbnode_t *p)
5392
0
{
5393
0
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
5394
0
    if (mp->n == mp->max) {
5395
0
        mp->max = mp->max? mp->max<<1 : 256;
5396
0
        mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5397
0
    }
5398
0
    mp->buf[mp->n++] = p;
5399
0
}
5400
5401
/**********************
5402
 *** CIGAR resolver ***
5403
 **********************/
5404
5405
/* s->k: the index of the CIGAR operator that has just been processed.
5406
   s->x: the reference coordinate of the start of s->k
5407
   s->y: the query coordinate of the start of s->k
5408
 */
5409
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5410
0
{
5411
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5412
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5413
5414
0
    bam1_t *b = p->b;
5415
0
    bam1_core_t *c = &b->core;
5416
0
    uint32_t *cigar = bam_get_cigar(b);
5417
0
    int k;
5418
    // determine the current CIGAR operation
5419
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5420
0
    if (s->k == -1) { // never processed
5421
0
        p->qpos = 0;
5422
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5423
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5424
0
        } else { // find the first match or deletion
5425
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5426
0
                int op = _cop(cigar[k]);
5427
0
                int l = _cln(cigar[k]);
5428
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5429
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5430
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5431
0
            }
5432
0
            assert(k < c->n_cigar);
5433
0
            s->k = k;
5434
0
        }
5435
0
    } else { // the read has been processed before
5436
0
        int op, l = _cln(cigar[s->k]);
5437
0
        if (pos - s->x >= l) { // jump to the next operation
5438
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5439
0
            op = _cop(cigar[s->k+1]);
5440
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5441
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5442
0
                s->x += l;
5443
0
                ++s->k;
5444
0
            } else { // find the next M/D/N/=/X
5445
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5446
0
                s->x += l;
5447
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5448
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5449
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5450
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5451
0
                }
5452
0
                s->k = k;
5453
0
            }
5454
0
            assert(s->k < c->n_cigar); // otherwise a bug
5455
0
        } // else, do nothing
5456
0
    }
5457
0
    { // collect pileup information
5458
0
        int op, l;
5459
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5460
0
        p->is_del = p->indel = p->is_refskip = 0;
5461
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5462
0
            int op2 = _cop(cigar[s->k+1]);
5463
0
            int l2 = _cln(cigar[s->k+1]);
5464
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5465
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5466
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5467
                // and rely on is_del=1 as we would for 3D.
5468
0
                p->indel = -(int)l2;
5469
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5470
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5471
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5472
0
                    else break;
5473
0
                }
5474
0
            } else if (op2 == BAM_CINS) {
5475
0
                p->indel = l2;
5476
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5477
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5478
0
                    if (op2 == BAM_CINS) p->indel += l2;
5479
0
                    else if (op2 != BAM_CPAD) break;
5480
0
                }
5481
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5482
0
                int l3 = 0;
5483
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5484
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5485
0
                    if (op2 == BAM_CINS) l3 += l2;
5486
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5487
0
                }
5488
0
                if (l3 > 0) p->indel = l3;
5489
0
            }
5490
0
        }
5491
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5492
0
            p->qpos = s->y + (pos - s->x);
5493
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5494
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5495
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5496
0
        } // cannot be other operations; otherwise a bug
5497
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5498
0
    }
5499
0
    p->cigar_ind = s->k;
5500
0
    return 1;
5501
0
}
5502
5503
/*******************************
5504
 *** Expansion of insertions ***
5505
 *******************************/
5506
5507
/*
5508
 * Fills out the kstring with the padded insertion sequence for the current
5509
 * location in 'p'.  If this is not an insertion site, the string is blank.
5510
 *
5511
 * This variant handles base modifications, but only when "m" is non-NULL.
5512
 *
5513
 * Returns the number of inserted base on success, with string length being
5514
 *        accessable via ins->l;
5515
 *        -1 on failure.
5516
 */
5517
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5518
                          hts_base_mod_state *m,
5519
0
                          kstring_t *ins, int *del_len) {
5520
0
    int j, k, indel, nb = 0;
5521
0
    uint32_t *cigar;
5522
5523
0
    if (p->indel <= 0) {
5524
0
        if (ks_resize(ins, 1) < 0)
5525
0
            return -1;
5526
0
        ins->l = 0;
5527
0
        ins->s[0] = '\0';
5528
0
        return 0;
5529
0
    }
5530
5531
0
    if (del_len)
5532
0
        *del_len = 0;
5533
5534
    // Measure indel length including pads
5535
0
    indel = 0;
5536
0
    k = p->cigar_ind+1;
5537
0
    cigar = bam_get_cigar(p->b);
5538
0
    while (k < p->b->core.n_cigar) {
5539
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5540
0
        case BAM_CPAD:
5541
0
        case BAM_CINS:
5542
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5543
0
            break;
5544
0
        default:
5545
0
            k = p->b->core.n_cigar;
5546
0
            break;
5547
0
        }
5548
0
        k++;
5549
0
    }
5550
0
    nb = ins->l = indel;
5551
5552
    // Produce sequence
5553
0
    if (ks_resize(ins, indel+1) < 0)
5554
0
        return -1;
5555
0
    indel = 0;
5556
0
    k = p->cigar_ind+1;
5557
0
    j = 1;
5558
0
    while (k < p->b->core.n_cigar) {
5559
0
        int l, c;
5560
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5561
0
        case BAM_CPAD:
5562
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5563
0
                ins->s[indel++] = '*';
5564
0
            break;
5565
0
        case BAM_CINS:
5566
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5567
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5568
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5569
0
                                            p->qpos + j - p->is_del)]
5570
0
                    : 'N';
5571
0
                ins->s[indel++] = c;
5572
0
                int nm;
5573
0
                hts_base_mod mod[256];
5574
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5575
0
                                                m, mod, 256)) > 0) {
5576
0
                    int o_indel = indel;
5577
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5578
0
                        return -1;
5579
0
                    ins->s[indel++] = '[';
5580
0
                    int j;
5581
0
                    for (j = 0; j < nm; j++) {
5582
0
                        char qual[20];
5583
0
                        if (mod[j].qual >= 0)
5584
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5585
0
                        else
5586
0
                            *qual=0;
5587
0
                        if (mod[j].modified_base < 0)
5588
                            // ChEBI
5589
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5590
0
                                              "%c(%d)%s",
5591
0
                                              "+-"[mod[j].strand],
5592
0
                                              -mod[j].modified_base,
5593
0
                                              qual);
5594
0
                        else
5595
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5596
0
                                              "%c%c%s",
5597
0
                                              "+-"[mod[j].strand],
5598
0
                                              mod[j].modified_base,
5599
0
                                              qual);
5600
0
                    }
5601
0
                    ins->s[indel++] = ']';
5602
0
                    ins->l += indel - o_indel; // grow by amount we used
5603
0
                }
5604
0
            }
5605
0
            break;
5606
0
        case BAM_CDEL:
5607
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5608
0
            if (del_len)
5609
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5610
            // fall through
5611
0
        default:
5612
0
            k = p->b->core.n_cigar;
5613
0
            break;
5614
0
        }
5615
0
        k++;
5616
0
    }
5617
0
    ins->s[indel] = '\0';
5618
0
    ins->l = indel; // string length
5619
5620
0
    return nb;      // base length
5621
0
}
5622
5623
/*
5624
 * Fills out the kstring with the padded insertion sequence for the current
5625
 * location in 'p'.  If this is not an insertion site, the string is blank.
5626
 *
5627
 * This is the original interface with no capability for reporting base
5628
 * modifications.
5629
 *
5630
 * Returns the length of insertion string on success;
5631
 *        -1 on failure.
5632
 */
5633
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5634
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5635
0
}
5636
5637
/***********************
5638
 *** Pileup iterator ***
5639
 ***********************/
5640
5641
// Dictionary of overlapping reads
5642
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5643
typedef khash_t(olap_hash) olap_hash_t;
5644
5645
struct bam_plp_s {
5646
    mempool_t *mp;
5647
    lbnode_t *head, *tail;
5648
    int32_t tid, max_tid;
5649
    hts_pos_t pos, max_pos;
5650
    int is_eof, max_plp, error, maxcnt;
5651
    uint64_t id;
5652
    bam_pileup1_t *plp;
5653
    // for the "auto" interface only
5654
    bam1_t *b;
5655
    bam_plp_auto_f func;
5656
    void *data;
5657
    olap_hash_t *overlaps;
5658
5659
    // For notification of creation and destruction events
5660
    // and associated client-owned pointer.
5661
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5662
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5663
};
5664
5665
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5666
0
{
5667
0
    bam_plp_t iter;
5668
0
    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5669
0
    iter->mp = mp_init();
5670
0
    iter->head = iter->tail = mp_alloc(iter->mp);
5671
0
    iter->max_tid = iter->max_pos = -1;
5672
0
    iter->maxcnt = 8000;
5673
0
    if (func) {
5674
0
        iter->func = func;
5675
0
        iter->data = data;
5676
0
        iter->b = bam_init1();
5677
0
    }
5678
0
    return iter;
5679
0
}
5680
5681
int bam_plp_init_overlaps(bam_plp_t iter)
5682
0
{
5683
0
    iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5684
0
    return iter->overlaps ? 0 : -1;
5685
0
}
5686
5687
void bam_plp_destroy(bam_plp_t iter)
5688
0
{
5689
0
    lbnode_t *p, *pnext;
5690
0
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5691
0
    for (p = iter->head; p != NULL; p = pnext) {
5692
0
        if (iter->plp_destruct && p != iter->tail)
5693
0
            iter->plp_destruct(iter->data, &p->b, &p->cd);
5694
0
        pnext = p->next;
5695
0
        mp_free(iter->mp, p);
5696
0
    }
5697
0
    mp_destroy(iter->mp);
5698
0
    if (iter->b) bam_destroy1(iter->b);
5699
0
    free(iter->plp);
5700
0
    free(iter);
5701
0
}
5702
5703
void bam_plp_constructor(bam_plp_t plp,
5704
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5705
0
    plp->plp_construct = func;
5706
0
}
5707
5708
void bam_plp_destructor(bam_plp_t plp,
5709
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5710
0
    plp->plp_destruct = func;
5711
0
}
5712
5713
//---------------------------------
5714
//---  Tweak overlapping reads
5715
//---------------------------------
5716
5717
/**
5718
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5719
 *  cigar_iref2iseq_next() - get the next CMATCH base
5720
 *  @cigar:       pointer to current cigar block (rw)
5721
 *  @cigar_max:   pointer just beyond the last cigar block
5722
 *  @icig:        position within the current cigar block (rw)
5723
 *  @iseq:        position in the sequence (rw)
5724
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5725
 *
5726
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5727
 *  or -2 on error.
5728
 */
5729
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5730
                                      const uint32_t *cigar_max,
5731
                                      hts_pos_t *icig,
5732
                                      hts_pos_t *iseq,
5733
                                      hts_pos_t *iref)
5734
0
{
5735
0
    hts_pos_t pos = *iref;
5736
0
    if ( pos < 0 ) return -1;
5737
0
    *icig = 0;
5738
0
    *iseq = 0;
5739
0
    *iref = 0;
5740
0
    while ( *cigar<cigar_max )
5741
0
    {
5742
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5743
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5744
5745
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5746
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5747
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5748
0
        {
5749
0
            pos -= ncig;
5750
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5751
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5752
0
            continue;
5753
0
        }
5754
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5755
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5756
0
        {
5757
0
            pos -= ncig;
5758
0
            if ( pos<0 ) pos = 0;
5759
0
            (*cigar)++; *icig = 0; *iref += ncig;
5760
0
            continue;
5761
0
        }
5762
0
        hts_log_error("Unexpected cigar %d", cig);
5763
0
        return -2;
5764
0
    }
5765
0
    *iseq = -1;
5766
0
    return -1;
5767
0
}
5768
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5769
                                       const uint32_t *cigar_max,
5770
                                       hts_pos_t *icig,
5771
                                       hts_pos_t *iseq,
5772
                                       hts_pos_t *iref)
5773
0
{
5774
0
    while ( *cigar < cigar_max )
5775
0
    {
5776
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5777
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5778
5779
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5780
0
        {
5781
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5782
0
            (*iseq)++; (*icig)++; (*iref)++;
5783
0
            return BAM_CMATCH;
5784
0
        }
5785
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5786
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5787
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5788
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5789
0
        hts_log_error("Unexpected cigar %d", cig);
5790
0
        return -2;
5791
0
    }
5792
0
    *iseq = -1;
5793
0
    *iref = -1;
5794
0
    return -1;
5795
0
}
5796
5797
// Given overlapping read 'a' (left) and 'b' (right) on the same
5798
// template, adjust quality values to zero for either a or b.
5799
// Note versions 1.12 and earlier always removed quality from 'b' for
5800
// matching bases.  Now we select a or b semi-randomly based on name hash.
5801
// Returns 0 on success,
5802
//        -1 on failure
5803
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5804
0
{
5805
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5806
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5807
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5808
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5809
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5810
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5811
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5812
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5813
5814
0
    hts_pos_t iref   = b->core.pos;
5815
0
    hts_pos_t a_iref = iref - a->core.pos;
5816
0
    hts_pos_t b_iref = iref - b->core.pos;
5817
5818
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5819
0
                                    &a_icig, &a_iseq, &a_iref);
5820
0
    if ( a_ret<0 )
5821
        // no overlap or error
5822
0
        return a_ret<-1 ? -1:0;
5823
5824
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5825
0
                                    &b_icig, &b_iseq, &b_iref);
5826
0
    if ( b_ret<0 )
5827
        // no overlap or error
5828
0
        return b_ret<-1 ? -1:0;
5829
5830
    // Determine which seq is the one getting modified qualities.
5831
0
    uint8_t amul, bmul;
5832
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5833
0
        amul = 1;
5834
0
        bmul = 0;
5835
0
    } else {
5836
0
        amul = 0;
5837
0
        bmul = 1;
5838
0
    }
5839
5840
    // Loop over the overlapping region nulling qualities in either
5841
    // seq a or b.
5842
0
    int err = 0;
5843
0
    while ( 1 ) {
5844
        // Step to next matching reference position in a and b
5845
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5846
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5847
0
                                         &a_icig, &a_iseq, &a_iref);
5848
0
        if ( a_ret<0 ) { // done
5849
0
            err = a_ret<-1?-1:0;
5850
0
            break;
5851
0
        }
5852
5853
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5854
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5855
0
                                         &b_iseq, &b_iref);
5856
0
        if ( b_ret<0 ) { // done
5857
0
            err = b_ret<-1?-1:0;
5858
0
            break;
5859
0
        }
5860
5861
0
        if ( iref < a_iref + a->core.pos )
5862
0
            iref = a_iref + a->core.pos;
5863
5864
0
        if ( iref < b_iref + b->core.pos )
5865
0
            iref = b_iref + b->core.pos;
5866
5867
0
        iref++;
5868
5869
        // If A or B has a deletion then we catch up the other to this point.
5870
        // We also amend quality values using the same rules for mismatch.
5871
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5872
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5873
0
                && b_cigar > bam_get_cigar(b)
5874
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5875
                // Del in B means it's moved on further than A
5876
0
                do {
5877
0
                    a_qual[a_iseq] = amul
5878
0
                        ? a_qual[a_iseq]*0.8
5879
0
                        : 0;
5880
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5881
0
                                                 &a_icig, &a_iseq, &a_iref);
5882
0
                    if (a_ret < 0)
5883
0
                        return -(a_ret<-1); // 0 or -1
5884
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5885
0
            } else if (a_cigar > bam_get_cigar(a)
5886
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5887
                // Del in A means it's moved on further than B
5888
0
                do {
5889
0
                    b_qual[b_iseq] = bmul
5890
0
                        ? b_qual[b_iseq]*0.8
5891
0
                        : 0;
5892
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5893
0
                                                 &b_icig, &b_iseq, &b_iref);
5894
0
                    if (b_ret < 0)
5895
0
                        return -(b_ret<-1); // 0 or -1
5896
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5897
0
            } else {
5898
                // Anything else, eg ref-skip, we don't support here
5899
0
                continue;
5900
0
            }
5901
0
        }
5902
5903
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5904
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5905
        //         a_cigar-bam_get_cigar(a), a_icig,
5906
        //         b_cigar-bam_get_cigar(b), b_icig,
5907
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5908
        //         a_iseq, b_iseq);
5909
5910
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5911
            // Fell off end of sequence, bad CIGAR?
5912
0
            return -1;
5913
5914
        // We're finally at the same ref base in both a and b.
5915
        // Check if the bases match (confident) or mismatch
5916
        // (not so confident).
5917
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5918
            // We are very confident about this base.  Use sum of quals
5919
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5920
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5921
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5922
0
        } else {
5923
            // Not so confident about anymore given the mismatch.
5924
            // Reduce qual for lowest quality base.
5925
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5926
                // A highest qual base; keep
5927
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5928
0
                b_qual[b_iseq] = 0;
5929
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5930
                // B highest qual base; keep
5931
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5932
0
                a_qual[a_iseq] = 0;
5933
0
            } else {
5934
                // Both equal, so pick randomly
5935
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5936
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5937
0
            }
5938
0
        }
5939
0
    }
5940
5941
0
    return err;
5942
0
}
5943
5944
// Fix overlapping reads. Simple soft-clipping did not give good results.
5945
// Lowering qualities of unwanted bases is more selective and works better.
5946
//
5947
// Returns 0 on success, -1 on failure
5948
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5949
0
{
5950
0
    if ( !iter->overlaps ) return 0;
5951
5952
    // mapped mates and paired reads only
5953
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5954
5955
    // no overlap possible, unless some wild cigar
5956
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5957
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5958
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5959
0
       ) return 0;
5960
5961
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5962
0
    if ( kitr==kh_end(iter->overlaps) )
5963
0
    {
5964
        // Only add reads where the mate is still to arrive
5965
0
        if (node->b.core.mpos >= node->b.core.pos ||
5966
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5967
0
            int ret;
5968
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5969
0
            if (ret < 0) return -1;
5970
0
            kh_value(iter->overlaps, kitr) = node;
5971
0
        }
5972
0
    }
5973
0
    else
5974
0
    {
5975
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5976
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5977
0
        kh_del(olap_hash, iter->overlaps, kitr);
5978
0
        assert(a->end-1 == a->s.end);
5979
0
        return err;
5980
0
    }
5981
0
    return 0;
5982
0
}
5983
5984
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
5985
0
{
5986
0
    if ( !iter->overlaps ) return;
5987
5988
0
    khiter_t kitr;
5989
0
    if ( b )
5990
0
    {
5991
0
        if ( b->core.flag&BAM_FUNMAP || !(b->core.flag&BAM_FPROPER_PAIR) ) //no need
5992
0
            return;
5993
5994
0
        kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
5995
0
        if ( kitr!=kh_end(iter->overlaps) )
5996
0
            kh_del(olap_hash, iter->overlaps, kitr);
5997
0
    }
5998
0
    else
5999
0
    {
6000
        // remove all
6001
0
        for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
6002
0
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
6003
0
    }
6004
0
}
6005
6006
6007
6008
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
6009
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
6010
// buffer yet (the current position is still the maximum position across all buffered reads).
6011
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6012
0
{
6013
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6014
0
    *_n_plp = 0;
6015
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6016
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6017
0
        int n_plp = 0;
6018
        // write iter->plp at iter->pos
6019
0
        lbnode_t **pptr = &iter->head;
6020
0
        while (*pptr != iter->tail) {
6021
0
            if ((*pptr)->next)
6022
0
                hts_prefetch((*pptr)->next);
6023
0
            lbnode_t *p = *pptr;
6024
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6025
0
                overlap_remove(iter, &p->b);
6026
0
                if (iter->plp_destruct)
6027
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6028
0
                *pptr = p->next; mp_free(iter->mp, p);
6029
0
            }
6030
0
            else {
6031
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6032
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6033
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6034
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6035
0
                    }
6036
0
                    iter->plp[n_plp].b = &p->b;
6037
0
                    iter->plp[n_plp].cd = p->cd;
6038
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6039
0
                }
6040
0
                pptr = &(*pptr)->next;
6041
0
            }
6042
0
        }
6043
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6044
        // update iter->tid and iter->pos
6045
0
        if (iter->head != iter->tail) {
6046
0
            if (iter->tid > iter->head->b.core.tid) {
6047
0
                hts_log_error("Unsorted input. Pileup aborts");
6048
0
                iter->error = 1;
6049
0
                *_n_plp = -1;
6050
0
                return NULL;
6051
0
            }
6052
0
        }
6053
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6054
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6055
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6056
0
            iter->pos = iter->head->beg; // jump to the next position
6057
0
        } else ++iter->pos; // scan contiguously
6058
        // return
6059
0
        if (n_plp) return iter->plp;
6060
0
        if (iter->is_eof && iter->head == iter->tail) break;
6061
0
    }
6062
0
    return NULL;
6063
0
}
6064
6065
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6066
0
{
6067
0
    hts_pos_t pos64 = 0;
6068
0
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
6069
0
    if (pos64 < INT_MAX) {
6070
0
        *_pos = pos64;
6071
0
    } else {
6072
0
        hts_log_error("Position %"PRId64" too large", pos64);
6073
0
        *_pos = INT_MAX;
6074
0
        iter->error = 1;
6075
0
        *_n_plp = -1;
6076
0
        return NULL;
6077
0
    }
6078
0
    return p;
6079
0
}
6080
6081
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
6082
0
{
6083
0
    if (iter->error) return -1;
6084
0
    if (b) {
6085
0
        if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
6086
        // Skip only unmapped reads here, any additional filtering must be done in iter->func
6087
0
        if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
6088
0
        if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
6089
0
        {
6090
0
            overlap_remove(iter, b);
6091
0
            return 0;
6092
0
        }
6093
0
        if (bam_copy1(&iter->tail->b, b) == NULL)
6094
0
            return -1;
6095
0
        iter->tail->b.id = iter->id++;
6096
0
        iter->tail->beg = b->core.pos;
6097
        // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
6098
0
        iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
6099
0
        iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
6100
0
        if (b->core.tid < iter->max_tid) {
6101
0
            hts_log_error("The input is not sorted (chromosomes out of order)");
6102
0
            iter->error = 1;
6103
0
            return -1;
6104
0
        }
6105
0
        if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
6106
0
            hts_log_error("The input is not sorted (reads out of order)");
6107
0
            iter->error = 1;
6108
0
            return -1;
6109
0
        }
6110
0
        iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
6111
0
        if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
6112
0
            lbnode_t *next = mp_alloc(iter->mp);
6113
0
            if (!next) {
6114
0
                iter->error = 1;
6115
0
                return -1;
6116
0
            }
6117
0
            if (iter->plp_construct) {
6118
0
                if (iter->plp_construct(iter->data, &iter->tail->b,
6119
0
                                        &iter->tail->cd) < 0) {
6120
0
                    mp_free(iter->mp, next);
6121
0
                    iter->error = 1;
6122
0
                    return -1;
6123
0
                }
6124
0
            }
6125
0
            if (overlap_push(iter, iter->tail) < 0) {
6126
0
                mp_free(iter->mp, next);
6127
0
                iter->error = 1;
6128
0
                return -1;
6129
0
            }
6130
0
            iter->tail->next = next;
6131
0
            iter->tail = iter->tail->next;
6132
0
        }
6133
0
    } else iter->is_eof = 1;
6134
0
    return 0;
6135
0
}
6136
6137
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6138
0
{
6139
0
    const bam_pileup1_t *plp;
6140
0
    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
6141
0
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6142
0
    else { // no pileup line can be obtained; read alignments
6143
0
        *_n_plp = 0;
6144
0
        if (iter->is_eof) return 0;
6145
0
        int ret;
6146
0
        while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
6147
0
            if (bam_plp_push(iter, iter->b) < 0) {
6148
0
                *_n_plp = -1;
6149
0
                return 0;
6150
0
            }
6151
0
            if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6152
            // otherwise no pileup line can be returned; read the next alignment.
6153
0
        }
6154
0
        if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
6155
0
        if (bam_plp_push(iter, 0) < 0) {
6156
0
            *_n_plp = -1;
6157
0
            return 0;
6158
0
        }
6159
0
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6160
0
        return 0;
6161
0
    }
6162
0
}
6163
6164
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6165
0
{
6166
0
    hts_pos_t pos64 = 0;
6167
0
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
6168
0
    if (pos64 < INT_MAX) {
6169
0
        *_pos = pos64;
6170
0
    } else {
6171
0
        hts_log_error("Position %"PRId64" too large", pos64);
6172
0
        *_pos = INT_MAX;
6173
0
        iter->error = 1;
6174
0
        *_n_plp = -1;
6175
0
        return NULL;
6176
0
    }
6177
0
    return p;
6178
0
}
6179
6180
void bam_plp_reset(bam_plp_t iter)
6181
0
{
6182
0
    overlap_remove(iter, NULL);
6183
0
    iter->max_tid = iter->max_pos = -1;
6184
0
    iter->tid = iter->pos = 0;
6185
0
    iter->is_eof = 0;
6186
0
    while (iter->head != iter->tail) {
6187
0
        lbnode_t *p = iter->head;
6188
0
        iter->head = p->next;
6189
0
        mp_free(iter->mp, p);
6190
0
    }
6191
0
}
6192
6193
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
6194
0
{
6195
0
    iter->maxcnt = maxcnt;
6196
0
}
6197
6198
/************************
6199
 *** Mpileup iterator ***
6200
 ************************/
6201
6202
struct bam_mplp_s {
6203
    int n;
6204
    int32_t min_tid, *tid;
6205
    hts_pos_t min_pos, *pos;
6206
    bam_plp_t *iter;
6207
    int *n_plp;
6208
    const bam_pileup1_t **plp;
6209
};
6210
6211
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
6212
0
{
6213
0
    int i;
6214
0
    bam_mplp_t iter;
6215
0
    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
6216
0
    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
6217
0
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
6218
0
    iter->n_plp = (int*)calloc(n, sizeof(int));
6219
0
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
6220
0
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
6221
0
    iter->n = n;
6222
0
    iter->min_pos = HTS_POS_MAX;
6223
0
    iter->min_tid = (uint32_t)-1;
6224
0
    for (i = 0; i < n; ++i) {
6225
0
        iter->iter[i] = bam_plp_init(func, data[i]);
6226
0
        iter->pos[i] = iter->min_pos;
6227
0
        iter->tid[i] = iter->min_tid;
6228
0
    }
6229
0
    return iter;
6230
0
}
6231
6232
int bam_mplp_init_overlaps(bam_mplp_t iter)
6233
0
{
6234
0
    int i, r = 0;
6235
0
    for (i = 0; i < iter->n; ++i)
6236
0
        r |= bam_plp_init_overlaps(iter->iter[i]);
6237
0
    return r == 0 ? 0 : -1;
6238
0
}
6239
6240
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
6241
0
{
6242
0
    int i;
6243
0
    for (i = 0; i < iter->n; ++i)
6244
0
        iter->iter[i]->maxcnt = maxcnt;
6245
0
}
6246
6247
void bam_mplp_destroy(bam_mplp_t iter)
6248
0
{
6249
0
    int i;
6250
0
    for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
6251
0
    free(iter->iter); free(iter->pos); free(iter->tid);
6252
0
    free(iter->n_plp); free(iter->plp);
6253
0
    free(iter);
6254
0
}
6255
6256
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
6257
0
{
6258
0
    int i, ret = 0;
6259
0
    hts_pos_t new_min_pos = HTS_POS_MAX;
6260
0
    uint32_t new_min_tid = (uint32_t)-1;
6261
0
    for (i = 0; i < iter->n; ++i) {
6262
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6263
0
            int tid;
6264
0
            hts_pos_t pos;
6265
0
            iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
6266
0
            if ( iter->iter[i]->error ) return -1;
6267
0
            if (iter->plp[i]) {
6268
0
                iter->tid[i] = tid;
6269
0
                iter->pos[i] = pos;
6270
0
            } else {
6271
0
                iter->tid[i] = 0;
6272
0
                iter->pos[i] = 0;
6273
0
            }
6274
0
        }
6275
0
        if (iter->plp[i]) {
6276
0
            if (iter->tid[i] < new_min_tid) {
6277
0
                new_min_tid = iter->tid[i];
6278
0
                new_min_pos = iter->pos[i];
6279
0
            } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
6280
0
                new_min_pos = iter->pos[i];
6281
0
            }
6282
0
        }
6283
0
    }
6284
0
    iter->min_pos = new_min_pos;
6285
0
    iter->min_tid = new_min_tid;
6286
0
    if (new_min_pos == HTS_POS_MAX) return 0;
6287
0
    *_tid = new_min_tid; *_pos = new_min_pos;
6288
0
    for (i = 0; i < iter->n; ++i) {
6289
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6290
0
            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
6291
0
            ++ret;
6292
0
        } else n_plp[i] = 0, plp[i] = 0;
6293
0
    }
6294
0
    return ret;
6295
0
}
6296
6297
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
6298
0
{
6299
0
    hts_pos_t pos64 = 0;
6300
0
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
6301
0
    if (ret >= 0) {
6302
0
        if (pos64 < INT_MAX) {
6303
0
            *_pos = pos64;
6304
0
        } else {
6305
0
            hts_log_error("Position %"PRId64" too large", pos64);
6306
0
            *_pos = INT_MAX;
6307
0
            return -1;
6308
0
        }
6309
0
    }
6310
0
    return ret;
6311
0
}
6312
6313
void bam_mplp_reset(bam_mplp_t iter)
6314
0
{
6315
0
    int i;
6316
0
    iter->min_pos = HTS_POS_MAX;
6317
0
    iter->min_tid = (uint32_t)-1;
6318
0
    for (i = 0; i < iter->n; ++i) {
6319
0
        bam_plp_reset(iter->iter[i]);
6320
0
        iter->pos[i] = HTS_POS_MAX;
6321
0
        iter->tid[i] = (uint32_t)-1;
6322
0
        iter->n_plp[i] = 0;
6323
0
        iter->plp[i] = NULL;
6324
0
    }
6325
0
}
6326
6327
void bam_mplp_constructor(bam_mplp_t iter,
6328
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6329
0
    int i;
6330
0
    for (i = 0; i < iter->n; ++i)
6331
0
        bam_plp_constructor(iter->iter[i], func);
6332
0
}
6333
6334
void bam_mplp_destructor(bam_mplp_t iter,
6335
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6336
0
    int i;
6337
0
    for (i = 0; i < iter->n; ++i)
6338
0
        bam_plp_destructor(iter->iter[i], func);
6339
0
}
6340
6341
#endif // ~!defined(BAM_NO_PILEUP)