Coverage Report

Created: 2025-07-11 06:53

/src/htslib/sam.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
// Suppress deprecation message for cigar_tab, which we initialise
45
#include "htslib/hts_defs.h"
46
#undef HTS_DEPRECATED
47
#define HTS_DEPRECATED(message)
48
49
#include "htslib/sam.h"
50
#include "htslib/bgzf.h"
51
#include "cram/cram.h"
52
#include "hts_internal.h"
53
#include "sam_internal.h"
54
#include "htslib/hfile.h"
55
#include "htslib/hts_endian.h"
56
#include "htslib/hts_expr.h"
57
#include "header.h"
58
59
#include "htslib/khash.h"
60
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
61
KHASH_SET_INIT_INT(tag)
62
63
#ifndef EFTYPE
64
0
#define EFTYPE ENOEXEC
65
#endif
66
#ifndef EOVERFLOW
67
#define EOVERFLOW ERANGE
68
#endif
69
70
/**********************
71
 *** BAM header I/O ***
72
 **********************/
73
74
HTSLIB_EXPORT
75
const int8_t bam_cigar_table[256] = {
76
    // 0 .. 47
77
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
81
    // 48 .. 63  (including =)
82
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
83
84
    // 64 .. 79  (including MIDNHB)
85
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
86
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
87
88
    // 80 .. 95  (including SPX)
89
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
90
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
91
92
    // 96 .. 127
93
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
96
    // 128 .. 255
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
105
};
106
107
sam_hdr_t *sam_hdr_init(void)
108
44.4k
{
109
44.4k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
110
44.4k
    if (bh == NULL) return NULL;
111
112
44.4k
    bh->cigar_tab = bam_cigar_table;
113
44.4k
    return bh;
114
44.4k
}
115
116
void sam_hdr_destroy(sam_hdr_t *bh)
117
122k
{
118
122k
    int32_t i;
119
120
122k
    if (bh == NULL) return;
121
122
56.4k
    if (bh->ref_count > 0) {
123
12.0k
        --bh->ref_count;
124
12.0k
        return;
125
12.0k
    }
126
127
44.4k
    if (bh->target_name) {
128
43.3k
        for (i = 0; i < bh->n_targets; ++i)
129
30.6k
            free(bh->target_name[i]);
130
12.7k
        free(bh->target_name);
131
12.7k
        free(bh->target_len);
132
12.7k
    }
133
44.4k
    free(bh->text);
134
44.4k
    if (bh->hrecs)
135
39.3k
        sam_hrecs_free(bh->hrecs);
136
44.4k
    if (bh->sdict)
137
44.4k
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
138
44.4k
    free(bh);
139
44.4k
}
140
141
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
142
// references before sam_hdr_t::hrecs is populated
143
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
144
72
{
145
72
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
146
72
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
147
72
    int i;
148
72
    if (!dest_long_refs) return -1;
149
150
1.48k
    for (i = 0; i < h->n_targets; i++) {
151
1.41k
        int ret;
152
1.41k
        khiter_t ksrc, kdest;
153
1.41k
        if (h->target_len[i] < UINT32_MAX) continue;
154
502
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
155
502
        if (ksrc == kh_end(src_long_refs)) continue;
156
502
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
157
502
        if (ret < 0) {
158
0
            kh_destroy(s2i, dest_long_refs);
159
0
            return -1;
160
0
        }
161
502
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
162
502
    }
163
164
72
    h->sdict = dest_long_refs;
165
72
    return 0;
166
72
}
167
168
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
169
14.4k
{
170
14.4k
    if (h0 == NULL) return NULL;
171
14.4k
    sam_hdr_t *h;
172
14.4k
    if ((h = sam_hdr_init()) == NULL) return NULL;
173
    // copy the simple data
174
14.4k
    h->n_targets = 0;
175
14.4k
    h->ignore_sam_err = h0->ignore_sam_err;
176
14.4k
    h->l_text = 0;
177
178
    // Then the pointery stuff
179
180
14.4k
    if (!h0->hrecs) {
181
938
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
182
938
        if (!h->target_len) goto fail;
183
938
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
184
938
        if (!h->target_name) goto fail;
185
186
938
        int i;
187
2.82k
        for (i = 0; i < h0->n_targets; ++i) {
188
1.89k
            h->target_len[i] = h0->target_len[i];
189
1.89k
            h->target_name[i] = strdup(h0->target_name[i]);
190
1.89k
            if (!h->target_name[i]) break;
191
1.89k
        }
192
938
        h->n_targets = i;
193
938
        if (i < h0->n_targets) goto fail;
194
195
938
        if (h0->sdict) {
196
72
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
197
72
        }
198
938
    }
199
200
14.4k
    if (h0->hrecs) {
201
13.4k
        kstring_t tmp = { 0, 0, NULL };
202
13.4k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
203
0
            free(ks_release(&tmp));
204
0
            goto fail;
205
0
        }
206
207
13.4k
        h->l_text = tmp.l;
208
13.4k
        h->text   = ks_release(&tmp);
209
210
13.4k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
211
0
            goto fail;
212
13.4k
    } else {
213
938
        h->l_text = h0->text ? h0->l_text : 0;
214
938
        h->text = malloc(h->l_text + 1);
215
938
        if (!h->text) goto fail;
216
938
        if (h0->text)
217
938
            memcpy(h->text, h0->text, h->l_text);
218
938
        h->text[h->l_text] = '\0';
219
938
    }
220
221
14.4k
    return h;
222
223
0
 fail:
224
0
    sam_hdr_destroy(h);
225
0
    return NULL;
226
14.4k
}
227
228
sam_hdr_t *bam_hdr_read(BGZF *fp)
229
2.87k
{
230
2.87k
    sam_hdr_t *h;
231
2.87k
    uint8_t buf[4];
232
2.87k
    int magic_len, has_EOF;
233
2.87k
    int32_t i, name_len, num_names = 0;
234
2.87k
    size_t bufsize;
235
2.87k
    ssize_t bytes;
236
    // check EOF
237
2.87k
    has_EOF = bgzf_check_EOF(fp);
238
2.87k
    if (has_EOF < 0) {
239
0
        perror("[W::bam_hdr_read] bgzf_check_EOF");
240
2.87k
    } else if (has_EOF == 0) {
241
2.87k
        hts_log_warning("EOF marker is absent. The input is probably truncated");
242
2.87k
    }
243
    // read "BAM1"
244
2.87k
    magic_len = bgzf_read(fp, buf, 4);
245
2.87k
    if (magic_len != 4 || memcmp(buf, "BAM\1", 4)) {
246
0
        hts_log_error("Invalid BAM binary header");
247
0
        return 0;
248
0
    }
249
2.87k
    h = sam_hdr_init();
250
2.87k
    if (!h) goto nomem;
251
252
    // read plain text and the number of reference sequences
253
2.87k
    bytes = bgzf_read(fp, buf, 4);
254
2.87k
    if (bytes != 4) goto read_err;
255
2.86k
    h->l_text = le_to_u32(buf);
256
257
2.86k
    bufsize = h->l_text + 1;
258
2.86k
    if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed
259
2.86k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
260
2.86k
    if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem;
261
2.85k
#endif
262
2.85k
    h->text = (char*)malloc(bufsize);
263
2.85k
    if (!h->text) goto nomem;
264
2.85k
    h->text[h->l_text] = 0; // make sure it is NULL terminated
265
2.85k
    bytes = bgzf_read(fp, h->text, h->l_text);
266
2.85k
    if (bytes != h->l_text) goto read_err;
267
268
2.64k
    bytes = bgzf_read(fp, &h->n_targets, 4);
269
2.64k
    if (bytes != 4) goto read_err;
270
2.63k
    if (fp->is_be) ed_swap_4p(&h->n_targets);
271
272
2.63k
    if (h->n_targets < 0) goto invalid;
273
274
    // read reference sequence names and lengths
275
2.56k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
276
2.56k
    if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t)))
277
24
        goto nomem;
278
2.54k
#endif
279
2.54k
    if (h->n_targets > 0) {
280
906
        h->target_name = (char**)calloc(h->n_targets, sizeof(char*));
281
906
        if (!h->target_name) goto nomem;
282
906
        h->target_len = (uint32_t*)calloc(h->n_targets, sizeof(uint32_t));
283
906
        if (!h->target_len) goto nomem;
284
906
    }
285
1.63k
    else {
286
1.63k
        h->target_name = NULL;
287
1.63k
        h->target_len = NULL;
288
1.63k
    }
289
290
3.50k
    for (i = 0; i != h->n_targets; ++i) {
291
1.26k
        bytes = bgzf_read(fp, &name_len, 4);
292
1.26k
        if (bytes != 4) goto read_err;
293
1.18k
        if (fp->is_be) ed_swap_4p(&name_len);
294
1.18k
        if (name_len <= 0) goto invalid;
295
296
1.10k
        h->target_name[i] = (char*)malloc(name_len);
297
1.10k
        if (!h->target_name[i]) goto nomem;
298
1.10k
        num_names++;
299
300
1.10k
        bytes = bgzf_read(fp, h->target_name[i], name_len);
301
1.10k
        if (bytes != name_len) goto read_err;
302
303
993
        if (h->target_name[i][name_len - 1] != '\0') {
304
            /* Fix missing NUL-termination.  Is this being too nice?
305
               We could alternatively bail out with an error. */
306
663
            char *new_name;
307
663
            if (name_len == INT32_MAX) goto invalid;
308
663
            new_name = realloc(h->target_name[i], name_len + 1);
309
663
            if (new_name == NULL) goto nomem;
310
663
            h->target_name[i] = new_name;
311
663
            h->target_name[i][name_len] = '\0';
312
663
        }
313
314
993
        bytes = bgzf_read(fp, &h->target_len[i], 4);
315
993
        if (bytes != 4) goto read_err;
316
960
        if (fp->is_be) ed_swap_4p(&h->target_len[i]);
317
960
    }
318
2.23k
    return h;
319
320
33
 nomem:
321
33
    hts_log_error("Out of memory");
322
33
    goto clean;
323
324
465
 read_err:
325
465
    if (bytes < 0) {
326
9
        hts_log_error("Error reading BGZF stream");
327
456
    } else {
328
456
        hts_log_error("Truncated BAM header");
329
456
    }
330
465
    goto clean;
331
332
147
 invalid:
333
147
    hts_log_error("Invalid BAM binary header");
334
335
645
 clean:
336
645
    if (h != NULL) {
337
645
        h->n_targets = num_names; // ensure we free only allocated target_names
338
645
        sam_hdr_destroy(h);
339
645
    }
340
645
    return NULL;
341
147
}
342
343
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
344
8.98k
{
345
8.98k
    int32_t i, name_len, x;
346
8.98k
    kstring_t hdr_ks = { 0, 0, NULL };
347
8.98k
    char *text;
348
8.98k
    uint32_t l_text;
349
350
8.98k
    if (!h) return -1;
351
352
8.98k
    if (h->hrecs) {
353
8.04k
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1;
354
8.04k
        if (hdr_ks.l > UINT32_MAX) {
355
0
            hts_log_error("Header too long for BAM format");
356
0
            free(hdr_ks.s);
357
0
            return -1;
358
8.04k
        } else if (hdr_ks.l > INT32_MAX) {
359
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
360
0
            hts_log_warning("Output file may not be portable");
361
0
        }
362
8.04k
        text = hdr_ks.s;
363
8.04k
        l_text = hdr_ks.l;
364
8.04k
    } else {
365
938
        if (h->l_text > UINT32_MAX) {
366
0
            hts_log_error("Header too long for BAM format");
367
0
            return -1;
368
938
        } else if (h->l_text > INT32_MAX) {
369
0
            hts_log_warning("Header too long for BAM specification (>2GB)");
370
0
            hts_log_warning("Output file may not be portable");
371
0
        }
372
938
        text = h->text;
373
938
        l_text = h->l_text;
374
938
    }
375
    // write "BAM1"
376
8.98k
    if (bgzf_write(fp, "BAM\1", 4) < 0) { free(hdr_ks.s); return -1; }
377
    // write plain text and the number of reference sequences
378
8.98k
    if (fp->is_be) {
379
0
        x = ed_swap_4(l_text);
380
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
381
0
        if (l_text) {
382
0
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
383
0
        }
384
0
        x = ed_swap_4(h->n_targets);
385
0
        if (bgzf_write(fp, &x, 4) < 0) { free(hdr_ks.s); return -1; }
386
8.98k
    } else {
387
8.98k
        if (bgzf_write(fp, &l_text, 4) < 0) { free(hdr_ks.s); return -1; }
388
8.98k
        if (l_text) {
389
4.34k
            if (bgzf_write(fp, text, l_text) < 0) { free(hdr_ks.s); return -1; }
390
4.34k
        }
391
8.98k
        if (bgzf_write(fp, &h->n_targets, 4) < 0) { free(hdr_ks.s); return -1; }
392
8.98k
    }
393
8.98k
    free(hdr_ks.s);
394
    // write sequence names and lengths
395
16.2k
    for (i = 0; i != h->n_targets; ++i) {
396
7.24k
        char *p = h->target_name[i];
397
7.24k
        name_len = strlen(p) + 1;
398
7.24k
        if (fp->is_be) {
399
0
            x = ed_swap_4(name_len);
400
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
401
7.24k
        } else {
402
7.24k
            if (bgzf_write(fp, &name_len, 4) < 0) return -1;
403
7.24k
        }
404
7.24k
        if (bgzf_write(fp, p, name_len) < 0) return -1;
405
7.24k
        if (fp->is_be) {
406
0
            x = ed_swap_4(h->target_len[i]);
407
0
            if (bgzf_write(fp, &x, 4) < 0) return -1;
408
7.24k
        } else {
409
7.24k
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) return -1;
410
7.24k
        }
411
7.24k
    }
412
8.98k
    if (bgzf_flush(fp) < 0) return -1;
413
8.98k
    return 0;
414
8.98k
}
415
416
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
417
0
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
418
0
    return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
419
0
}
420
421
/*************************
422
 *** BAM alignment I/O ***
423
 *************************/
424
425
bam1_t *bam_init1(void)
426
633k
{
427
633k
    return (bam1_t*)calloc(1, sizeof(bam1_t));
428
633k
}
429
430
int sam_realloc_bam_data(bam1_t *b, size_t desired)
431
653k
{
432
653k
    uint32_t new_m_data;
433
653k
    uint8_t *new_data;
434
653k
    new_m_data = desired;
435
653k
    kroundup32(new_m_data); // next power of 2
436
653k
    new_m_data += 32; // reduces malloc arena migrations?
437
653k
    if (new_m_data < desired) {
438
0
        errno = ENOMEM; // Not strictly true but we can't store the size
439
0
        return -1;
440
0
    }
441
653k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
442
653k
    if (new_m_data > FUZZ_ALLOC_LIMIT) {
443
54
        errno = ENOMEM;
444
54
        return -1;
445
54
    }
446
653k
#endif
447
653k
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
448
653k
        new_data = realloc(b->data, new_m_data);
449
653k
    } else {
450
0
        if ((new_data = malloc(new_m_data)) != NULL) {
451
0
            if (b->l_data > 0)
452
0
                memcpy(new_data, b->data,
453
0
                       b->l_data < b->m_data ? b->l_data : b->m_data);
454
0
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
455
0
        }
456
0
    }
457
653k
    if (!new_data) return -1;
458
653k
    b->data = new_data;
459
653k
    b->m_data = new_m_data;
460
653k
    return 0;
461
653k
}
462
463
void bam_destroy1(bam1_t *b)
464
43.9M
{
465
43.9M
    if (b == 0) return;
466
633k
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) {
467
633k
        free(b->data);
468
633k
        if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0) {
469
            // In case of reuse
470
0
            b->data = NULL;
471
0
            b->m_data = 0;
472
0
            b->l_data = 0;
473
0
        }
474
633k
    }
475
476
633k
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) == 0)
477
633k
        free(b);
478
633k
}
479
480
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
481
1.51M
{
482
1.51M
    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
483
1.51M
    memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
484
1.51M
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
485
1.51M
    bdst->l_data = bsrc->l_data;
486
1.51M
    bdst->id = bsrc->id;
487
1.51M
    return bdst;
488
1.51M
}
489
490
bam1_t *bam_dup1(const bam1_t *bsrc)
491
607k
{
492
607k
    if (bsrc == NULL) return NULL;
493
607k
    bam1_t *bdst = bam_init1();
494
607k
    if (bdst == NULL) return NULL;
495
607k
    if (bam_copy1(bdst, bsrc) == NULL) {
496
0
        bam_destroy1(bdst);
497
0
        return NULL;
498
0
    }
499
607k
    return bdst;
500
607k
}
501
502
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
503
                             hts_pos_t *rlen, hts_pos_t *qlen)
504
1.61k
{
505
1.61k
    int k;
506
1.61k
    *rlen = *qlen = 0;
507
15.5k
    for (k = 0; k < n_cigar; ++k) {
508
13.9k
        int type = bam_cigar_type(bam_cigar_op(cigar[k]));
509
13.9k
        int len = bam_cigar_oplen(cigar[k]);
510
13.9k
        if (type & 1) *qlen += len;
511
13.9k
        if (type & 2) *rlen += len;
512
13.9k
    }
513
1.61k
}
514
515
static int subtract_check_underflow(size_t length, size_t *limit)
516
21.6M
{
517
21.6M
    if (length <= *limit) {
518
21.6M
        *limit -= length;
519
21.6M
        return 0;
520
21.6M
    }
521
522
0
    return -1;
523
21.6M
}
524
525
int bam_set1(bam1_t *bam,
526
             size_t l_qname, const char *qname,
527
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
528
             size_t n_cigar, const uint32_t *cigar,
529
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
530
             size_t l_seq, const char *seq, const char *qual,
531
             size_t l_aux)
532
4.32M
{
533
    // use a default qname "*" if none is provided
534
4.32M
    if (l_qname == 0) {
535
1.05M
        l_qname = 1;
536
1.05M
        qname = "*";
537
1.05M
    }
538
539
    // note: the qname is stored nul terminated and padded as described in the
540
    // documentation for the bam1_t struct.
541
4.32M
    size_t qname_nuls = 4 - l_qname % 4;
542
543
    // the aligment length, needed for bam_reg2bin(), is calculated as in bam_endpos().
544
    // can't use bam_endpos() directly as some fields not yet set up.
545
4.32M
    hts_pos_t rlen = 0, qlen = 0;
546
4.32M
    if (!(flag & BAM_FUNMAP)) {
547
0
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
548
0
    }
549
4.32M
    if (rlen == 0) {
550
4.32M
        rlen = 1;
551
4.32M
    }
552
553
    // validate parameters
554
4.32M
    if (l_qname > 254) {
555
78
        hts_log_error("Query name too long");
556
78
        errno = EINVAL;
557
78
        return -1;
558
78
    }
559
4.32M
    if (HTS_POS_MAX - rlen <= pos) {
560
0
        hts_log_error("Read ends beyond highest supported position");
561
0
        errno = EINVAL;
562
0
        return -1;
563
0
    }
564
4.32M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
565
0
        hts_log_error("Mapped query must have a CIGAR");
566
0
        errno = EINVAL;
567
0
        return -1;
568
0
    }
569
4.32M
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
570
0
        hts_log_error("CIGAR and query sequence are of different length");
571
0
        errno = EINVAL;
572
0
        return -1;
573
0
    }
574
575
4.32M
    size_t limit = INT32_MAX;
576
4.32M
    int u = subtract_check_underflow(l_qname + qname_nuls, &limit);
577
4.32M
    u    += subtract_check_underflow(n_cigar * 4, &limit);
578
4.32M
    u    += subtract_check_underflow((l_seq + 1) / 2, &limit);
579
4.32M
    u    += subtract_check_underflow(l_seq, &limit);
580
4.32M
    u    += subtract_check_underflow(l_aux, &limit);
581
4.32M
    if (u != 0) {
582
0
        hts_log_error("Size overflow");
583
0
        errno = EINVAL;
584
0
        return -1;
585
0
    }
586
587
    // re-allocate the data buffer as needed.
588
4.32M
    size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq;
589
4.32M
    if (realloc_bam_data(bam, data_len + l_aux) < 0) {
590
0
        return -1;
591
0
    }
592
593
4.32M
    bam->l_data = (int)data_len;
594
4.32M
    bam->core.pos = pos;
595
4.32M
    bam->core.tid = tid;
596
4.32M
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
597
4.32M
    bam->core.qual = mapq;
598
4.32M
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
599
4.32M
    bam->core.flag = flag;
600
4.32M
    bam->core.l_qname = (uint16_t)(l_qname + qname_nuls);
601
4.32M
    bam->core.n_cigar = (uint32_t)n_cigar;
602
4.32M
    bam->core.l_qseq = (int32_t)l_seq;
603
4.32M
    bam->core.mtid = mtid;
604
4.32M
    bam->core.mpos = mpos;
605
4.32M
    bam->core.isize = isize;
606
607
4.32M
    uint8_t *cp = bam->data;
608
4.32M
    strncpy((char *)cp, qname, l_qname);
609
4.32M
    int i;
610
16.6M
    for (i = 0; i < qname_nuls; i++) {
611
12.3M
        cp[l_qname + i] = '\0';
612
12.3M
    }
613
4.32M
    cp += l_qname + qname_nuls;
614
615
4.32M
    if (n_cigar > 0) {
616
0
        memcpy(cp, cigar, n_cigar * 4);
617
0
    }
618
4.32M
    cp += n_cigar * 4;
619
620
363M
#define NN 16
621
4.32M
    const uint8_t *useq = (uint8_t *)seq;
622
34.2M
    for (i = 0; i + NN < l_seq; i += NN) {
623
29.9M
        int j;
624
29.9M
        const uint8_t *u2 = useq+i;
625
269M
        for (j = 0; j < NN/2; j++)
626
239M
            cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]];
627
29.9M
        cp += NN/2;
628
29.9M
    }
629
6.20M
    for (; i + 1 < l_seq; i += 2) {
630
1.88M
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
631
1.88M
    }
632
633
4.61M
    for (; i < l_seq; i++) {
634
297k
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
635
297k
    }
636
637
4.32M
    if (qual) {
638
723
        memcpy(cp, qual, l_seq);
639
723
    }
640
4.32M
    else {
641
4.32M
        memset(cp, '\xff', l_seq);
642
4.32M
    }
643
644
4.32M
    return (int)data_len;
645
4.32M
}
646
647
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
648
1.54M
{
649
1.54M
    int k;
650
1.54M
    hts_pos_t l;
651
3.73M
    for (k = l = 0; k < n_cigar; ++k)
652
2.18M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&1)
653
1.94M
            l += bam_cigar_oplen(cigar[k]);
654
1.54M
    return l;
655
1.54M
}
656
657
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
658
143k
{
659
143k
    int k;
660
143k
    hts_pos_t l;
661
7.34M
    for (k = l = 0; k < n_cigar; ++k)
662
7.20M
        if (bam_cigar_type(bam_cigar_op(cigar[k]))&2)
663
6.63M
            l += bam_cigar_oplen(cigar[k]);
664
143k
    return l;
665
143k
}
666
667
hts_pos_t bam_endpos(const bam1_t *b)
668
1.40k
{
669
1.40k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
670
1.40k
    if (rlen == 0) rlen = 1;
671
1.40k
    return b->core.pos + rlen;
672
1.40k
}
673
674
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
675
233k
{
676
233k
    bam1_core_t *c = &b->core;
677
678
    // Bail out as fast as possible for the easy case
679
233k
    uint32_t test_CG = BAM_CSOFT_CLIP | (c->l_qseq << BAM_CIGAR_SHIFT);
680
233k
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
681
153k
        return 0;
682
683
    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
684
    // but this is much less likely so do as a secondary check.
685
80.1k
    if (c->tid < 0 || c->pos < 0)
686
42.1k
        return 0;
687
688
    // Do we have a CG tag?
689
37.9k
    uint8_t *CG = bam_aux_get(b, "CG");
690
37.9k
    int saved_errno = errno;
691
37.9k
    if (!CG) {
692
36.0k
        if (errno != ENOENT) return -1;  // Bad aux data
693
36.0k
        errno = saved_errno; // restore errno on expected no-CG-tag case
694
36.0k
        return 0;
695
36.0k
    }
696
697
    // Now we start with the serious work migrating CG to CIGAR
698
1.89k
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
699
1.89k
        *cigar0, CG_len, fake_bytes;
700
1.89k
    cigar0 = bam_get_cigar(b);
701
1.89k
    fake_bytes = c->n_cigar * 4;
702
1.89k
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
703
482
        return 0; // not of type B,I
704
1.41k
    CG_len = le_to_u32(CG + 2);
705
    // don't move if the real CIGAR length is shorter than the fake cigar length
706
1.41k
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;
707
708
    // move from the CG tag to the right position
709
1.40k
    cigar_st = (uint8_t*)cigar0 - b->data;
710
1.40k
    c->n_cigar = CG_len;
711
1.40k
    n_cigar4 = c->n_cigar * 4;
712
1.40k
    CG_st = CG - b->data - 2;
713
1.40k
    CG_en = CG_st + 8 + n_cigar4;
714
1.40k
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
715
    // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
716
1.40k
    b->l_data = b->l_data - fake_bytes + n_cigar4;
717
    // insert c->n_cigar-fake_bytes empty space to make room
718
1.40k
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
719
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
720
1.40k
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
721
1.40k
    if (ori_len > CG_en) // move data after the CG tag
722
157
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
723
1.40k
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
724
1.40k
    if (recal_bin)
725
1.40k
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
726
1.40k
    if (give_warning)
727
1.40k
        hts_log_warning("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
728
1.40k
    return 1;
729
1.40k
}
730
731
static inline int aux_type2size(uint8_t type)
732
1.88M
{
733
1.88M
    switch (type) {
734
1.05M
    case 'A': case 'c': case 'C':
735
1.05M
        return 1;
736
137k
    case 's': case 'S':
737
137k
        return 2;
738
385k
    case 'i': case 'I': case 'f':
739
385k
        return 4;
740
12.1k
    case 'd':
741
12.1k
        return 8;
742
292k
    case 'Z': case 'H': case 'B':
743
292k
        return type;
744
564
    default:
745
564
        return 0;
746
1.88M
    }
747
1.88M
}
748
749
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
750
0
{
751
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
752
0
    uint32_t i;
753
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
754
0
}
755
756
// Fix bad records where qname is not terminated correctly.
757
1.51k
static int fixup_missing_qname_nul(bam1_t *b) {
758
1.51k
    bam1_core_t *c = &b->core;
759
760
    // Note this is called before c->l_extranul is added to c->l_qname
761
1.51k
    if (c->l_extranul > 0) {
762
737
        b->data[c->l_qname++] = '\0';
763
737
        c->l_extranul--;
764
778
    } else {
765
778
        if (b->l_data > INT_MAX - 4) return -1;
766
778
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
767
778
        b->l_data += 4;
768
778
        b->data[c->l_qname++] = '\0';
769
778
        c->l_extranul = 3;
770
778
    }
771
1.51k
    return 0;
772
1.51k
}
773
774
/*
775
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
776
 * in multi-threaded handling.  This may be worth considering for htslib2.
777
 */
778
int bam_read1(BGZF *fp, bam1_t *b)
779
3.47k
{
780
3.47k
    bam1_core_t *c = &b->core;
781
3.47k
    int32_t block_len, ret, i;
782
3.47k
    uint32_t new_l_data;
783
3.47k
    uint8_t tmp[32], *x;
784
785
3.47k
    b->l_data = 0;
786
787
3.47k
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
788
262
        if (ret == 0) return -1; // normal end-of-file
789
132
        else return -2; // truncated
790
262
    }
791
3.21k
    if (fp->is_be)
792
0
        ed_swap_4p(&block_len);
793
3.21k
    if (block_len < 32) return -4;  // block_len includes core data
794
2.83k
    if (fp->block_length - fp->block_offset > 32) {
795
        // Avoid bgzf_read and a temporary copy to a local buffer
796
2.48k
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
797
2.48k
        fp->block_offset += 32;
798
2.48k
    } else {
799
343
        x = tmp;
800
343
        if (bgzf_read(fp, x, 32) != 32) return -3;
801
343
    }
802
803
2.55k
    c->tid        = le_to_u32(x);
804
2.55k
    c->pos        = le_to_i32(x+4);
805
2.55k
    uint32_t x2   = le_to_u32(x+8);
806
2.55k
    c->bin        = x2>>16;
807
2.55k
    c->qual       = x2>>8&0xff;
808
2.55k
    c->l_qname    = x2&0xff;
809
2.55k
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
810
2.55k
    uint32_t x3   = le_to_u32(x+12);
811
2.55k
    c->flag       = x3>>16;
812
2.55k
    c->n_cigar    = x3&0xffff;
813
2.55k
    c->l_qseq     = le_to_u32(x+16);
814
2.55k
    c->mtid       = le_to_u32(x+20);
815
2.55k
    c->mpos       = le_to_i32(x+24);
816
2.55k
    c->isize      = le_to_i32(x+28);
817
818
2.55k
    new_l_data = block_len - 32 + c->l_extranul;
819
2.55k
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
820
2.43k
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
821
2.43k
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
822
144
        return -4;
823
2.29k
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
824
2.24k
    b->l_data = new_l_data;
825
826
2.24k
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
827
2.18k
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
828
1.51k
        if (fixup_missing_qname_nul(b) < 0) return -4;
829
1.51k
    }
830
5.90k
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
831
2.18k
    c->l_qname += c->l_extranul;
832
2.18k
    if (b->l_data < c->l_qname ||
833
2.18k
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
834
189
        return -4;
835
1.99k
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
836
1.99k
    if (bam_tag2cigar(b, 0, 0) < 0)
837
23
        return -4;
838
839
    // TODO: consider making this conditional
840
1.96k
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
841
1.61k
        hts_pos_t rlen, qlen;
842
1.61k
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
843
1.61k
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
844
1.61k
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
845
        // Sanity check for broken CIGAR alignments
846
1.61k
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
847
51
            hts_log_error("CIGAR and query sequence lengths differ for %s",
848
51
                    bam_get_qname(b));
849
51
            return -4;
850
51
        }
851
1.61k
    }
852
853
1.91k
    return 4 + block_len;
854
1.96k
}
855
856
int bam_write1(BGZF *fp, const bam1_t *b)
857
1.51M
{
858
1.51M
    const bam1_core_t *c = &b->core;
859
1.51M
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
860
1.51M
    int i, ok;
861
1.51M
    if (c->l_qname - c->l_extranul > 255) {
862
5
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
863
5
        errno = EOVERFLOW;
864
5
        return -1;
865
5
    }
866
1.51M
    if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
867
1.51M
    if (c->pos > INT_MAX ||
868
1.51M
        c->mpos > INT_MAX ||
869
1.51M
        c->isize < INT_MIN || c->isize > INT_MAX) {
870
409
        hts_log_error("Positional data is too large for BAM format");
871
409
        return -1;
872
409
    }
873
1.51M
    x[0] = c->tid;
874
1.51M
    x[1] = c->pos;
875
1.51M
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
876
1.51M
    if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 2;
877
1.51M
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
878
1.51M
    x[4] = c->l_qseq;
879
1.51M
    x[5] = c->mtid;
880
1.51M
    x[6] = c->mpos;
881
1.51M
    x[7] = c->isize;
882
1.51M
    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
883
1.51M
    if (fp->is_be) {
884
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
885
0
        y = block_len;
886
0
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
887
0
        swap_data(c, b->l_data, b->data, 1);
888
1.51M
    } else {
889
1.51M
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
890
1.51M
    }
891
1.51M
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
892
1.51M
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);
893
1.51M
    if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally
894
1.51M
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
895
1.51M
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
896
17
        uint8_t buf[8];
897
17
        uint32_t cigar_st, cigar_en, cigar[2];
898
17
        hts_pos_t cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
899
17
        if (cigreflen >= (1<<28)) {
900
            // Length of reference covered is greater than the biggest
901
            // CIGAR operation currently allowed.
902
1
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
903
1
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
904
1
                          bam_get_qname(b), c->n_cigar, cigreflen);
905
1
            return -1;
906
1
        }
907
16
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
908
16
        cigar_en = cigar_st + c->n_cigar * 4;
909
16
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
910
16
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
911
16
        u32_to_le(cigar[0], buf);
912
16
        u32_to_le(cigar[1], buf + 4);
913
16
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
914
16
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
915
16
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
916
16
        u32_to_le(c->n_cigar, buf);
917
16
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
918
16
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
919
16
    }
920
1.51M
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
921
1.51M
    return ok? 4 + block_len : -1;
922
1.51M
}
923
924
/*
925
 * Write a BAM file and append to the in-memory index simultaneously.
926
 */
927
1.51M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
928
1.51M
    BGZF *bfp = fp->fp.bgzf;
929
930
1.51M
    if (!fp->idx)
931
1.51M
        return bam_write1(bfp, b);
932
933
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
934
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
935
0
        return -1;
936
0
    if (!bfp->mt)
937
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
938
939
0
    int ret = bam_write1(bfp, b);
940
0
    if (ret < 0)
941
0
        return -1;
942
943
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
944
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
945
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
946
0
        ret = -1;
947
0
    }
948
949
0
    return ret;
950
0
}
951
952
/*
953
 * Set the qname in a BAM record
954
 */
955
int bam_set_qname(bam1_t *rec, const char *qname)
956
0
{
957
0
    if (!rec) return -1;
958
0
    if (!qname || !*qname) return -1;
959
960
0
    size_t old_len = rec->core.l_qname;
961
0
    size_t new_len = strlen(qname) + 1;
962
0
    if (new_len < 1 || new_len > 255) return -1;
963
964
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
965
966
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
967
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
968
969
    // Make room
970
0
    if (new_len + extranul != rec->core.l_qname)
971
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
972
    // Copy in new name and pad if needed
973
0
    memcpy(rec->data, qname, new_len);
974
0
    int n;
975
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
976
977
0
    rec->l_data = new_data_len;
978
0
    rec->core.l_qname = new_len + extranul;
979
0
    rec->core.l_extranul = extranul;
980
981
0
    return 0;
982
0
}
983
984
/********************
985
 *** BAM indexing ***
986
 ********************/
987
988
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
989
0
{
990
0
    int n_lvls, i, fmt, ret;
991
0
    bam1_t *b;
992
0
    hts_idx_t *idx;
993
0
    sam_hdr_t *h;
994
0
    h = sam_hdr_read(fp);
995
0
    if (h == NULL) return NULL;
996
0
    if (min_shift > 0) {
997
0
        hts_pos_t max_len = 0, s;
998
0
        for (i = 0; i < h->n_targets; ++i) {
999
0
            hts_pos_t len = sam_hdr_tid2len(h, i);
1000
0
            if (max_len < len) max_len = len;
1001
0
        }
1002
0
        max_len += 256;
1003
0
        for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1004
0
        fmt = HTS_FMT_CSI;
1005
0
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1006
0
    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1007
0
    b = bam_init1();
1008
0
    while ((ret = sam_read1(fp, h, b)) >= 0) {
1009
0
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
1010
0
        if (ret < 0) { // unsorted or doesn't fit
1011
0
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
1012
0
            goto err;
1013
0
        }
1014
0
    }
1015
0
    if (ret < -1) goto err; // corrupted BAM file
1016
1017
0
    hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf));
1018
0
    sam_hdr_destroy(h);
1019
0
    bam_destroy1(b);
1020
0
    return idx;
1021
1022
0
err:
1023
0
    bam_destroy1(b);
1024
0
    hts_idx_destroy(idx);
1025
0
    return NULL;
1026
0
}
1027
1028
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1029
0
{
1030
0
    hts_idx_t *idx;
1031
0
    htsFile *fp;
1032
0
    int ret = 0;
1033
1034
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1035
0
    if (nthreads)
1036
0
        hts_set_threads(fp, nthreads);
1037
1038
0
    switch (fp->format.format) {
1039
0
    case cram:
1040
1041
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1042
0
        break;
1043
1044
0
    case bam:
1045
0
    case sam:
1046
0
        if (fp->format.compression != bgzf) {
1047
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1048
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1049
0
            ret = -1;
1050
0
            break;
1051
0
        }
1052
0
        idx = sam_index(fp, min_shift);
1053
0
        if (idx) {
1054
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1055
0
            if (ret < 0) ret = -4;
1056
0
            hts_idx_destroy(idx);
1057
0
        }
1058
0
        else ret = -1;
1059
0
        break;
1060
1061
0
    default:
1062
0
        ret = -3;
1063
0
        break;
1064
0
    }
1065
0
    hts_close(fp);
1066
1067
0
    return ret;
1068
0
}
1069
1070
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
1071
0
{
1072
0
    return sam_index_build3(fn, fnidx, min_shift, 0);
1073
0
}
1074
1075
int sam_index_build(const char *fn, int min_shift)
1076
0
{
1077
0
    return sam_index_build3(fn, NULL, min_shift, 0);
1078
0
}
1079
1080
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1081
#undef bam_index_build
1082
int bam_index_build(const char *fn, int min_shift)
1083
0
{
1084
0
    return sam_index_build2(fn, NULL, min_shift);
1085
0
}
1086
1087
// Initialise fp->idx for the current format type.
1088
// This must be called after the header has been written but no other data.
1089
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1090
0
    fp->fnidx = fnidx;
1091
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1092
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1093
0
        int n_lvls, fmt = HTS_FMT_CSI;
1094
0
        if (min_shift > 0) {
1095
0
            int64_t max_len = 0, s;
1096
0
            int i;
1097
0
            for (i = 0; i < h->n_targets; ++i)
1098
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1099
0
            max_len += 256;
1100
0
            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1101
1102
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1103
1104
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1105
0
        return fp->idx ? 0 : -1;
1106
0
    }
1107
1108
0
    if (fp->format.format == cram) {
1109
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1110
0
        return fp->fp.cram->idxfp ? 0 : -1;
1111
0
    }
1112
1113
0
    return -1;
1114
0
}
1115
1116
// Finishes an index. Call after the last record has been written.
1117
// Returns 0 on success, <0 on failure.
1118
0
int sam_idx_save(htsFile *fp) {
1119
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1120
0
        fp->format.format == vcf || fp->format.format == sam) {
1121
0
        int ret;
1122
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1123
0
            errno = -ret;
1124
0
            return -1;
1125
0
        }
1126
0
        if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0)
1127
0
            return -1;
1128
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1129
1130
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1131
0
            return -1;
1132
1133
0
        return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
1134
1135
0
    } else if (fp->format.format == cram) {
1136
        // flushed and closed by cram_close
1137
0
    }
1138
1139
0
    return 0;
1140
0
}
1141
1142
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1143
0
{
1144
0
    htsFile *fp = (htsFile *)fpv;
1145
0
    bam1_t *b = bv;
1146
0
    fp->line.l = 0;
1147
0
    int ret = sam_read1(fp, fp->bam_header, b);
1148
0
    if (ret >= 0) {
1149
0
        *tid = b->core.tid;
1150
0
        *beg = b->core.pos;
1151
0
        *end = bam_endpos(b);
1152
0
    }
1153
0
    return ret;
1154
0
}
1155
1156
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1157
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1158
0
{
1159
0
    htsFile *fp = (htsFile *)fpv;
1160
0
    bam1_t *b = bv;
1161
0
    fp->line.l = 0;
1162
0
    int ret = sam_read1(fp, fp->bam_header, b);
1163
0
    return ret;
1164
0
}
1165
1166
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1167
// samtools/bam.c.
1168
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1169
0
{
1170
0
    const char *rg;
1171
0
    kstring_t lib = { 0, 0, NULL };
1172
0
    rg = (char *)bam_aux_get(b, "RG");
1173
1174
0
    if (!rg)
1175
0
        return NULL;
1176
0
    else
1177
0
        rg++;
1178
1179
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1180
0
        return NULL;
1181
1182
0
    static char LB_text[1024];
1183
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1184
1185
0
    memcpy(LB_text, lib.s, len);
1186
0
    LB_text[len] = 0;
1187
1188
0
    free(lib.s);
1189
1190
0
    return LB_text;
1191
0
}
1192
1193
1194
// Bam record pointer and SAM header combined
1195
typedef struct {
1196
    const sam_hdr_t *h;
1197
    const bam1_t *b;
1198
} hb_pair;
1199
1200
// Looks up variable names in str and replaces them with their value.
1201
// Also supports aux tags.
1202
//
1203
// Note the expression parser deliberately overallocates str size so it
1204
// is safe to use memcmp over strcmp.
1205
static int bam_sym_lookup(void *data, char *str, char **end,
1206
0
                          hts_expr_val_t *res) {
1207
0
    hb_pair *hb = (hb_pair *)data;
1208
0
    const bam1_t *b = hb->b;
1209
1210
0
    res->is_str = 0;
1211
0
    switch(*str) {
1212
0
    case 'c':
1213
0
        if (memcmp(str, "cigar", 5) == 0) {
1214
0
            *end = str+5;
1215
0
            res->is_str = 1;
1216
0
            ks_clear(&res->s);
1217
0
            uint32_t *cigar = bam_get_cigar(b);
1218
0
            int i, n = b->core.n_cigar, r = 0;
1219
0
            if (n) {
1220
0
                for (i = 0; i < n; i++) {
1221
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1222
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1223
0
                }
1224
0
                r |= kputs("", &res->s) < 0;
1225
0
            } else {
1226
0
                r |= kputs("*", &res->s) < 0;
1227
0
            }
1228
0
            return r ? -1 : 0;
1229
0
        }
1230
0
        break;
1231
1232
0
    case 'e':
1233
0
        if (memcmp(str, "endpos", 6) == 0) {
1234
0
            *end = str+6;
1235
0
            res->d = bam_endpos(b);
1236
0
            return 0;
1237
0
        }
1238
0
        break;
1239
1240
0
    case 'f':
1241
0
        if (memcmp(str, "flag", 4) == 0) {
1242
0
            str = *end = str+4;
1243
0
            if (*str != '.') {
1244
0
                res->d = b->core.flag;
1245
0
                return 0;
1246
0
            } else {
1247
0
                str++;
1248
0
                if (!memcmp(str, "paired", 6)) {
1249
0
                    *end = str+6;
1250
0
                    res->d = b->core.flag & BAM_FPAIRED;
1251
0
                    return 0;
1252
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1253
0
                    *end = str+11;
1254
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1255
0
                    return 0;
1256
0
                } else if (!memcmp(str, "unmap", 5)) {
1257
0
                    *end = str+5;
1258
0
                    res->d = b->core.flag & BAM_FUNMAP;
1259
0
                    return 0;
1260
0
                } else if (!memcmp(str, "munmap", 6)) {
1261
0
                    *end = str+6;
1262
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1263
0
                    return 0;
1264
0
                } else if (!memcmp(str, "reverse", 7)) {
1265
0
                    *end = str+7;
1266
0
                    res->d = b->core.flag & BAM_FREVERSE;
1267
0
                    return 0;
1268
0
                } else if (!memcmp(str, "mreverse", 8)) {
1269
0
                    *end = str+8;
1270
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1271
0
                    return 0;
1272
0
                } else if (!memcmp(str, "read1", 5)) {
1273
0
                    *end = str+5;
1274
0
                    res->d = b->core.flag & BAM_FREAD1;
1275
0
                    return 0;
1276
0
                } else if (!memcmp(str, "read2", 5)) {
1277
0
                    *end = str+5;
1278
0
                    res->d = b->core.flag & BAM_FREAD2;
1279
0
                    return 0;
1280
0
                } else if (!memcmp(str, "secondary", 9)) {
1281
0
                    *end = str+9;
1282
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1283
0
                    return 0;
1284
0
                } else if (!memcmp(str, "qcfail", 6)) {
1285
0
                    *end = str+6;
1286
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1287
0
                    return 0;
1288
0
                } else if (!memcmp(str, "dup", 3)) {
1289
0
                    *end = str+3;
1290
0
                    res->d = b->core.flag & BAM_FDUP;
1291
0
                    return 0;
1292
0
                } else if (!memcmp(str, "supplementary", 13)) {
1293
0
                    *end = str+13;
1294
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1295
0
                    return 0;
1296
0
                } else {
1297
0
                    hts_log_error("Unrecognised flag string");
1298
0
                    return -1;
1299
0
                }
1300
0
            }
1301
0
        }
1302
0
        break;
1303
1304
0
    case 'h':
1305
0
        if (memcmp(str, "hclen", 5) == 0) {
1306
0
            int hclen = 0;
1307
0
            uint32_t *cigar = bam_get_cigar(b);
1308
0
            uint32_t ncigar = b->core.n_cigar;
1309
1310
            // left
1311
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1312
0
                hclen = bam_cigar_oplen(cigar[0]);
1313
1314
            // right
1315
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1316
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1317
1318
0
            *end = str+5;
1319
0
            res->d = hclen;
1320
0
            return 0;
1321
0
        }
1322
0
        break;
1323
1324
0
    case 'l':
1325
0
        if (memcmp(str, "library", 7) == 0) {
1326
0
            *end = str+7;
1327
0
            res->is_str = 1;
1328
0
            const char *lib = bam_get_library(hb->h, b);
1329
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1330
0
            return 0;
1331
0
        }
1332
0
        break;
1333
1334
0
    case 'm':
1335
0
        if (memcmp(str, "mapq", 4) == 0) {
1336
0
            *end = str+4;
1337
0
            res->d = b->core.qual;
1338
0
            return 0;
1339
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1340
0
            *end = str+4;
1341
0
            res->d = b->core.mpos+1;
1342
0
            return 0;
1343
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1344
0
            *end = str+6;
1345
0
            res->is_str = 1;
1346
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1347
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1348
0
            return 0;
1349
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1350
0
            *end = str+6;
1351
0
            res->d = b->core.mtid;
1352
0
            return 0;
1353
0
        }
1354
0
        break;
1355
1356
0
    case 'n':
1357
0
        if (memcmp(str, "ncigar", 6) == 0) {
1358
0
            *end = str+6;
1359
0
            res->d = b->core.n_cigar;
1360
0
            return 0;
1361
0
        }
1362
0
        break;
1363
1364
0
    case 'p':
1365
0
        if (memcmp(str, "pos", 3) == 0) {
1366
0
            *end = str+3;
1367
0
            res->d = b->core.pos+1;
1368
0
            return 0;
1369
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1370
0
            *end = str+5;
1371
0
            res->d = b->core.mpos+1;
1372
0
            return 0;
1373
0
        }
1374
0
        break;
1375
1376
0
    case 'q':
1377
0
        if (memcmp(str, "qlen", 4) == 0) {
1378
0
            *end = str+4;
1379
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1380
0
            return 0;
1381
0
        } else if (memcmp(str, "qname", 5) == 0) {
1382
0
            *end = str+5;
1383
0
            res->is_str = 1;
1384
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1385
0
            return 0;
1386
0
        } else if (memcmp(str, "qual", 4) == 0) {
1387
0
            *end = str+4;
1388
0
            ks_clear(&res->s);
1389
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1390
0
                return -1;
1391
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1392
0
            res->s.l = b->core.l_qseq;
1393
0
            res->is_str = 1;
1394
0
            return 0;
1395
0
        }
1396
0
        break;
1397
1398
0
    case 'r':
1399
0
        if (memcmp(str, "rlen", 4) == 0) {
1400
0
            *end = str+4;
1401
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1402
0
            return 0;
1403
0
        } else if (memcmp(str, "rname", 5) == 0) {
1404
0
            *end = str+5;
1405
0
            res->is_str = 1;
1406
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1407
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1408
0
            return 0;
1409
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1410
0
            *end = str+5;
1411
0
            res->is_str = 1;
1412
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1413
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1414
0
            return 0;
1415
0
        } else if (memcmp(str, "refid", 5) == 0) {
1416
0
            *end = str+5;
1417
0
            res->d = b->core.tid;
1418
0
            return 0;
1419
0
        }
1420
0
        break;
1421
1422
0
    case 's':
1423
0
        if (memcmp(str, "seq", 3) == 0) {
1424
0
            *end = str+3;
1425
0
            ks_clear(&res->s);
1426
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1427
0
                return -1;
1428
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1429
0
            res->s.s[b->core.l_qseq] = 0;
1430
0
            res->s.l = b->core.l_qseq;
1431
0
            res->is_str = 1;
1432
0
            return 0;
1433
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1434
0
            int sclen = 0;
1435
0
            uint32_t *cigar = bam_get_cigar(b);
1436
0
            int ncigar = b->core.n_cigar;
1437
0
            int left = 0;
1438
1439
            // left
1440
0
            if (ncigar > 0
1441
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1442
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1443
0
            else if (ncigar > 1
1444
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1445
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1446
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1447
1448
            // right
1449
0
            if (ncigar-1 > left
1450
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1451
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1452
0
            else if (ncigar-2 > left
1453
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1454
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1455
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1456
1457
0
            *end = str+5;
1458
0
            res->d = sclen;
1459
0
            return 0;
1460
0
        }
1461
0
        break;
1462
1463
0
    case 't':
1464
0
        if (memcmp(str, "tlen", 4) == 0) {
1465
0
            *end = str+4;
1466
0
            res->d = b->core.isize;
1467
0
            return 0;
1468
0
        }
1469
0
        break;
1470
1471
0
    case '[':
1472
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1473
            /* aux tags */
1474
0
            *end = str+4;
1475
1476
0
            uint8_t *aux = bam_aux_get(b, str+1);
1477
0
            if (aux) {
1478
                // we define the truth of a tag to be its presence, even if 0.
1479
0
                res->is_true = 1;
1480
0
                switch (*aux) {
1481
0
                case 'Z':
1482
0
                case 'H':
1483
0
                    res->is_str = 1;
1484
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1485
0
                    break;
1486
1487
0
                case 'A':
1488
0
                    res->is_str = 1;
1489
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1490
0
                    break;
1491
1492
0
                case 'i': case 'I':
1493
0
                case 's': case 'S':
1494
0
                case 'c': case 'C':
1495
0
                    res->is_str = 0;
1496
0
                    res->d = bam_aux2i(aux);
1497
0
                    break;
1498
1499
0
                case 'f':
1500
0
                case 'd':
1501
0
                    res->is_str = 0;
1502
0
                    res->d = bam_aux2f(aux);
1503
0
                    break;
1504
1505
0
                default:
1506
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1507
0
                                  *aux);
1508
0
                    return -1;
1509
0
                }
1510
0
                return 0;
1511
1512
0
            } else {
1513
                // hence absent tags are always false (and strings)
1514
0
                res->is_str = 1;
1515
0
                res->s.l = 0;
1516
0
                res->d = 0;
1517
0
                res->is_true = 0;
1518
0
                return 0;
1519
0
            }
1520
0
        }
1521
0
        break;
1522
0
    }
1523
1524
    // All successful matches in switch should return 0.
1525
    // So if we didn't match, it's a parse error.
1526
0
    return -1;
1527
0
}
1528
1529
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1530
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1531
0
{
1532
0
    hb_pair hb = {h, b};
1533
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1534
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1535
0
        hts_log_error("Couldn't process filter expression");
1536
0
        hts_expr_val_free(&res);
1537
0
        return -1;
1538
0
    }
1539
1540
0
    int t = res.is_true;
1541
0
    hts_expr_val_free(&res);
1542
1543
0
    return t;
1544
0
}
1545
1546
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1547
0
{
1548
0
    htsFile *fp = fpv;
1549
0
    bam1_t *b = bv;
1550
0
    int pass_filter, ret;
1551
1552
0
    do {
1553
0
        ret = cram_get_bam_seq(fp->fp.cram, &b);
1554
0
        if (ret < 0)
1555
0
            return cram_eof(fp->fp.cram) ? -1 : -2;
1556
1557
0
        if (bam_tag2cigar(b, 1, 1) < 0)
1558
0
            return -2;
1559
1560
0
        *tid = b->core.tid;
1561
0
        *beg = b->core.pos;
1562
0
        *end = bam_endpos(b);
1563
1564
0
        if (fp->filter) {
1565
0
            pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
1566
0
            if (pass_filter < 0)
1567
0
                return -2;
1568
0
        } else {
1569
0
            pass_filter = 1;
1570
0
        }
1571
0
    } while (pass_filter == 0);
1572
1573
0
    return ret;
1574
0
}
1575
1576
static int cram_pseek(void *fp, int64_t offset, int whence)
1577
0
{
1578
0
    cram_fd *fd =  (cram_fd *)fp;
1579
1580
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1581
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1582
0
        return -1;
1583
1584
0
    fd->curr_position = offset;
1585
1586
0
    if (fd->ctr) {
1587
0
        cram_free_container(fd->ctr);
1588
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1589
0
            cram_free_container(fd->ctr_mt);
1590
1591
0
        fd->ctr = NULL;
1592
0
        fd->ctr_mt = NULL;
1593
0
        fd->ooc = 0;
1594
0
    }
1595
1596
0
    return 0;
1597
0
}
1598
1599
/*
1600
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1601
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1602
 *   container previously fetched. It was designed like this to integrate with the functionality
1603
 *   of the iterator stepping logic.
1604
 */
1605
1606
static int64_t cram_ptell(void *fp)
1607
0
{
1608
0
    cram_fd *fd = (cram_fd *)fp;
1609
0
    cram_container *c;
1610
0
    cram_slice *s;
1611
0
    int64_t ret = -1L;
1612
1613
0
    if (fd) {
1614
0
        if ((c = fd->ctr) != NULL) {
1615
0
            if ((s = c->slice) != NULL && s->max_rec) {
1616
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1617
0
                    fd->curr_position += c->offset + c->length;
1618
0
            }
1619
0
        }
1620
0
        ret = fd->curr_position;
1621
0
    }
1622
1623
0
    return ret;
1624
0
}
1625
1626
static int bam_pseek(void *fp, int64_t offset, int whence)
1627
0
{
1628
0
    BGZF *fd = (BGZF *)fp;
1629
1630
0
    return bgzf_seek(fd, offset, whence);
1631
0
}
1632
1633
static int64_t bam_ptell(void *fp)
1634
0
{
1635
0
    BGZF *fd = (BGZF *)fp;
1636
0
    if (!fd)
1637
0
        return -1L;
1638
1639
0
    return bgzf_tell(fd);
1640
0
}
1641
1642
1643
1644
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
1645
0
{
1646
0
    switch (fp->format.format) {
1647
0
    case bam:
1648
0
    case sam:
1649
0
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);
1650
1651
0
    case cram: {
1652
0
        if (cram_index_load(fp->fp.cram, fn, fnidx) < 0) return NULL;
1653
1654
        // Cons up a fake "index" just pointing at the associated cram_fd:
1655
0
        hts_cram_idx_t *idx = malloc(sizeof (hts_cram_idx_t));
1656
0
        if (idx == NULL) return NULL;
1657
0
        idx->fmt = HTS_FMT_CRAI;
1658
0
        idx->cram = fp->fp.cram;
1659
0
        return (hts_idx_t *) idx;
1660
0
        }
1661
1662
0
    default:
1663
0
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t
1664
0
    }
1665
0
}
1666
1667
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
1668
0
{
1669
0
    return index_load(fp, fn, fnidx, flags);
1670
0
}
1671
1672
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
1673
0
    return index_load(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE);
1674
0
}
1675
1676
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
1677
0
{
1678
0
    return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE);
1679
0
}
1680
1681
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
1682
0
{
1683
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1684
0
    hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t));
1685
0
    if (iter == NULL) return NULL;
1686
1687
    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
1688
    // the readrec function:
1689
0
    iter->is_cram = 1;
1690
0
    iter->read_rest = 1;
1691
0
    iter->off = NULL;
1692
0
    iter->bins.a = NULL;
1693
0
    iter->readrec = readrec;
1694
1695
0
    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
1696
0
        cram_range r = { tid, beg+1, end };
1697
0
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);
1698
1699
0
        iter->curr_off = 0;
1700
        // The following fields are not required by hts_itr_next(), but are
1701
        // filled in in case user code wants to look at them.
1702
0
        iter->tid = tid;
1703
0
        iter->beg = beg;
1704
0
        iter->end = end;
1705
1706
0
        switch (ret) {
1707
0
        case 0:
1708
0
            break;
1709
1710
0
        case -2:
1711
            // No data vs this ref, so mark iterator as completed.
1712
            // Same as HTS_IDX_NONE.
1713
0
            iter->finished = 1;
1714
0
            break;
1715
1716
0
        default:
1717
0
            free(iter);
1718
0
            return NULL;
1719
0
        }
1720
0
    }
1721
0
    else switch (tid) {
1722
0
    case HTS_IDX_REST:
1723
0
        iter->curr_off = 0;
1724
0
        break;
1725
0
    case HTS_IDX_NONE:
1726
0
        iter->curr_off = 0;
1727
0
        iter->finished = 1;
1728
0
        break;
1729
0
    default:
1730
0
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
1731
0
        abort();
1732
0
        break;
1733
0
    }
1734
1735
0
    return iter;
1736
0
}
1737
1738
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
1739
0
{
1740
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1741
0
    if (idx == NULL)
1742
0
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);
1743
0
    else if (cidx->fmt == HTS_FMT_CRAI)
1744
0
        return cram_itr_query(idx, tid, beg, end, sam_readrec);
1745
0
    else
1746
0
        return hts_itr_query(idx, tid, beg, end, sam_readrec);
1747
0
}
1748
1749
static int cram_name2id(void *fdv, const char *ref)
1750
0
{
1751
0
    cram_fd *fd = (cram_fd *) fdv;
1752
0
    return sam_hdr_name2tid(fd->header, ref);
1753
0
}
1754
1755
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
1756
0
{
1757
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1758
0
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
1759
0
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
1760
0
                          sam_readrec);
1761
0
}
1762
1763
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
1764
0
{
1765
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1766
0
    hts_reglist_t *r_list = NULL;
1767
0
    int r_count = 0;
1768
1769
0
    if (!cidx || !hdr)
1770
0
        return NULL;
1771
1772
0
    hts_itr_t *itr = NULL;
1773
0
    if (cidx->fmt == HTS_FMT_CRAI) {
1774
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, cidx->cram, cram_name2id);
1775
0
        if (!r_list)
1776
0
            return NULL;
1777
0
        itr = hts_itr_regions(idx, r_list, r_count, cram_name2id, cidx->cram,
1778
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1779
0
    } else {
1780
0
        r_list = hts_reglist_create(regarray, regcount, &r_count, hdr, (hts_name2id_f)(bam_name2id));
1781
0
        if (!r_list)
1782
0
            return NULL;
1783
0
        itr = hts_itr_regions(idx, r_list, r_count, (hts_name2id_f)(bam_name2id), hdr,
1784
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1785
0
    }
1786
1787
0
    if (!itr)
1788
0
        hts_reglist_free(r_list, r_count);
1789
1790
0
    return itr;
1791
0
}
1792
1793
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
1794
0
{
1795
0
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
1796
1797
0
    if(!cidx || !hdr || !reglist)
1798
0
        return NULL;
1799
1800
0
    if (cidx->fmt == HTS_FMT_CRAI)
1801
0
        return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
1802
0
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
1803
0
    else
1804
0
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
1805
0
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
1806
0
}
1807
1808
/**********************
1809
 *** SAM header I/O ***
1810
 **********************/
1811
1812
#include "htslib/kseq.h"
1813
#include "htslib/kstring.h"
1814
1815
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
1816
0
{
1817
0
    sam_hdr_t *bh = sam_hdr_init();
1818
0
    if (!bh) return NULL;
1819
1820
0
    if (sam_hdr_add_lines(bh, text, l_text) != 0) {
1821
0
        sam_hdr_destroy(bh);
1822
0
        return NULL;
1823
0
    }
1824
1825
0
    return bh;
1826
0
}
1827
1828
95.8k
static int valid_sam_header_type(const char *s) {
1829
95.8k
    if (s[0] != '@') return 0;
1830
95.8k
    switch (s[1]) {
1831
1.41k
    case 'H':
1832
1.41k
        return s[2] == 'D' && s[3] == '\t';
1833
63
    case 'S':
1834
63
        return s[2] == 'Q' && s[3] == '\t';
1835
84.9k
    case 'R':
1836
87.4k
    case 'P':
1837
87.4k
        return s[2] == 'G' && s[3] == '\t';
1838
6.88k
    case 'C':
1839
6.88k
        return s[2] == 'O';
1840
95.8k
    }
1841
60
    return 0;
1842
95.8k
}
1843
1844
// Minimal sanitisation of a header to ensure.
1845
// - null terminated string.
1846
// - all lines start with @ (also implies no blank lines).
1847
//
1848
// Much more could be done, but currently is not, including:
1849
// - checking header types are known (HD, SQ, etc).
1850
// - syntax (eg checking tab separated fields).
1851
// - validating n_targets matches @SQ records.
1852
// - validating target lengths against @SQ records.
1853
20.3k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1854
20.3k
    if (!h)
1855
645
        return NULL;
1856
1857
    // Special case for empty headers.
1858
19.7k
    if (h->l_text == 0)
1859
6.95k
        return h;
1860
1861
12.7k
    size_t i;
1862
12.7k
    unsigned int lnum = 0;
1863
12.7k
    char *cp = h->text, last = '\n';
1864
28.5M
    for (i = 0; i < h->l_text; i++) {
1865
        // NB: l_text excludes terminating nul.  This finds early ones.
1866
28.5M
        if (cp[i] == 0)
1867
3.87k
            break;
1868
1869
        // Error on \n[^@], including duplicate newlines
1870
28.5M
        if (last == '\n') {
1871
60.7k
            lnum++;
1872
60.7k
            if (cp[i] != '@') {
1873
24
                hts_log_error("Malformed SAM header at line %u", lnum);
1874
24
                sam_hdr_destroy(h);
1875
24
                return NULL;
1876
24
            }
1877
60.7k
        }
1878
1879
28.5M
        last = cp[i];
1880
28.5M
    }
1881
1882
12.7k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1883
3.87k
        size_t j = i;
1884
19.0k
        while (j < h->l_text && cp[j] == '\0') j++;
1885
3.87k
        if (j < h->l_text)
1886
3.71k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1887
3.87k
    }
1888
1889
    // Add trailing newline and/or trailing nul if required.
1890
12.7k
    if (last != '\n') {
1891
3.74k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1892
1893
3.74k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1894
396
            if (h->l_text >= SIZE_MAX - 2) {
1895
0
                hts_log_error("No room for extra newline");
1896
0
                sam_hdr_destroy(h);
1897
0
                return NULL;
1898
0
            }
1899
1900
396
            cp = realloc(h->text, (size_t) h->l_text+2);
1901
396
            if (!cp) {
1902
0
                sam_hdr_destroy(h);
1903
0
                return NULL;
1904
0
            }
1905
396
            h->text = cp;
1906
396
        }
1907
3.74k
        cp[i++] = '\n';
1908
1909
        // l_text may be larger already due to multiple nul padding
1910
3.74k
        if (h->l_text < i)
1911
60
            h->l_text = i;
1912
3.74k
        cp[h->l_text] = '\0';
1913
3.74k
    }
1914
1915
12.7k
    return h;
1916
12.7k
}
1917
1918
1.77k
static void known_stderr(const char *tool, const char *advice) {
1919
1.77k
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
1920
1.77k
    hts_log_warning("%s", advice);
1921
1.77k
}
1922
1923
18.7k
static void warn_if_known_stderr(const char *line) {
1924
18.7k
    if (strstr(line, "M::bwa_idx_load_from_disk") != NULL)
1925
318
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`");
1926
18.4k
    else if (strstr(line, "M::mem_pestat") != NULL)
1927
1.23k
        known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`");
1928
17.2k
    else if (strstr(line, "loaded/built the index") != NULL)
1929
213
        known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`");
1930
18.7k
}
1931
1932
12.5k
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
1933
12.5k
    kstring_t str = { 0, 0, NULL };
1934
12.5k
    khint_t k;
1935
12.5k
    sam_hdr_t* h = sam_hdr_init();
1936
12.5k
    const char *q, *r;
1937
12.5k
    char* sn = NULL;
1938
12.5k
    khash_t(s2i) *d = kh_init(s2i);
1939
12.5k
    khash_t(s2i) *long_refs = NULL;
1940
12.5k
    if (!h || !d)
1941
0
        goto error;
1942
1943
12.5k
    int ret, has_SQ = 0;
1944
12.5k
    int next_c = '@';
1945
160k
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
1946
148k
        if (fp->line.s[0] != '@')
1947
63
            break;
1948
1949
148k
        if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
1950
52.8k
            has_SQ = 1;
1951
52.8k
            hts_pos_t ln = -1;
1952
173k
            for (q = fp->line.s + 4;; ++q) {
1953
173k
                if (strncmp(q, "SN:", 3) == 0) {
1954
52.5k
                    q += 3;
1955
750M
                    for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);
1956
1957
52.5k
                    if (sn) {
1958
13.5k
                        hts_log_warning("SQ header line has more than one SN: tag");
1959
13.5k
                        free(sn);
1960
13.5k
                    }
1961
52.5k
                    sn = (char*)calloc(r - q + 1, 1);
1962
52.5k
                    if (!sn)
1963
0
                        goto error;
1964
1965
52.5k
                    strncpy(sn, q, r - q);
1966
52.5k
                    q = r;
1967
121k
                } else {
1968
121k
                    if (strncmp(q, "LN:", 3) == 0) {
1969
45.7k
                        hts_pos_t tmp = strtoll(q + 3, (char**)&q, 10);
1970
45.7k
                        if (ln != -1 && ln != tmp) { //duplicate & different LN
1971
285
                            hts_log_error("Header includes @SQ line \"%s\" with"
1972
285
                                " multiple LN: tag with different values.", sn);
1973
285
                            goto error;
1974
45.4k
                        } else {
1975
45.4k
                            ln = tmp;
1976
45.4k
                        }
1977
45.7k
                    }
1978
121k
                }
1979
1980
26.0M
                while (*q != '\t' && *q != '\n' && *q != '\0')
1981
25.9M
                    ++q;
1982
173k
                if (*q == '\0' || *q == '\n')
1983
52.5k
                    break;
1984
173k
            }
1985
52.5k
            if (sn) {
1986
38.9k
                if (ln >= 0) {
1987
33.9k
                    int absent;
1988
33.9k
                    k = kh_put(s2i, d, sn, &absent);
1989
33.9k
                    if (absent < 0)
1990
0
                        goto error;
1991
1992
33.9k
                    if (!absent) {
1993
14.5k
                        hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
1994
14.5k
                        free(sn);
1995
19.3k
                    } else {
1996
19.3k
                        sn = NULL;
1997
19.3k
                        if (ln >= UINT32_MAX) {
1998
                            // Stash away ref length that
1999
                            // doesn't fit in target_len array
2000
4.41k
                            int k2;
2001
4.41k
                            if (!long_refs) {
2002
561
                                long_refs = kh_init(s2i);
2003
561
                                if (!long_refs)
2004
0
                                    goto error;
2005
561
                            }
2006
4.41k
                            k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
2007
4.41k
                            if (absent < 0)
2008
0
                                goto error;
2009
4.41k
                            kh_val(long_refs, k2) = ln;
2010
4.41k
                            kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
2011
4.41k
                                            | UINT32_MAX);
2012
14.9k
                        } else {
2013
14.9k
                            kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
2014
14.9k
                        }
2015
19.3k
                    }
2016
33.9k
                } else {
2017
4.93k
                    hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
2018
4.93k
                    warn_if_known_stderr(fp->line.s);
2019
4.93k
                    free(sn);
2020
4.93k
                }
2021
38.9k
            } else {
2022
13.6k
                hts_log_warning("Ignored @SQ line with missing SN: tag");
2023
13.6k
                warn_if_known_stderr(fp->line.s);
2024
13.6k
            }
2025
52.5k
            sn = NULL;
2026
52.5k
        }
2027
95.8k
        else if (!valid_sam_header_type(fp->line.s)) {
2028
201
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
2029
201
            warn_if_known_stderr(fp->line.s);
2030
201
            goto error;
2031
201
        }
2032
2033
148k
        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
2034
0
            goto error;
2035
2036
148k
        if (kputc('\n', &str) < 0)
2037
0
            goto error;
2038
2039
148k
        if (fp->is_bgzf) {
2040
87.1k
            next_c = bgzf_peek(fp->fp.bgzf);
2041
87.1k
        } else {
2042
61.0k
            unsigned char nc;
2043
61.0k
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
2044
61.0k
            next_c = pret > 0 ? nc : pret - 1;
2045
61.0k
        }
2046
148k
        if (next_c < -1)
2047
3
            goto error;
2048
148k
    }
2049
12.0k
    if (next_c != '@')
2050
11.9k
        fp->line.l = 0;
2051
2052
12.0k
    if (ret < -1)
2053
21
        goto error;
2054
2055
12.0k
    if (!has_SQ && fp->fn_aux) {
2056
0
        kstring_t line = { 0, 0, NULL };
2057
2058
        /* The reference index (.fai) is actually needed here */
2059
0
        char *fai_fn = fp->fn_aux;
2060
0
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
2061
0
        if (fn_delim)
2062
0
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);
2063
2064
0
        hFILE* f = hopen(fai_fn, "r");
2065
0
        int e = 0, absent;
2066
0
        if (f == NULL)
2067
0
            goto error;
2068
2069
0
        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
2070
0
            char* tab = strchr(line.s, '\t');
2071
0
            hts_pos_t ln;
2072
2073
0
            if (tab == NULL)
2074
0
                continue;
2075
2076
0
            sn = (char*)calloc(tab-line.s+1, 1);
2077
0
            if (!sn) {
2078
0
                e = 1;
2079
0
                break;
2080
0
            }
2081
0
            memcpy(sn, line.s, tab-line.s);
2082
0
            k = kh_put(s2i, d, sn, &absent);
2083
0
            if (absent < 0) {
2084
0
                e = 1;
2085
0
                break;
2086
0
            }
2087
2088
0
            ln = strtoll(tab, NULL, 10);
2089
2090
0
            if (!absent) {
2091
0
                hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
2092
0
                free(sn);
2093
0
                sn = NULL;
2094
0
            } else {
2095
0
                sn = NULL;
2096
0
                if (ln >= UINT32_MAX) {
2097
                    // Stash away ref length that
2098
                    // doesn't fit in target_len array
2099
0
                    khint_t k2;
2100
0
                    int absent = -1;
2101
0
                    if (!long_refs) {
2102
0
                        long_refs = kh_init(s2i);
2103
0
                        if (!long_refs) {
2104
0
                            e = 1;
2105
0
                            break;
2106
0
                        }
2107
0
                    }
2108
0
                    k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
2109
0
                    if (absent < 0) {
2110
0
                         e = 1;
2111
0
                         break;
2112
0
                    }
2113
0
                    kh_val(long_refs, k2) = ln;
2114
0
                    kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
2115
0
                                    | UINT32_MAX);
2116
0
                } else {
2117
0
                    kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
2118
0
                }
2119
0
                has_SQ = 1;
2120
0
            }
2121
2122
0
            e |= kputs("@SQ\tSN:", &str) < 0;
2123
0
            e |= kputsn(line.s, tab - line.s, &str) < 0;
2124
0
            e |= kputs("\tLN:", &str) < 0;
2125
0
            e |= kputll(ln, &str) < 0;
2126
0
            e |= kputc('\n', &str) < 0;
2127
0
            if (e)
2128
0
                break;
2129
0
        }
2130
2131
0
        ks_free(&line);
2132
0
        if (hclose(f) != 0) {
2133
0
            hts_log_error("Error on closing %s", fai_fn);
2134
0
            e = 1;
2135
0
        }
2136
0
        if (e)
2137
0
            goto error;
2138
0
    }
2139
2140
12.0k
    if (has_SQ) {
2141
        // Populate the targets array
2142
7.45k
        h->n_targets = kh_size(d);
2143
2144
7.45k
        h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
2145
7.45k
        if (!h->target_name) {
2146
0
            h->n_targets = 0;
2147
0
            goto error;
2148
0
        }
2149
2150
7.45k
        h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
2151
7.45k
        if (!h->target_len) {
2152
0
            h->n_targets = 0;
2153
0
            goto error;
2154
0
        }
2155
2156
54.4k
        for (k = kh_begin(d); k != kh_end(d); ++k) {
2157
46.9k
            if (!kh_exist(d, k))
2158
27.9k
                continue;
2159
2160
19.0k
            h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
2161
19.0k
            h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
2162
19.0k
            kh_val(d, k) >>= 32;
2163
19.0k
        }
2164
7.45k
    }
2165
2166
    // Repurpose sdict to hold any references longer than UINT32_MAX
2167
12.0k
    h->sdict = long_refs;
2168
2169
12.0k
    kh_destroy(s2i, d);
2170
2171
12.0k
    if (str.l == 0)
2172
63
        kputsn("", 0, &str);
2173
12.0k
    h->l_text = str.l;
2174
12.0k
    h->text = ks_release(&str);
2175
12.0k
    fp->bam_header = sam_hdr_sanitise(h);
2176
12.0k
    fp->bam_header->ref_count = 1;
2177
2178
12.0k
    return fp->bam_header;
2179
2180
510
 error:
2181
510
    if (h && d && (!h->target_name || !h->target_len)) {
2182
1.27k
        for (k = kh_begin(d); k != kh_end(d); ++k)
2183
768
            if (kh_exist(d, k)) free((void *)kh_key(d, k));
2184
510
    }
2185
510
    sam_hdr_destroy(h);
2186
510
    ks_free(&str);
2187
510
    kh_destroy(s2i, d);
2188
510
    kh_destroy(s2i, long_refs);
2189
510
    if (sn) free(sn);
2190
510
    return NULL;
2191
12.0k
}
2192
2193
sam_hdr_t *sam_hdr_read(htsFile *fp)
2194
28.1k
{
2195
28.1k
    if (!fp) {
2196
0
        errno = EINVAL;
2197
0
        return NULL;
2198
0
    }
2199
2200
28.1k
    switch (fp->format.format) {
2201
2.87k
    case bam:
2202
2.87k
        return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
2203
2204
5.42k
    case cram:
2205
5.42k
        return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
2206
2207
12.5k
    case sam:
2208
12.5k
        return sam_hdr_create(fp);
2209
2210
411
    case fastq_format:
2211
7.27k
    case fasta_format:
2212
7.27k
        return sam_hdr_init();
2213
2214
0
    case empty_format:
2215
0
        errno = EPIPE;
2216
0
        return NULL;
2217
2218
0
    default:
2219
0
        errno = EFTYPE;
2220
0
        return NULL;
2221
28.1k
    }
2222
28.1k
}
2223
2224
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
2225
26.9k
{
2226
26.9k
    if (!fp || !h) {
2227
0
        errno = EINVAL;
2228
0
        return -1;
2229
0
    }
2230
2231
26.9k
    switch (fp->format.format) {
2232
8.98k
    case binary_format:
2233
8.98k
        fp->format.category = sequence_data;
2234
8.98k
        fp->format.format = bam;
2235
        /* fall-through */
2236
8.98k
    case bam:
2237
8.98k
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
2238
8.98k
        break;
2239
2240
8.98k
    case cram: {
2241
8.98k
        cram_fd *fd = fp->fp.cram;
2242
8.98k
        if (cram_set_header2(fd, h) < 0) return -1;
2243
7.93k
        if (fp->fn_aux)
2244
0
            cram_load_reference(fd, fp->fn_aux);
2245
7.93k
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
2246
7.93k
        }
2247
7.93k
        break;
2248
2249
8.98k
    case text_format:
2250
8.98k
        fp->format.category = sequence_data;
2251
8.98k
        fp->format.format = sam;
2252
        /* fall-through */
2253
8.98k
    case sam: {
2254
8.98k
        if (!h->hrecs && !h->text)
2255
0
            return 0;
2256
8.98k
        char *text;
2257
8.98k
        kstring_t hdr_ks = { 0, 0, NULL };
2258
8.98k
        size_t l_text;
2259
8.98k
        ssize_t bytes;
2260
8.98k
        int r = 0, no_sq = 0;
2261
2262
8.98k
        if (h->hrecs) {
2263
8.04k
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
2264
0
                return -1;
2265
8.04k
            text = hdr_ks.s;
2266
8.04k
            l_text = hdr_ks.l;
2267
8.04k
        } else {
2268
938
            const char *p = NULL;
2269
1.16k
            do {
2270
1.16k
                const char *q = p == NULL ? h->text : p + 4;
2271
1.16k
                p = strstr(q, "@SQ\t");
2272
1.16k
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
2273
938
            no_sq = p == NULL;
2274
938
            text = h->text;
2275
938
            l_text = h->l_text;
2276
938
        }
2277
2278
8.98k
        if (fp->is_bgzf) {
2279
0
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
2280
8.98k
        } else {
2281
8.98k
            bytes = hwrite(fp->fp.hfile, text, l_text);
2282
8.98k
        }
2283
8.98k
        free(hdr_ks.s);
2284
8.98k
        if (bytes != l_text)
2285
0
            return -1;
2286
2287
8.98k
        if (no_sq) {
2288
492
            int i;
2289
1.28k
            for (i = 0; i < h->n_targets; ++i) {
2290
789
                fp->line.l = 0;
2291
789
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
2292
789
                r |= kputs(h->target_name[i], &fp->line) < 0;
2293
789
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
2294
789
                r |= kputw(h->target_len[i], &fp->line) < 0;
2295
789
                r |= kputc('\n', &fp->line) < 0;
2296
789
                if (r != 0)
2297
0
                    return -1;
2298
2299
789
                if (fp->is_bgzf) {
2300
0
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
2301
789
                } else {
2302
789
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
2303
789
                }
2304
789
                if (bytes != fp->line.l)
2305
0
                    return -1;
2306
789
            }
2307
492
        }
2308
8.98k
        if (fp->is_bgzf) {
2309
0
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
2310
8.98k
        } else {
2311
8.98k
            if (hflush(fp->fp.hfile) != 0) return -1;
2312
8.98k
        }
2313
8.98k
        }
2314
8.98k
        break;
2315
2316
8.98k
    case fastq_format:
2317
0
    case fasta_format:
2318
        // Nothing to output; FASTQ has no file headers.
2319
0
        break;
2320
2321
0
    default:
2322
0
        errno = EBADF;
2323
0
        return -1;
2324
26.9k
    }
2325
25.9k
    return 0;
2326
26.9k
}
2327
2328
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2329
0
{
2330
0
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
2331
0
    size_t new_l_text;
2332
0
    if (!h || !key)
2333
0
        return -1;
2334
2335
0
    if (h->l_text > 3) {
2336
0
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
2337
0
            if ((p = strchr(h->text, '\n')) == 0) return -1;
2338
0
            *p = '\0'; // for strstr call
2339
2340
0
            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };
2341
2342
0
            if ((q = strstr(h->text, tmp)) != 0) { // key exists
2343
0
                *p = '\n'; // change back
2344
2345
                // mark the key:val
2346
0
                beg = q;
2347
0
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
2348
0
                end = q;
2349
2350
0
                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
2351
0
                    && strlen(val) == end - beg - 4)
2352
0
                     return 0; // val is the same, no need to change
2353
2354
0
            } else {
2355
0
                beg = end = p;
2356
0
                *p = '\n';
2357
0
            }
2358
0
        }
2359
0
    }
2360
0
    if (beg == NULL) { // no @HD
2361
0
        new_l_text = h->l_text;
2362
0
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
2363
0
            return -1;
2364
0
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
2365
0
        if (val) {
2366
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2367
0
                return -1;
2368
0
            new_l_text += strlen(val) + 4;
2369
0
        }
2370
0
        newtext = (char*)malloc(new_l_text + 1);
2371
0
        if (!newtext) return -1;
2372
2373
0
        if (val)
2374
0
            snprintf(newtext, new_l_text + 1,
2375
0
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, h->text);
2376
0
        else
2377
0
            snprintf(newtext, new_l_text + 1,
2378
0
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, h->text);
2379
0
    } else { // has @HD but different or no key
2380
0
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
2381
0
        if (val) {
2382
0
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
2383
0
                return -1;
2384
0
            new_l_text += strlen(val) + 4;
2385
0
        }
2386
0
        newtext = (char*)malloc(new_l_text + 1);
2387
0
        if (!newtext) return -1;
2388
2389
0
        if (val) {
2390
0
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
2391
0
                    (int) (beg - h->text), h->text, key, val, end);
2392
0
        } else { //delete key
2393
0
            snprintf(newtext, new_l_text + 1, "%.*s%s",
2394
0
                    (int) (beg - h->text), h->text, end);
2395
0
        }
2396
0
    }
2397
0
    free(h->text);
2398
0
    h->text = newtext;
2399
0
    h->l_text = new_l_text;
2400
0
    return 0;
2401
0
}
2402
2403
2404
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
2405
0
{
2406
0
    if (!h || !key)
2407
0
        return -1;
2408
2409
0
    if (!h->hrecs)
2410
0
        return old_sam_hdr_change_HD(h, key, val);
2411
2412
0
    if (val) {
2413
0
        if (sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL) != 0)
2414
0
            return -1;
2415
0
    } else {
2416
0
        if (sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key) != 0)
2417
0
            return -1;
2418
0
    }
2419
0
    return sam_hdr_rebuild(h);
2420
0
}
2421
/**********************
2422
 *** SAM record I/O ***
2423
 **********************/
2424
2425
// The speed of this code can vary considerably depending on minor code
2426
// changes elsewhere as some of the tight loops are particularly prone to
2427
// speed changes when the instruction blocks are split over a 32-byte
2428
// boundary.  To protect against this, we explicitly specify an alignment
2429
// for this function.  If this is insufficient, we may also wish to
2430
// consider alignment of blocks within this function via
2431
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2432
// However it's not very portable.
2433
// Instead we break into separate functions so we can explicitly specify
2434
// use __attribute__((aligned(32))) instead and force consistent loop
2435
// alignment.
2436
270k
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
2437
    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
2438
270k
    if (*n > INT32_MAX*0.666) {
2439
0
        errno = ENOMEM;
2440
0
        return -1;
2441
0
    }
2442
2443
270k
    size_t bytes = (size_t)size * (size_t)(*n>>1);
2444
270k
    if (possibly_expand_bam_data(b, bytes) < 0) {
2445
0
        hts_log_error("Out of memory");
2446
0
        return -1;
2447
0
    }
2448
2449
270k
    (*n)+=*n>>1;
2450
270k
    return 0;
2451
270k
}
2452
2453
2454
// This ensures that q always ends up at the next comma after
2455
// reading a number even if it's followed by junk.  It
2456
// prevents the possibility of trying to read more than n items.
2457
10.6M
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)
2458
2459
HTS_ALIGN32
2460
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2461
37.6k
                               uint32_t *nalloc, int *overflow) {
2462
4.04M
    while (*q == ',') {
2463
4.00M
        if ((*nused)++ >= (*nalloc)) {
2464
255k
            if (grow_B_array(b, nalloc, 1) < 0)
2465
0
                return NULL;
2466
255k
        }
2467
4.00M
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2468
4.00M
        b->l_data++;
2469
4.00M
    }
2470
37.6k
    return q;
2471
37.6k
}
2472
2473
HTS_ALIGN32
2474
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2475
17.7k
                               uint32_t *nalloc, int *overflow) {
2476
539k
    while (*q == ',') {
2477
521k
        if ((*nused)++ >= (*nalloc)) {
2478
1.30k
            if (grow_B_array(b, nalloc, 1) < 0)
2479
0
                return NULL;
2480
1.30k
        }
2481
521k
        if (q[1] != '-') {
2482
509k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2483
509k
            b->l_data++;
2484
509k
        } else {
2485
11.7k
            *overflow = 1;
2486
11.7k
            q++;
2487
11.7k
            skip_to_comma_(q);
2488
11.7k
        }
2489
521k
    }
2490
17.7k
    return q;
2491
17.7k
}
2492
2493
HTS_ALIGN32
2494
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2495
8.32k
                               uint32_t *nalloc, int *overflow) {
2496
241k
    while (*q == ',') {
2497
233k
        if ((*nused)++ >= (*nalloc)) {
2498
2.79k
            if (grow_B_array(b, nalloc, 2) < 0)
2499
0
                return NULL;
2500
2.79k
        }
2501
233k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2502
233k
                  b->data + b->l_data);
2503
233k
        b->l_data += 2;
2504
233k
    }
2505
8.32k
    return q;
2506
8.32k
}
2507
2508
HTS_ALIGN32
2509
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2510
5.96k
                               uint32_t *nalloc, int *overflow) {
2511
5.84M
    while (*q == ',') {
2512
5.83M
        if ((*nused)++ >= (*nalloc)) {
2513
7.25k
            if (grow_B_array(b, nalloc, 2) < 0)
2514
0
                return NULL;
2515
7.25k
        }
2516
5.83M
        if (q[1] != '-') {
2517
5.77M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2518
5.77M
                      b->data + b->l_data);
2519
5.77M
            b->l_data += 2;
2520
5.77M
        } else {
2521
60.8k
            *overflow = 1;
2522
60.8k
            q++;
2523
60.8k
            skip_to_comma_(q);
2524
60.8k
        }
2525
5.83M
    }
2526
5.96k
    return q;
2527
5.96k
}
2528
2529
HTS_ALIGN32
2530
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2531
15.8k
                               uint32_t *nalloc, int *overflow) {
2532
6.56M
    while (*q == ',') {
2533
6.54M
        if ((*nused)++ >= (*nalloc)) {
2534
590
            if (grow_B_array(b, nalloc, 4) < 0)
2535
0
                return NULL;
2536
590
        }
2537
6.54M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2538
6.54M
                  b->data + b->l_data);
2539
6.54M
        b->l_data += 4;
2540
6.54M
    }
2541
15.8k
    return q;
2542
15.8k
}
2543
2544
HTS_ALIGN32
2545
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2546
31.1k
                               uint32_t *nalloc, int *overflow) {
2547
3.65M
    while (*q == ',') {
2548
3.62M
        if ((*nused)++ >= (*nalloc)) {
2549
2.15k
            if (grow_B_array(b, nalloc, 4) < 0)
2550
0
                return NULL;
2551
2.15k
        }
2552
3.62M
        if (q[1] != '-') {
2553
3.62M
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2554
3.62M
                      b->data + b->l_data);
2555
3.62M
            b->l_data += 4;
2556
3.62M
        } else {
2557
1.62k
            *overflow = 1;
2558
1.62k
            q++;
2559
1.62k
            skip_to_comma_(q);
2560
1.62k
        }
2561
3.62M
    }
2562
31.1k
    return q;
2563
31.1k
}
2564
2565
HTS_ALIGN32
2566
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2567
4.90k
                               uint32_t *nalloc, int *overflow) {
2568
13.6k
    while (*q == ',') {
2569
8.78k
        if ((*nused)++ >= (*nalloc)) {
2570
796
            if (grow_B_array(b, nalloc, 4) < 0)
2571
0
                return NULL;
2572
796
        }
2573
8.78k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2574
8.78k
        b->l_data += 4;
2575
8.78k
    }
2576
4.90k
    return q;
2577
4.90k
}
2578
2579
HTS_ALIGN32
2580
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2581
                              char **end, bam1_t *b,
2582
121k
                              int *ctr) {
2583
    // Protect against infinite recursion when dealing with invalid input.
2584
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2585
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2586
    //
2587
    // Loop detection is the safest solution incase there are other
2588
    // strange corner cases with malformed inputs.
2589
121k
    if (++(*ctr) > 2) {
2590
69
        hts_log_error("Malformed data in B:%c array", type);
2591
69
        return -1;
2592
69
    }
2593
2594
121k
    int orig_l = b->l_data;
2595
121k
    char *q = in;
2596
121k
    int32_t size;
2597
121k
    size_t bytes;
2598
121k
    int overflow = 0;
2599
2600
121k
    size = aux_type2size(type);
2601
121k
    if (size <= 0 || size > 4) {
2602
23
        hts_log_error("Unrecognized type B:%c", type);
2603
23
        return -1;
2604
23
    }
2605
2606
    // Ensure space for type + values.
2607
    // The first pass through here we don't know the number of entries and
2608
    // nalloc == 0.  We start with a small working set and then parse the
2609
    // data, growing as needed.
2610
    //
2611
    // If we have a second pass through we do know the number of entries
2612
    // and nalloc is already known.  We have no need to expand the bam data.
2613
121k
    if (!nalloc)
2614
82.6k
         nalloc=7;
2615
2616
    // Ensure allocated memory is big enough (for current nalloc estimate)
2617
121k
    bytes = (size_t) nalloc * (size_t) size;
2618
121k
    if (bytes / size != nalloc
2619
121k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2620
0
        hts_log_error("Out of memory");
2621
0
        return -1;
2622
0
    }
2623
2624
121k
    uint32_t nused = 0;
2625
2626
121k
    b->data[b->l_data++] = 'B';
2627
121k
    b->data[b->l_data++] = type;
2628
    // 32-bit B-array length is inserted later once we know it.
2629
121k
    int b_len_idx = b->l_data;
2630
121k
    b->l_data += sizeof(uint32_t);
2631
2632
121k
    if (type == 'c') {
2633
37.6k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2634
0
            return -1;
2635
83.9k
    } else if (type == 'C') {
2636
17.7k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2637
0
            return -1;
2638
66.2k
    } else if (type == 's') {
2639
8.32k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2640
0
            return -1;
2641
57.9k
    } else if (type == 'S') {
2642
5.96k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2643
0
            return -1;
2644
51.9k
    } else if (type == 'i') {
2645
15.8k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2646
0
            return -1;
2647
36.0k
    } else if (type == 'I') {
2648
31.1k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2649
0
            return -1;
2650
31.1k
    } else if (type == 'f') {
2651
4.90k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2652
0
            return -1;
2653
4.90k
    }
2654
121k
    if (*q != '\t' && *q != '\0') {
2655
        // Unknown B array type or junk in the numbers
2656
219
        hts_log_error("Malformed B:%c", type);
2657
219
        return -1;
2658
219
    }
2659
121k
    i32_to_le(nused, b->data + b_len_idx);
2660
2661
121k
    if (!overflow) {
2662
82.0k
        *end = q;
2663
82.0k
        return 0;
2664
82.0k
    } else {
2665
39.3k
        int64_t max = 0, min = 0, val;
2666
        // Given type was incorrect.  Try to rescue the situation.
2667
39.3k
        char *r = q;
2668
39.3k
        q = in;
2669
39.3k
        overflow = 0;
2670
39.3k
        b->l_data = orig_l;
2671
        // Find out what range of values is present
2672
10.2M
        while (q < r) {
2673
10.2M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2674
10.2M
            if (max < val) max = val;
2675
10.2M
            if (min > val) min = val;
2676
10.2M
            skip_to_comma_(q);
2677
10.2M
        }
2678
        // Retry with appropriate type
2679
39.3k
        if (!overflow) {
2680
39.2k
            if (min < 0) {
2681
17.4k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2682
900
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2683
16.5k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2684
1.28k
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2685
15.2k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2686
15.0k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2687
15.0k
                }
2688
21.8k
            } else {
2689
21.8k
                if (max < UINT8_MAX) {
2690
942
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2691
20.8k
                } else if (max <= UINT16_MAX) {
2692
931
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2693
19.9k
                } else if (max <= UINT32_MAX) {
2694
19.8k
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2695
19.8k
                }
2696
21.8k
            }
2697
39.2k
        }
2698
        // If here then at least one of the values is too big to store
2699
337
        hts_log_error("Numeric value in B array out of allowed range");
2700
337
        return -1;
2701
39.3k
    }
2702
121k
#undef skip_to_comma_
2703
121k
}
2704
2705
HTS_ALIGN32
2706
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2707
82.6k
{
2708
82.6k
    int ctr = 0;
2709
82.6k
    uint32_t nalloc = 0;
2710
82.6k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2711
82.6k
}
2712
2713
235k
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
2714
235k
    if (*v >= '1' && *v <= '9') {
2715
123k
        return hts_str2uint(v, rv, 16, overflow);
2716
123k
    }
2717
112k
    else if (*v == '0') {
2718
        // handle single-digit "0" directly; otherwise it's hex or octal
2719
19.6k
        if (v[1] == '\t') { *rv = v+1; return 0; }
2720
289
        else {
2721
289
            unsigned long val = strtoul(v, rv, 0);
2722
289
            if (val > 65535) { *overflow = 1; return 65535; }
2723
202
            return val;
2724
289
        }
2725
19.6k
    }
2726
92.5k
    else {
2727
        // TODO implement symbolic flag letters
2728
92.5k
        *rv = v;
2729
92.5k
        return 0;
2730
92.5k
    }
2731
235k
}
2732
2733
// Parse tag line and append to bam object b.
2734
// Shared by both SAM and FASTQ parsers.
2735
//
2736
// The difference between the two is how lenient we are to recognising
2737
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2738
// non-SAM looking strings.
2739
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2740
233k
                            khash_t(tag) *tag_whitelist) {
2741
233k
    int overflow = 0;
2742
233k
    int checkpoint;
2743
233k
    char logbuf[40];
2744
233k
    char *q = start, *p = end;
2745
2746
233k
#define _parse_err(cond, ...)                   \
2747
5.26M
    do {                                        \
2748
11.2M
        if (cond) {                             \
2749
479
            if (lenient) {                      \
2750
0
                while (q < p && !isspace_c(*q))   \
2751
0
                    q++;                        \
2752
0
                while (q < p && isspace_c(*q))    \
2753
0
                    q++;                        \
2754
0
                b->l_data = checkpoint;         \
2755
0
                goto loop;                      \
2756
479
            } else {                            \
2757
479
                hts_log_error(__VA_ARGS__);     \
2758
479
                goto err_ret;                   \
2759
479
            }                                   \
2760
479
        }                                       \
2761
5.26M
    } while (0)
2762
2763
4.95M
    while (q < p) loop: {
2764
4.95M
        char type;
2765
4.95M
        checkpoint = b->l_data;
2766
4.95M
        if (p - q < 5) {
2767
107
            if (lenient) {
2768
0
                break;
2769
107
            } else {
2770
107
                hts_log_error("Incomplete aux field");
2771
107
                goto err_ret;
2772
107
            }
2773
107
        }
2774
2.47M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2775
2776
2.47M
        if (lenient && (q[2] | q[4]) != ':') {
2777
0
            while (q < p && !isspace_c(*q))
2778
0
                q++;
2779
0
            while (q < p && isspace_c(*q))
2780
0
                q++;
2781
0
            continue;
2782
0
        }
2783
2784
2.47M
        if (tag_whitelist) {
2785
0
            int tt = q[0]*256 + q[1];
2786
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2787
0
                while (q < p && *q != '\t')
2788
0
                    q++;
2789
0
                continue;
2790
0
            }
2791
0
        }
2792
2793
        // Copy over id
2794
2.47M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2795
2.47M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2796
2.47M
        q += 3; type = *q++; ++q; // q points to value
2797
2.47M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2798
2.20M
            _parse_err(*q <= '\t', "incomplete aux field");
2799
2800
        // Ensure enough space for a double + type allocated.
2801
2.47M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2802
2803
2.47M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2804
844k
            b->data[b->l_data++] = 'A';
2805
844k
            b->data[b->l_data++] = *q++;
2806
1.63M
        } else if (type == 'i' || type == 'I') {
2807
1.21M
            if (*q == '-') {
2808
970k
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2809
970k
                if (x >= INT8_MIN) {
2810
477k
                    b->data[b->l_data++] = 'c';
2811
477k
                    b->data[b->l_data++] = x;
2812
492k
                } else if (x >= INT16_MIN) {
2813
146k
                    b->data[b->l_data++] = 's';
2814
146k
                    i16_to_le(x, b->data + b->l_data);
2815
146k
                    b->l_data += 2;
2816
346k
                } else {
2817
346k
                    b->data[b->l_data++] = 'i';
2818
346k
                    i32_to_le(x, b->data + b->l_data);
2819
346k
                    b->l_data += 4;
2820
346k
                }
2821
970k
            } else {
2822
242k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2823
242k
                if (x <= UINT8_MAX) {
2824
153k
                    b->data[b->l_data++] = 'C';
2825
153k
                    b->data[b->l_data++] = x;
2826
153k
                } else if (x <= UINT16_MAX) {
2827
74.4k
                    b->data[b->l_data++] = 'S';
2828
74.4k
                    u16_to_le(x, b->data + b->l_data);
2829
74.4k
                    b->l_data += 2;
2830
74.4k
                } else {
2831
14.0k
                    b->data[b->l_data++] = 'I';
2832
14.0k
                    u32_to_le(x, b->data + b->l_data);
2833
14.0k
                    b->l_data += 4;
2834
14.0k
                }
2835
242k
            }
2836
1.21M
        } else if (type == 'f') {
2837
25.4k
            b->data[b->l_data++] = 'f';
2838
25.4k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2839
25.4k
            b->l_data += sizeof(float);
2840
392k
        } else if (type == 'd') {
2841
36.4k
            b->data[b->l_data++] = 'd';
2842
36.4k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2843
36.4k
            b->l_data += sizeof(double);
2844
356k
        } else if (type == 'Z' || type == 'H') {
2845
273k
            char *end = strchr(q, '\t');
2846
273k
            if (!end) end = q + strlen(q);
2847
273k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2848
273k
                       "hex field does not have an even number of digits");
2849
273k
            b->data[b->l_data++] = type;
2850
273k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2851
273k
            memcpy(b->data + b->l_data, q, end - q);
2852
273k
            b->l_data += end - q;
2853
273k
            b->data[b->l_data++] = '\0';
2854
273k
            q = end;
2855
273k
        } else if (type == 'B') {
2856
82.6k
            type = *q++; // q points to the first ',' following the typing byte
2857
82.6k
            _parse_err(*q && *q != ',' && *q != '\t',
2858
82.6k
                       "B aux field type not followed by ','");
2859
2860
82.6k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2861
648
                goto err_ret;
2862
82.6k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2863
2864
13.9M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2865
2.47M
        q++;
2866
2.47M
    }
2867
2868
231k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2869
231k
#undef _parse_err
2870
2871
231k
    return 0;
2872
2873
1.23k
err_ret:
2874
1.23k
    return -2;
2875
231k
}
2876
2877
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
2878
236k
{
2879
955k
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)
2880
2881
236k
#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
2882
2883
// Macro that operates on 64-bits at a time.
2884
236k
#define COPY_MINUS_N(to,from,n,l,failed)                        \
2885
236k
    do {                                                        \
2886
223k
        uint64_u *from8 = (uint64_u *)(from);                   \
2887
223k
        uint64_u *to8 = (uint64_u *)(to);                       \
2888
223k
        uint64_t uflow = 0;                                     \
2889
223k
        size_t l8 = (l)>>3, i;                                  \
2890
224k
        for (i = 0; i < l8; i++) {                              \
2891
203
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
2892
203
            uflow |= to8[i];                                    \
2893
203
        }                                                       \
2894
225k
        for (i<<=3; i < (l); ++i) {                             \
2895
1.34k
            to[i] = from[i] - (n);                              \
2896
1.34k
            uflow |= to[i];                                     \
2897
1.34k
        }                                                       \
2898
223k
        failed = (uflow & 0x8080808080808080UL) > 0;            \
2899
223k
    } while (0)
2900
2901
#else
2902
2903
// Basic version which operates a byte at a time
2904
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
2905
        uint8_t uflow = 0;                                   \
2906
        for (i = 0; i < (l); ++i) {                          \
2907
            (to)[i] = (from)[i] - (n);                       \
2908
            uflow |= (uint8_t) (to)[i];                      \
2909
        }                                                    \
2910
        failed = (uflow & 0x80) > 0;                         \
2911
    } while (0)
2912
2913
#endif
2914
2915
446k
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
2916
3.35M
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
2917
793k
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)
2918
2919
236k
    uint8_t *t;
2920
2921
236k
    char *p = s->s, *q;
2922
236k
    int i, overflow = 0;
2923
236k
    char logbuf[40];
2924
236k
    hts_pos_t cigreflen;
2925
236k
    bam1_core_t *c = &b->core;
2926
2927
236k
    b->l_data = 0;
2928
236k
    memset(c, 0, 32);
2929
2930
    // qname
2931
236k
    q = _read_token(p);
2932
2933
235k
    _parse_warn(p - q <= 1, "empty query name");
2934
235k
    _parse_err(p - q > 255, "query name too long");
2935
    // resize large enough for name + extranul
2936
235k
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
2937
235k
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;
2938
2939
235k
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
2940
235k
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
2941
235k
    b->l_data += c->l_extranul;
2942
2943
235k
    c->l_qname = p - q + c->l_extranul;
2944
2945
    // flag
2946
235k
    c->flag = parse_sam_flag(p, &p, &overflow);
2947
235k
    if (*p++ != '\t') goto err_ret; // malformated flag
2948
2949
    // chr
2950
235k
    q = _read_token(p);
2951
235k
    if (strcmp(q, "*")) {
2952
223k
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
2953
223k
        c->tid = bam_name2id(h, q);
2954
223k
        _parse_err(c->tid < -1, "failed to parse header");
2955
223k
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
2956
223k
    } else c->tid = -1;
2957
2958
    // pos
2959
235k
    c->pos = hts_str2uint(p, &p, 62, &overflow) - 1;
2960
235k
    if (*p++ != '\t') goto err_ret;
2961
234k
    if (c->pos < 0 && c->tid >= 0) {
2962
6.02k
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
2963
6.02k
        c->tid = -1;
2964
6.02k
    }
2965
234k
    if (c->tid < 0) c->flag |= BAM_FUNMAP;
2966
2967
    // mapq
2968
234k
    c->qual = hts_str2uint(p, &p, 8, &overflow);
2969
234k
    if (*p++ != '\t') goto err_ret;
2970
    // cigar
2971
234k
    if (*p != '*') {
2972
217k
        uint32_t *cigar = NULL;
2973
217k
        int old_l_data = b->l_data;
2974
217k
        int n_cigar = bam_parse_cigar(p, &p, b);
2975
217k
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
2976
217k
        cigar = (uint32_t *)(b->data + old_l_data);
2977
2978
        // can't use bam_endpos() directly as some fields not yet set up
2979
217k
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
2980
217k
        if (cigreflen == 0) cigreflen = 1;
2981
217k
    } else {
2982
16.8k
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
2983
16.8k
        c->flag |= BAM_FUNMAP;
2984
16.8k
        q = _read_token(p);
2985
16.8k
        cigreflen = 1;
2986
16.8k
    }
2987
233k
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
2988
233k
               "read ends beyond highest supported position");
2989
233k
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
2990
    // mate chr
2991
233k
    q = _read_token(p);
2992
233k
    if (strcmp(q, "=") == 0) {
2993
419
        c->mtid = c->tid;
2994
233k
    } else if (strcmp(q, "*") == 0) {
2995
461
        c->mtid = -1;
2996
232k
    } else {
2997
232k
        c->mtid = bam_name2id(h, q);
2998
232k
        _parse_err(c->mtid < -1, "failed to parse header");
2999
232k
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
3000
232k
    }
3001
    // mpos
3002
233k
    c->mpos = hts_str2uint(p, &p, 62, &overflow) - 1;
3003
233k
    if (*p++ != '\t') goto err_ret;
3004
233k
    if (c->mpos < 0 && c->mtid >= 0) {
3005
78.0k
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
3006
78.0k
        c->mtid = -1;
3007
78.0k
    }
3008
    // tlen
3009
233k
    c->isize = hts_str2int(p, &p, 63, &overflow);
3010
233k
    if (*p++ != '\t') goto err_ret;
3011
233k
    _parse_err(overflow, "number outside allowed range");
3012
    // seq
3013
233k
    q = _read_token(p);
3014
233k
    if (strcmp(q, "*")) {
3015
213k
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
3016
213k
        c->l_qseq = p - q - 1;
3017
213k
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
3018
213k
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
3019
213k
        i = (c->l_qseq + 1) >> 1;
3020
213k
        _get_mem(uint8_t, &t, b, i);
3021
3022
213k
        unsigned int lqs2 = c->l_qseq&~1, i;
3023
332k
        for (i = 0; i < lqs2; i+=2)
3024
119k
            t[i>>1] = (seq_nt16_table[(unsigned char)q[i]] << 4) | seq_nt16_table[(unsigned char)q[i+1]];
3025
222k
        for (; i < c->l_qseq; ++i)
3026
9.62k
            t[i>>1] = seq_nt16_table[(unsigned char)q[i]] << ((~i&1)<<2);
3027
213k
    } else c->l_qseq = 0;
3028
    // qual
3029
466k
    _get_mem(uint8_t, &t, b, c->l_qseq);
3030
466k
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
3031
9.22k
        memset(t, 0xff, c->l_qseq);
3032
9.22k
        p += 2;
3033
224k
    } else {
3034
224k
        int failed = 0;
3035
224k
        _parse_err(s->l - (p - s->s) < c->l_qseq
3036
224k
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
3037
224k
                   "SEQ and QUAL are of different length");
3038
223k
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
3039
223k
        _parse_err(failed, "invalid QUAL character");
3040
223k
        p += c->l_qseq + 1;
3041
223k
    }
3042
3043
    // aux
3044
233k
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
3045
1.23k
        goto err_ret;
3046
3047
231k
    if (bam_tag2cigar(b, 1, 1) < 0)
3048
0
        return -2;
3049
231k
    return 0;
3050
3051
0
#undef _parse_warn
3052
0
#undef _parse_err
3053
0
#undef _get_mem
3054
0
#undef _read_token
3055
4.52k
err_ret:
3056
4.52k
    return -2;
3057
231k
}
3058
3059
217k
static uint32_t read_ncigar(const char *q) {
3060
217k
    uint32_t n_cigar = 0;
3061
2.73M
    for (; *q && *q != '\t'; ++q)
3062
2.51M
        if (!isdigit_c(*q)) ++n_cigar;
3063
217k
    if (!n_cigar) {
3064
107
        hts_log_error("No CIGAR operations");
3065
107
        return 0;
3066
107
    }
3067
217k
    if (n_cigar >= 2147483647) {
3068
0
        hts_log_error("Too many CIGAR operations");
3069
0
        return 0;
3070
0
    }
3071
3072
217k
    return n_cigar;
3073
217k
}
3074
3075
/*! @function
3076
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
3077
 @param  in      [in]  pointer to the source string
3078
 @param  a_cigar [out]  address of the destination uint32_t buffer
3079
 @return         number of processed input characters; 0 on error
3080
 */
3081
217k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
3082
217k
    int i, overflow = 0;
3083
217k
    const char *p = in;
3084
536k
    for (i = 0; i < n_cigar; i++) {
3085
319k
        uint32_t len;
3086
319k
        int op;
3087
319k
        char *q;
3088
319k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
3089
319k
        if (q == p) {
3090
174
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
3091
174
            return 0;
3092
174
        }
3093
318k
        if (overflow) {
3094
58
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
3095
58
            return 0;
3096
58
        }
3097
318k
        p = q;
3098
318k
        op = bam_cigar_table[(unsigned char)*p++];
3099
318k
        if (op < 0) {
3100
261
            hts_log_error("Unrecognized CIGAR operator");
3101
261
            return 0;
3102
261
        }
3103
318k
        a_cigar[i] = len;
3104
318k
        a_cigar[i] |= op;
3105
318k
    }
3106
3107
217k
    return p-in;
3108
217k
}
3109
3110
0
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
3111
0
    size_t n_cigar = 0;
3112
0
    int diff;
3113
3114
0
    if (!in || !a_cigar || !a_mem) {
3115
0
        hts_log_error("NULL pointer arguments");
3116
0
        return -1;
3117
0
    }
3118
0
    if (end) *end = (char *)in;
3119
3120
0
    if (*in == '*') {
3121
0
        if (end) (*end)++;
3122
0
        return 0;
3123
0
    }
3124
0
    n_cigar = read_ncigar(in);
3125
0
    if (!n_cigar) return 0;
3126
0
    if (n_cigar > *a_mem) {
3127
0
        uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar));
3128
0
        if (a_tmp) {
3129
0
            *a_cigar = a_tmp;
3130
0
            *a_mem = n_cigar;
3131
0
        } else {
3132
0
            hts_log_error("Memory allocation error");
3133
0
            return -1;
3134
0
        }
3135
0
    }
3136
3137
0
    if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1;
3138
0
    if (end) *end = (char *)in+diff;
3139
3140
0
    return n_cigar;
3141
0
}
3142
3143
217k
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
3144
217k
    size_t n_cigar = 0;
3145
217k
    int diff;
3146
3147
217k
    if (!in || !b) {
3148
0
        hts_log_error("NULL pointer arguments");
3149
0
        return -1;
3150
0
    }
3151
217k
    if (end) *end = (char *)in;
3152
3153
217k
    n_cigar = (*in == '*') ? 0 : read_ncigar(in);
3154
217k
    if (!n_cigar && b->core.n_cigar == 0) {
3155
107
        if (end) *end = (char *)in+1;
3156
107
        return 0;
3157
107
    }
3158
3159
217k
    ssize_t cig_diff = n_cigar - b->core.n_cigar;
3160
217k
    if (cig_diff > 0 &&
3161
217k
        possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) {
3162
0
        hts_log_error("Memory allocation error");
3163
0
        return -1;
3164
0
    }
3165
3166
217k
    uint32_t *cig = bam_get_cigar(b);
3167
217k
    if ((uint8_t *)cig != b->data + b->l_data) {
3168
        // Modifying an BAM existing BAM record
3169
0
        uint8_t  *seq = bam_get_seq(b);
3170
0
        memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq);
3171
0
    }
3172
3173
217k
    if (n_cigar) {
3174
217k
        if (!(diff = parse_cigar(in, cig, n_cigar)))
3175
493
            return -1;
3176
217k
    } else {
3177
0
        diff = 1; // handle "*"
3178
0
    }
3179
3180
217k
    b->l_data += cig_diff * sizeof(uint32_t);
3181
217k
    b->core.n_cigar = n_cigar;
3182
217k
    if (end) *end = (char *)in + diff;
3183
3184
217k
    return n_cigar;
3185
217k
}
3186
3187
/*
3188
 * -----------------------------------------------------------------------------
3189
 * SAM threading
3190
 */
3191
// Size of SAM text block (reading)
3192
0
#define SAM_NBYTES 240000
3193
3194
// Number of BAM records (writing, up to NB_mem in size)
3195
0
#define SAM_NBAM 1000
3196
3197
struct SAM_state;
3198
3199
// Output job - a block of BAM records
3200
typedef struct sp_bams {
3201
    struct sp_bams *next;
3202
    int serial;
3203
3204
    bam1_t *bams;
3205
    int nbams, abams; // used and alloc for bams[] array
3206
    size_t bam_mem;   // very approximate total size
3207
3208
    struct SAM_state *fd;
3209
} sp_bams;
3210
3211
// Input job - a block of SAM text
3212
typedef struct sp_lines {
3213
    struct sp_lines *next;
3214
    int serial;
3215
3216
    char *data;
3217
    int data_size;
3218
    int alloc;
3219
3220
    struct SAM_state *fd;
3221
    sp_bams *bams;
3222
} sp_lines;
3223
3224
enum sam_cmd {
3225
    SAM_NONE = 0,
3226
    SAM_CLOSE,
3227
    SAM_CLOSE_DONE,
3228
    SAM_AT_EOF,
3229
};
3230
3231
typedef struct SAM_state {
3232
    sam_hdr_t *h;
3233
3234
    hts_tpool *p;
3235
    int own_pool;
3236
    pthread_mutex_t lines_m;
3237
    hts_tpool_process *q;
3238
    pthread_t dispatcher;
3239
    int dispatcher_set;
3240
3241
    sp_lines *lines;
3242
    sp_bams *bams;
3243
3244
    sp_bams *curr_bam;
3245
    int curr_idx;
3246
    int serial;
3247
3248
    // Be warned: moving these mutexes around in this struct can reduce
3249
    // threading performance by up to 70%!
3250
    pthread_mutex_t command_m;
3251
    pthread_cond_t command_c;
3252
    enum sam_cmd command;
3253
3254
    // One of the E* errno codes
3255
    int errcode;
3256
3257
    htsFile *fp;
3258
} SAM_state;
3259
3260
// Returns a SAM_state struct from a generic hFILE.
3261
//
3262
// Returns NULL on failure.
3263
0
static SAM_state *sam_state_create(htsFile *fp) {
3264
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3265
    // be a redirect call with an additional 'S' mode.  This in turn would
3266
    // correctly set the designed format to sam instead of a generic
3267
    // text_format.
3268
0
    if (fp->format.format != sam && fp->format.format != text_format)
3269
0
        return NULL;
3270
3271
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3272
0
    if (!fd)
3273
0
        return NULL;
3274
3275
0
    fp->state = fd;
3276
0
    fd->fp = fp;
3277
3278
0
    return fd;
3279
0
}
3280
3281
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3282
static void *sam_format_worker(void *arg);
3283
3284
0
static void sam_state_err(SAM_state *fd, int errcode) {
3285
0
    pthread_mutex_lock(&fd->command_m);
3286
0
    if (!fd->errcode)
3287
0
        fd->errcode = errcode;
3288
0
    pthread_mutex_unlock(&fd->command_m);
3289
0
}
3290
3291
0
static void sam_free_sp_bams(sp_bams *b) {
3292
0
    if (!b)
3293
0
        return;
3294
3295
0
    if (b->bams) {
3296
0
        int i;
3297
0
        for (i = 0; i < b->abams; i++) {
3298
0
            if (b->bams[i].data)
3299
0
                free(b->bams[i].data);
3300
0
        }
3301
0
        free(b->bams);
3302
0
    }
3303
0
    free(b);
3304
0
}
3305
3306
// Destroys the state produce by sam_state_create.
3307
25.7k
int sam_state_destroy(htsFile *fp) {
3308
25.7k
    int ret = 0;
3309
3310
25.7k
    if (!fp->state)
3311
25.7k
        return 0;
3312
3313
0
    SAM_state *fd = fp->state;
3314
0
    if (fd->p) {
3315
0
        if (fd->h) {
3316
            // Notify sam_dispatcher we're closing
3317
0
            pthread_mutex_lock(&fd->command_m);
3318
0
            if (fd->command != SAM_CLOSE_DONE)
3319
0
                fd->command = SAM_CLOSE;
3320
0
            pthread_cond_signal(&fd->command_c);
3321
0
            ret = -fd->errcode;
3322
0
            if (fd->q)
3323
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3324
3325
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3326
0
                for (;;) {
3327
                    // Avoid deadlocks with dispatcher
3328
0
                    if (fd->command == SAM_CLOSE_DONE)
3329
0
                        break;
3330
0
                    hts_tpool_wake_dispatch(fd->q);
3331
0
                    pthread_mutex_unlock(&fd->command_m);
3332
0
                    hts_usleep(10000);
3333
0
                    pthread_mutex_lock(&fd->command_m);
3334
0
                }
3335
0
            }
3336
0
            pthread_mutex_unlock(&fd->command_m);
3337
3338
0
            if (fp->is_write) {
3339
                // Dispatch the last partial block.
3340
0
                sp_bams *gb = fd->curr_bam;
3341
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3342
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3343
3344
                // Flush and drain output
3345
0
                if (fd->q)
3346
0
                    hts_tpool_process_flush(fd->q);
3347
0
                pthread_mutex_lock(&fd->command_m);
3348
0
                if (!ret) ret = -fd->errcode;
3349
0
                pthread_mutex_unlock(&fd->command_m);
3350
3351
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3352
0
                    hts_usleep(10000);
3353
0
                    pthread_mutex_lock(&fd->command_m);
3354
0
                    ret = -fd->errcode;
3355
                    // not empty but shutdown implies error
3356
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3357
0
                        ret = EIO;
3358
0
                    pthread_mutex_unlock(&fd->command_m);
3359
0
                }
3360
0
                if (fd->q)
3361
0
                    hts_tpool_process_shutdown(fd->q);
3362
0
            }
3363
3364
            // Wait for it to acknowledge
3365
0
            if (fd->dispatcher_set)
3366
0
                pthread_join(fd->dispatcher, NULL);
3367
0
            if (!ret) ret = -fd->errcode;
3368
0
        }
3369
3370
        // Tidy up memory
3371
0
        if (fd->q)
3372
0
            hts_tpool_process_destroy(fd->q);
3373
3374
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3375
0
            hts_tpool_destroy(fd->p);
3376
0
            fd->p = NULL;
3377
0
        }
3378
0
        pthread_mutex_destroy(&fd->lines_m);
3379
0
        pthread_mutex_destroy(&fd->command_m);
3380
0
        pthread_cond_destroy(&fd->command_c);
3381
3382
0
        sp_lines *l = fd->lines;
3383
0
        while (l) {
3384
0
            sp_lines *n = l->next;
3385
0
            free(l->data);
3386
0
            free(l);
3387
0
            l = n;
3388
0
        }
3389
3390
0
        sp_bams *b = fd->bams;
3391
0
        while (b) {
3392
0
            if (fd->curr_bam == b)
3393
0
                fd->curr_bam = NULL;
3394
0
            sp_bams *n = b->next;
3395
0
            sam_free_sp_bams(b);
3396
0
            b = n;
3397
0
        }
3398
3399
0
        if (fd->curr_bam)
3400
0
            sam_free_sp_bams(fd->curr_bam);
3401
3402
        // Decrement counter by one, maybe destroying too.
3403
        // This is to permit the caller using bam_hdr_destroy
3404
        // before sam_close without triggering decode errors
3405
        // in the background threads.
3406
0
        bam_hdr_destroy(fd->h);
3407
0
    }
3408
3409
0
    free(fp->state);
3410
0
    fp->state = NULL;
3411
0
    return ret;
3412
25.7k
}
3413
3414
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3415
0
static void cleanup_sp_lines(void *arg) {
3416
0
    sp_lines *gl = (sp_lines *)arg;
3417
0
    if (!gl) return;
3418
3419
    // Should always be true for lines passed to / from thread workers.
3420
0
    assert(gl->next == NULL);
3421
3422
0
    free(gl->data);
3423
0
    sam_free_sp_bams(gl->bams);
3424
0
    free(gl);
3425
0
}
3426
3427
// Run from one of the worker threads.
3428
// Convert a passed in array of lines to array of BAMs, returning
3429
// the result back to the thread queue.
3430
0
static void *sam_parse_worker(void *arg) {
3431
0
    sp_lines *gl = (sp_lines *)arg;
3432
0
    sp_bams *gb = NULL;
3433
0
    char *lines = gl->data;
3434
0
    int i;
3435
0
    bam1_t *b;
3436
0
    SAM_state *fd = gl->fd;
3437
3438
    // Use a block of BAM structs we had earlier if available.
3439
0
    pthread_mutex_lock(&fd->lines_m);
3440
0
    if (fd->bams) {
3441
0
        gb = fd->bams;
3442
0
        fd->bams = gb->next;
3443
0
    }
3444
0
    pthread_mutex_unlock(&fd->lines_m);
3445
3446
0
    if (gb == NULL) {
3447
0
        gb = calloc(1, sizeof(*gb));
3448
0
        if (!gb) {
3449
0
            return NULL;
3450
0
        }
3451
0
        gb->abams = 100;
3452
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3453
0
        if (!gb->bams) {
3454
0
            sam_state_err(fd, ENOMEM);
3455
0
            goto err;
3456
0
        }
3457
0
        gb->nbams = 0;
3458
0
        gb->bam_mem = 0;
3459
0
    }
3460
0
    gb->serial = gl->serial;
3461
0
    gb->next = NULL;
3462
3463
0
    b = (bam1_t *)gb->bams;
3464
0
    if (!b) {
3465
0
        sam_state_err(fd, ENOMEM);
3466
0
        goto err;
3467
0
    }
3468
3469
0
    i = 0;
3470
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3471
0
    while (cp < cp_end) {
3472
0
        if (i >= gb->abams) {
3473
0
            int old_abams = gb->abams;
3474
0
            gb->abams *= 2;
3475
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3476
0
            if (!b) {
3477
0
                gb->abams /= 2;
3478
0
                sam_state_err(fd, ENOMEM);
3479
0
                goto err;
3480
0
            }
3481
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3482
0
            gb->bams = b;
3483
0
        }
3484
3485
        // Ideally we'd get sam_parse1 to return the number of
3486
        // bytes decoded and to be able to stop on newline as
3487
        // well as \0.
3488
        //
3489
        // We can then avoid the additional strchr loop.
3490
        // It's around 6% of our CPU cost, albeit threadable.
3491
        //
3492
        // However this is an API change so for now we copy.
3493
3494
0
        char *nl = strchr(cp, '\n');
3495
0
        char *line_end;
3496
0
        if (nl) {
3497
0
            line_end = nl;
3498
0
            if (line_end > cp && *(line_end - 1) == '\r')
3499
0
                line_end--;
3500
0
            nl++;
3501
0
        } else {
3502
0
            nl = line_end = cp_end;
3503
0
        }
3504
0
        *line_end = '\0';
3505
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3506
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3507
0
            sam_state_err(fd, errno ? errno : EIO);
3508
0
            cleanup_sp_lines(gl);
3509
0
            goto err;
3510
0
        }
3511
3512
0
        cp = nl;
3513
0
        i++;
3514
0
    }
3515
0
    gb->nbams = i;
3516
3517
0
    pthread_mutex_lock(&fd->lines_m);
3518
0
    gl->next = fd->lines;
3519
0
    fd->lines = gl;
3520
0
    pthread_mutex_unlock(&fd->lines_m);
3521
0
    return gb;
3522
3523
0
 err:
3524
0
    sam_free_sp_bams(gb);
3525
0
    return NULL;
3526
0
}
3527
3528
0
static void *sam_parse_eof(void *arg) {
3529
0
    return NULL;
3530
0
}
3531
3532
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3533
0
static void cleanup_sp_bams(void *arg) {
3534
0
    sam_free_sp_bams((sp_bams *) arg);
3535
0
}
3536
3537
// Runs in its own thread.
3538
// Reads a block of text (SAM) and sends a new job to the thread queue to
3539
// translate this to BAM.
3540
0
static void *sam_dispatcher_read(void *vp) {
3541
0
    htsFile *fp = vp;
3542
0
    kstring_t line = {0};
3543
0
    int line_frag = 0;
3544
0
    SAM_state *fd = fp->state;
3545
0
    sp_lines *l = NULL;
3546
3547
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3548
    // matter as it will grow if necessary).
3549
0
    if (ks_resize(&line, 1000) < 0)
3550
0
        goto err;
3551
3552
0
    for (;;) {
3553
        // Check for command
3554
0
        pthread_mutex_lock(&fd->command_m);
3555
0
        switch (fd->command) {
3556
3557
0
        case SAM_CLOSE:
3558
0
            pthread_cond_signal(&fd->command_c);
3559
0
            pthread_mutex_unlock(&fd->command_m);
3560
0
            hts_tpool_process_shutdown(fd->q);
3561
0
            goto tidyup;
3562
3563
0
        default:
3564
0
            break;
3565
0
        }
3566
0
        pthread_mutex_unlock(&fd->command_m);
3567
3568
0
        pthread_mutex_lock(&fd->lines_m);
3569
0
        if (fd->lines) {
3570
            // reuse existing line buffer
3571
0
            l = fd->lines;
3572
0
            fd->lines = l->next;
3573
0
        }
3574
0
        pthread_mutex_unlock(&fd->lines_m);
3575
3576
0
        if (l == NULL) {
3577
            // none to reuse, to create a new one
3578
0
            l = calloc(1, sizeof(*l));
3579
0
            if (!l)
3580
0
                goto err;
3581
0
            l->alloc = SAM_NBYTES;
3582
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3583
0
            if (!l->data) {
3584
0
                free(l);
3585
0
                l = NULL;
3586
0
                goto err;
3587
0
            }
3588
0
            l->fd = fd;
3589
0
        }
3590
0
        l->next = NULL;
3591
3592
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3593
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3594
0
            if (!rp)
3595
0
                goto err;
3596
0
            l->alloc = line_frag+SAM_NBYTES/2;
3597
0
            l->data = rp;
3598
0
        }
3599
0
        memcpy(l->data, line.s, line_frag);
3600
3601
0
        l->data_size = line_frag;
3602
0
        ssize_t nbytes;
3603
0
    longer_line:
3604
0
        if (fp->is_bgzf)
3605
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3606
0
        else
3607
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3608
0
        if (nbytes < 0) {
3609
0
            sam_state_err(fd, errno ? errno : EIO);
3610
0
            goto err;
3611
0
        } else if (nbytes == 0)
3612
0
            break; // EOF
3613
0
        l->data_size += nbytes;
3614
3615
        // trim to last \n. Maybe \r\n, but that's still fine
3616
0
        if (nbytes == l->alloc - line_frag) {
3617
0
            char *cp_end = l->data + l->data_size;
3618
0
            char *cp = cp_end-1;
3619
3620
0
            while (cp > (char *)l->data && *cp != '\n')
3621
0
                cp--;
3622
3623
            // entire buffer is part of a single line
3624
0
            if (cp == l->data) {
3625
0
                line_frag = l->data_size;
3626
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3627
0
                if (!rp)
3628
0
                    goto err;
3629
0
                l->alloc *= 2;
3630
0
                l->data = rp;
3631
0
                assert(l->alloc >= l->data_size);
3632
0
                assert(l->alloc >= line_frag);
3633
0
                assert(l->alloc >= l->alloc - line_frag);
3634
0
                goto longer_line;
3635
0
            }
3636
0
            cp++;
3637
3638
            // line holds the remainder of our line.
3639
0
            if (ks_resize(&line, cp_end - cp) < 0)
3640
0
                goto err;
3641
0
            memcpy(line.s, cp, cp_end - cp);
3642
0
            line_frag = cp_end - cp;
3643
0
            l->data_size = l->alloc - line_frag;
3644
0
        } else {
3645
            // out of buffer
3646
0
            line_frag = 0;
3647
0
        }
3648
3649
0
        l->serial = fd->serial++;
3650
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3651
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3652
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3653
0
            goto err;
3654
0
        pthread_mutex_lock(&fd->command_m);
3655
0
        if (fd->command == SAM_CLOSE) {
3656
0
            pthread_mutex_unlock(&fd->command_m);
3657
0
            l = NULL;
3658
0
            goto tidyup;
3659
0
        }
3660
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3661
0
        pthread_mutex_unlock(&fd->command_m);
3662
0
    }
3663
3664
    // Submit a NULL sp_bams entry to act as an EOF marker
3665
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3666
0
        goto err;
3667
3668
    // At EOF, wait for close request.
3669
    // (In future if we add support for seek, this is where we need to catch it.)
3670
0
    for (;;) {
3671
0
        pthread_mutex_lock(&fd->command_m);
3672
0
        if (fd->command == SAM_NONE)
3673
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3674
0
        switch (fd->command) {
3675
0
        case SAM_CLOSE:
3676
0
            pthread_cond_signal(&fd->command_c);
3677
0
            pthread_mutex_unlock(&fd->command_m);
3678
0
            hts_tpool_process_shutdown(fd->q);
3679
0
            goto tidyup;
3680
3681
0
        default:
3682
0
            pthread_mutex_unlock(&fd->command_m);
3683
0
            break;
3684
0
        }
3685
0
    }
3686
3687
0
 tidyup:
3688
0
    pthread_mutex_lock(&fd->command_m);
3689
0
    fd->command = SAM_CLOSE_DONE;
3690
0
    pthread_cond_signal(&fd->command_c);
3691
0
    pthread_mutex_unlock(&fd->command_m);
3692
3693
0
    if (l) {
3694
0
        pthread_mutex_lock(&fd->lines_m);
3695
0
        l->next = fd->lines;
3696
0
        fd->lines = l;
3697
0
        pthread_mutex_unlock(&fd->lines_m);
3698
0
    }
3699
0
    free(line.s);
3700
3701
0
    return NULL;
3702
3703
0
 err:
3704
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3705
0
    hts_tpool_process_shutdown(fd->q);
3706
0
    goto tidyup;
3707
0
}
3708
3709
// Runs in its own thread.
3710
// Takes encoded blocks of SAM off the thread results queue and writes them
3711
// to our output stream.
3712
0
static void *sam_dispatcher_write(void *vp) {
3713
0
    htsFile *fp = vp;
3714
0
    SAM_state *fd = fp->state;
3715
0
    hts_tpool_result *r;
3716
3717
    // Iterates until result queue is shutdown, where it returns NULL.
3718
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3719
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3720
0
        if (!gl) {
3721
0
            sam_state_err(fd, ENOMEM);
3722
0
            goto err;
3723
0
        }
3724
3725
0
        if (fp->idx) {
3726
0
            sp_bams *gb = gl->bams;
3727
0
            int i = 0, count = 0;
3728
0
            while (i < gl->data_size) {
3729
0
                int j = i;
3730
0
                while (i < gl->data_size && gl->data[i] != '\n')
3731
0
                    i++;
3732
0
                if (i < gl->data_size)
3733
0
                    i++;
3734
3735
0
                if (fp->is_bgzf) {
3736
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3737
0
                        goto err;
3738
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3739
0
                        goto err;
3740
0
                } else {
3741
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3742
0
                        goto err;
3743
0
                }
3744
3745
0
                bam1_t *b = &gb->bams[count++];
3746
0
                if (fp->format.compression == bgzf) {
3747
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3748
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3749
0
                                      bgzf_tell(fp->fp.bgzf),
3750
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3751
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3752
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3753
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3754
0
                        goto err;
3755
0
                    }
3756
0
                } else {
3757
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3758
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3759
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3760
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3761
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3762
0
                        goto err;
3763
0
                    }
3764
0
                }
3765
0
            }
3766
3767
0
            assert(count == gb->nbams);
3768
3769
            // Add bam array to free-list
3770
0
            pthread_mutex_lock(&fd->lines_m);
3771
0
            gb->next = fd->bams;
3772
0
            fd->bams = gl->bams;
3773
0
            gl->bams = NULL;
3774
0
            pthread_mutex_unlock(&fd->lines_m);
3775
0
        } else {
3776
0
            if (fp->is_bgzf) {
3777
                // We keep track of how much in the current block we have
3778
                // remaining => R.  We look for the last newline in input
3779
                // [i] to [i+R], backwards => position N.
3780
                //
3781
                // If we find a newline, we write out bytes i to N.
3782
                // We know we cannot fit the next record in this bgzf block,
3783
                // so we flush what we have and copy input N to i+R into
3784
                // the start of a new block, and recompute a new R for that.
3785
                //
3786
                // If we don't find a newline (i==N) then we cannot extend
3787
                // the current block at all, so flush whatever is in it now
3788
                // if it ends on a newline.
3789
                // We still copy i(==N) to i+R to the next block and
3790
                // continue as before with a new R.
3791
                //
3792
                // The only exception on the flush is when we run out of
3793
                // data in the input.  In that case we skip it as we don't
3794
                // yet know if the next record will fit.
3795
                //
3796
                // Both conditions share the same code here:
3797
                // - Look for newline (pos N)
3798
                // - Write i to N (which maybe 0)
3799
                // - Flush if block ends on newline and not end of input
3800
                // - write N to i+R
3801
3802
0
                int i = 0;
3803
0
                BGZF *fb = fp->fp.bgzf;
3804
0
                while (i < gl->data_size) {
3805
                    // remaining space in block
3806
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3807
0
                    int eod = 0;
3808
0
                    if (R > gl->data_size-i)
3809
0
                        R = gl->data_size-i, eod = 1;
3810
3811
                    // Find last newline in input data
3812
0
                    int N = i + R;
3813
0
                    while (--N > i) {
3814
0
                        if (gl->data[N] == '\n')
3815
0
                            break;
3816
0
                    }
3817
3818
0
                    if (N != i) {
3819
                        // Found a newline
3820
0
                        N++;
3821
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3822
0
                            goto err;
3823
0
                    }
3824
3825
                    // Flush bgzf block
3826
0
                    int b_off = fb->block_offset;
3827
0
                    if (!eod && b_off &&
3828
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3829
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3830
0
                            goto err;
3831
3832
                    // Copy from N onwards into next block
3833
0
                    if (i+R > N)
3834
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3835
0
                            != i+R - N)
3836
0
                            goto err;
3837
3838
0
                    i = i+R;
3839
0
                }
3840
0
            } else {
3841
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3842
0
                    goto err;
3843
0
            }
3844
0
        }
3845
3846
0
        hts_tpool_delete_result(r, 0);
3847
3848
        // Also updated by main thread
3849
0
        pthread_mutex_lock(&fd->lines_m);
3850
0
        gl->next = fd->lines;
3851
0
        fd->lines = gl;
3852
0
        pthread_mutex_unlock(&fd->lines_m);
3853
0
    }
3854
3855
0
    sam_state_err(fd, 0); // success
3856
0
    hts_tpool_process_shutdown(fd->q);
3857
0
    return NULL;
3858
3859
0
 err:
3860
0
    sam_state_err(fd, errno ? errno : EIO);
3861
0
    return (void *)-1;
3862
0
}
3863
3864
// Run from one of the worker threads.
3865
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3866
// of text SAM records (sp_lines).
3867
0
static void *sam_format_worker(void *arg) {
3868
0
    sp_bams *gb = (sp_bams *)arg;
3869
0
    sp_lines *gl = NULL;
3870
0
    int i;
3871
0
    SAM_state *fd = gb->fd;
3872
0
    htsFile *fp = fd->fp;
3873
3874
    // Use a block of SAM strings we had earlier if available.
3875
0
    pthread_mutex_lock(&fd->lines_m);
3876
0
    if (fd->lines) {
3877
0
        gl = fd->lines;
3878
0
        fd->lines = gl->next;
3879
0
    }
3880
0
    pthread_mutex_unlock(&fd->lines_m);
3881
3882
0
    if (gl == NULL) {
3883
0
        gl = calloc(1, sizeof(*gl));
3884
0
        if (!gl) {
3885
0
            sam_state_err(fd, ENOMEM);
3886
0
            return NULL;
3887
0
        }
3888
0
        gl->alloc = gl->data_size = 0;
3889
0
        gl->data = NULL;
3890
0
    }
3891
0
    gl->serial = gb->serial;
3892
0
    gl->next = NULL;
3893
3894
0
    kstring_t ks = {0, gl->alloc, gl->data};
3895
3896
0
    for (i = 0; i < gb->nbams; i++) {
3897
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3898
0
            sam_state_err(fd, errno ? errno : EIO);
3899
0
            goto err;
3900
0
        }
3901
0
        kputc('\n', &ks);
3902
0
    }
3903
3904
0
    pthread_mutex_lock(&fd->lines_m);
3905
0
    gl->data_size = ks.l;
3906
0
    gl->alloc = ks.m;
3907
0
    gl->data = ks.s;
3908
3909
0
    if (fp->idx) {
3910
        // Keep hold of the bam array a little longer as
3911
        // sam_dispatcher_write needs to use them for building the index.
3912
0
        gl->bams = gb;
3913
0
    } else {
3914
        // Add bam array to free-list
3915
0
        gb->next = fd->bams;
3916
0
        fd->bams = gb;
3917
0
    }
3918
0
    pthread_mutex_unlock(&fd->lines_m);
3919
3920
0
    return gl;
3921
3922
0
 err:
3923
    // Possible race between this and fd->curr_bam.
3924
    // Easier to not free and leave it on the input list so it
3925
    // gets freed there instead?
3926
    // sam_free_sp_bams(gb);
3927
0
    if (gl) {
3928
0
        free(gl->data);
3929
0
        free(gl);
3930
0
    }
3931
0
    return NULL;
3932
0
}
3933
3934
0
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
3935
0
    if (fp->state)
3936
0
        return 0;
3937
3938
0
    if (!(fp->state = sam_state_create(fp)))
3939
0
        return -1;
3940
0
    SAM_state *fd = (SAM_state *)fp->state;
3941
3942
0
    pthread_mutex_init(&fd->lines_m, NULL);
3943
0
    pthread_mutex_init(&fd->command_m, NULL);
3944
0
    pthread_cond_init(&fd->command_c, NULL);
3945
0
    fd->p = p->pool;
3946
0
    int qsize = p->qsize;
3947
0
    if (!qsize)
3948
0
        qsize = 2*hts_tpool_size(fd->p);
3949
0
    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
3950
0
    if (!fd->q) {
3951
0
        sam_state_destroy(fp);
3952
0
        return -1;
3953
0
    }
3954
3955
0
    if (fp->format.compression == bgzf)
3956
0
        return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
3957
3958
0
    return 0;
3959
0
}
3960
3961
0
int sam_set_threads(htsFile *fp, int nthreads) {
3962
0
    if (nthreads <= 0)
3963
0
        return 0;
3964
3965
0
    htsThreadPool p;
3966
0
    p.pool = hts_tpool_init(nthreads);
3967
0
    p.qsize = nthreads*2;
3968
3969
0
    int ret = sam_set_thread_pool(fp, &p);
3970
0
    if (ret < 0)
3971
0
        return ret;
3972
3973
0
    SAM_state *fd = (SAM_state *)fp->state;
3974
0
    fd->own_pool = 1;
3975
3976
0
    return 0;
3977
0
}
3978
3979
typedef struct {
3980
    kstring_t name;
3981
    kstring_t comment; // NB: pointer into name, do not free
3982
    kstring_t seq;
3983
    kstring_t qual;
3984
    int casava;
3985
    int aux;
3986
    int rnum;
3987
    char BC[3];         // aux tag ID for barcode
3988
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3989
    char nprefix;
3990
    int sra_names;
3991
} fastq_state;
3992
3993
// Initialise fastq state.
3994
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3995
7.27k
static fastq_state *fastq_state_init(int name_char) {
3996
7.27k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3997
7.27k
    if (!x)
3998
0
        return NULL;
3999
7.27k
    strcpy(x->BC, "BC");
4000
7.27k
    x->nprefix = name_char;
4001
4002
7.27k
    return x;
4003
7.27k
}
4004
4005
9.69k
void fastq_state_destroy(htsFile *fp) {
4006
9.69k
    if (fp->state) {
4007
7.27k
        fastq_state *x = (fastq_state *)fp->state;
4008
7.27k
        if (x->tags)
4009
7.27k
            kh_destroy(tag, x->tags);
4010
7.27k
        ks_free(&x->name);
4011
7.27k
        ks_free(&x->seq);
4012
7.27k
        ks_free(&x->qual);
4013
7.27k
        free(fp->state);
4014
7.27k
    }
4015
9.69k
}
4016
4017
0
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
4018
0
    va_list args;
4019
4020
0
    if (!fp)
4021
0
        return -1;
4022
0
    if (!fp->state)
4023
0
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
4024
0
                                           ? '@' : '>')))
4025
0
            return -1;
4026
4027
0
    fastq_state *x = (fastq_state *)fp->state;
4028
4029
0
    switch (opt) {
4030
0
    case FASTQ_OPT_CASAVA:
4031
0
        x->casava = 1;
4032
0
        break;
4033
4034
0
    case FASTQ_OPT_NAME2:
4035
0
        x->sra_names = 1;
4036
0
        break;
4037
4038
0
    case FASTQ_OPT_AUX: {
4039
0
        va_start(args, opt);
4040
0
        x->aux = 1;
4041
0
        char *tag = va_arg(args, char *);
4042
0
        va_end(args);
4043
0
        if (tag && strcmp(tag, "1") != 0) {
4044
0
            if (!x->tags)
4045
0
                if (!(x->tags = kh_init(tag)))
4046
0
                    return -1;
4047
4048
0
            size_t i, tlen = strlen(tag);
4049
0
            for (i = 0; i+3 <= tlen+1; i += 3) {
4050
0
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
4051
0
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
4052
0
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
4053
0
                    break;
4054
0
                }
4055
0
                int ret, tcode = tag[i+0]*256 + tag[i+1];
4056
0
                kh_put(tag, x->tags, tcode, &ret);
4057
0
                if (ret < 0)
4058
0
                    return -1;
4059
0
            }
4060
0
        }
4061
0
        break;
4062
0
    }
4063
4064
0
    case FASTQ_OPT_BARCODE: {
4065
0
        va_start(args, opt);
4066
0
        char *bc = va_arg(args, char *);
4067
0
        va_end(args);
4068
0
        strncpy(x->BC, bc, 2);
4069
0
        x->BC[2] = 0;
4070
0
        break;
4071
0
    }
4072
4073
0
    case FASTQ_OPT_RNUM:
4074
0
        x->rnum = 1;
4075
0
        break;
4076
4077
0
    default:
4078
0
        break;
4079
0
    }
4080
0
    return 0;
4081
0
}
4082
4083
4.32M
static int fastq_parse1(htsFile *fp, bam1_t *b) {
4084
4.32M
    fastq_state *x = (fastq_state *)fp->state;
4085
4.32M
    size_t i, l;
4086
4.32M
    int ret = 0;
4087
4088
4.32M
    if (fp->format.format == fasta_format && fp->line.s) {
4089
        // For FASTA we've already read the >name line; steal it
4090
        // Not the most efficient, but we don't optimise for fasta reading.
4091
4.32M
        if (fp->line.l == 0)
4092
6.18k
            return -1; // EOF
4093
4094
4.31M
        free(x->name.s);
4095
4.31M
        x->name = fp->line;
4096
4.31M
        fp->line.l = fp->line.m = 0;
4097
4.31M
        fp->line.s = NULL;
4098
4.31M
    } else {
4099
        // Read a FASTQ format entry.
4100
8.01k
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
4101
8.01k
        if (ret == -1)
4102
120
            return -1;  // EOF
4103
7.89k
        else if (ret < -1)
4104
84
            return ret; // ERR
4105
8.01k
    }
4106
4107
    // Name
4108
4.32M
    if (*x->name.s != x->nprefix)
4109
102
        return -2;
4110
4111
    // Reverse the SRA strangeness of putting the run_name.number before
4112
    // the read name.
4113
4.32M
    i = 0;
4114
4.32M
    char *name = x->name.s+1;
4115
4.32M
    if (x->sra_names) {
4116
0
        char *cp = strpbrk(x->name.s, " \t");
4117
0
        if (cp) {
4118
0
            while (*cp == ' ' || *cp == '\t')
4119
0
                cp++;
4120
0
            *--cp = '@';
4121
0
            i = cp - x->name.s;
4122
0
            name = cp+1;
4123
0
        }
4124
0
    }
4125
4126
4.32M
    l = x->name.l;
4127
4.32M
    char *s = x->name.s;
4128
36.1M
    while (i < l && !isspace_c(s[i]))
4129
31.8M
        i++;
4130
4.32M
    if (i < l) {
4131
70.9k
        s[i] = 0;
4132
70.9k
        x->name.l = i++;
4133
70.9k
    }
4134
4135
    // Comment; a kstring struct, but pointer into name line.  (Do not free)
4136
4.51M
    while (i < l && isspace_c(s[i]))
4137
192k
        i++;
4138
4.32M
    x->comment.s = s+i;
4139
4.32M
    x->comment.l = l - i;
4140
4141
    // Seq
4142
4.32M
    x->seq.l = 0;
4143
28.8M
    for (;;) {
4144
28.8M
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
4145
6.85k
            if (fp->format.format == fastq_format || ret < -1)
4146
627
                return -2;
4147
28.8M
        if (ret == -1 ||
4148
28.8M
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
4149
4.32M
            break;
4150
24.5M
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
4151
0
            return -2;
4152
24.5M
    }
4153
4154
    // Qual
4155
4.32M
    if (fp->format.format == fastq_format) {
4156
798
        size_t remainder = x->seq.l;
4157
798
        x->qual.l = 0;
4158
45.6k
        do {
4159
45.6k
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
4160
30
                return -2;
4161
45.6k
            if (fp->line.l > remainder)
4162
45
                return -2;
4163
45.6k
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
4164
0
                return -2;
4165
45.6k
            remainder -= fp->line.l;
4166
45.6k
        } while (remainder > 0);
4167
4168
        // Decr qual
4169
530k
        for (i = 0; i < x->qual.l; i++)
4170
530k
            x->qual.s[i] -= '!';
4171
723
    }
4172
4173
4.32M
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
4174
4.32M
    if (x->name.l > 2 &&
4175
4.32M
        x->name.s[x->name.l-2] == '/' &&
4176
4.32M
        isdigit_c(x->name.s[x->name.l-1])) {
4177
105k
        switch(x->name.s[x->name.l-1]) {
4178
6.44k
        case '1': flag |= BAM_FREAD1 | pflag; break;
4179
2.52k
        case '2': flag |= BAM_FREAD2 | pflag; break;
4180
96.0k
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4181
105k
        }
4182
105k
        x->name.s[x->name.l-=2] = 0;
4183
105k
    }
4184
4185
    // Convert to BAM
4186
4.32M
    ret = bam_set1(b,
4187
4.32M
                   x->name.s + x->name.l - name, name,
4188
4.32M
                   flag,
4189
4.32M
                   -1, -1, 0, // ref '*', pos, mapq,
4190
4.32M
                   0, NULL,     // no cigar,
4191
4.32M
                   -1, -1, 0,    // mate
4192
4.32M
                   x->seq.l, x->seq.s, x->qual.s,
4193
4.32M
                   0);
4194
4.32M
    if (ret < 0) return -2;
4195
4196
    // Identify Illumina CASAVA strings.
4197
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
4198
4.32M
    char *barcode = NULL;
4199
4.32M
    int barcode_len = 0;
4200
4.32M
    kstring_t *kc = &x->comment;
4201
4.32M
    char *endptr;
4202
4.32M
    if (x->casava &&
4203
        // \d:[YN]:\d+:[ACGTN]+
4204
4.32M
        kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) &&
4205
4.32M
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
4206
4.32M
        && *endptr == ':') {
4207
4208
        // read num
4209
0
        switch(kc->s[0]) {
4210
0
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
4211
0
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
4212
0
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
4213
0
        }
4214
4215
0
        if (kc->s[2] == 'Y')
4216
0
            b->core.flag |= BAM_FQCFAIL;
4217
4218
        // Barcode, maybe numeric in which case we skip it
4219
0
        if (!isdigit_c(endptr[1])) {
4220
0
            barcode = endptr+1;
4221
0
            for (i = barcode - kc->s; i < kc->l; i++)
4222
0
                if (isspace_c(kc->s[i]))
4223
0
                    break;
4224
4225
0
            kc->s[i] = 0;
4226
0
            barcode_len = i+1-(barcode - kc->s);
4227
0
        }
4228
0
    }
4229
4230
4.32M
    if (ret >= 0 && barcode_len)
4231
0
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
4232
0
            ret = -2;
4233
4234
4.32M
    if (!x->aux)
4235
4.32M
        return ret;
4236
4237
    // Identify any SAM style aux tags in comments too.
4238
0
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
4239
0
        ret = -2;
4240
4241
0
    return ret;
4242
4.32M
}
4243
4244
// Internal component of sam_read1 below
4245
3.47k
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4246
3.47k
    int ret = bam_read1(fp->fp.bgzf, b);
4247
3.47k
    if (h && ret >= 0) {
4248
1.91k
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4249
1.91k
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4250
228
            errno = ERANGE;
4251
228
            return -3;
4252
228
        }
4253
1.91k
    }
4254
3.24k
    return ret;
4255
3.47k
}
4256
4257
// Internal component of sam_read1 below
4258
5.33k
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4259
5.33k
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4260
5.33k
    if (ret < 0)
4261
5.33k
        return cram_eof(fp->fp.cram) ? -1 : -2;
4262
4263
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4264
0
        return -2;
4265
4266
0
    return ret;
4267
0
}
4268
4269
// Internal component of sam_read1 below
4270
242k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4271
242k
    int ret;
4272
4273
    // Consume 1st line after header parsing as it wasn't using peek
4274
242k
    if (fp->line.l != 0) {
4275
63
        ret = sam_parse1(&fp->line, h, b);
4276
63
        fp->line.l = 0;
4277
63
        return ret;
4278
63
    }
4279
4280
242k
    if (fp->state) {
4281
0
        SAM_state *fd = (SAM_state *)fp->state;
4282
4283
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4284
            // We don't support multi-threaded SAM parsing with seeks yet.
4285
0
            int ret;
4286
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4287
0
                errno = -ret;
4288
0
                return -2;
4289
0
            }
4290
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4291
0
                return -2;
4292
0
            fp->fp.bgzf->seeked = 0;
4293
0
            goto err_recover;
4294
0
        }
4295
4296
0
        if (!fd->h) {
4297
0
            fd->h = h;
4298
0
            fd->h->ref_count++;
4299
            // Ensure hrecs is initialised now as we don't want multiple
4300
            // threads trying to do this simultaneously.
4301
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4302
0
                return -2;
4303
4304
            // We can only do this once we've got a header
4305
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4306
0
                               fp) != 0)
4307
0
                return -2;
4308
0
            fd->dispatcher_set = 1;
4309
0
        }
4310
4311
0
        if (fd->h != h) {
4312
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4313
0
            return -2;
4314
0
        }
4315
4316
0
        sp_bams *gb = fd->curr_bam;
4317
0
        if (!gb) {
4318
0
            if (fd->errcode) {
4319
                // In case reader failed
4320
0
                errno = fd->errcode;
4321
0
                return -2;
4322
0
            }
4323
4324
0
            pthread_mutex_lock(&fd->command_m);
4325
0
            int cmd = fd->command;
4326
0
            pthread_mutex_unlock(&fd->command_m);
4327
0
            if (cmd == SAM_AT_EOF)
4328
0
                return -1;
4329
4330
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4331
0
            if (!r)
4332
0
                return -2;
4333
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4334
0
            hts_tpool_delete_result(r, 0);
4335
0
        }
4336
0
        if (!gb) {
4337
0
            pthread_mutex_lock(&fd->command_m);
4338
0
            fd->command = SAM_AT_EOF;
4339
0
            pthread_mutex_unlock(&fd->command_m);
4340
0
            return fd->errcode ? -2 : -1;
4341
0
        }
4342
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4343
0
        if (fd->curr_idx < gb->nbams)
4344
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4345
0
                return -2;
4346
0
        if (fd->curr_idx == gb->nbams) {
4347
0
            pthread_mutex_lock(&fd->lines_m);
4348
0
            gb->next = fd->bams;
4349
0
            fd->bams = gb;
4350
0
            pthread_mutex_unlock(&fd->lines_m);
4351
4352
0
            fd->curr_bam = NULL;
4353
0
            fd->curr_idx = 0;
4354
        // Consider prefetching next record?  I.e.
4355
        // } else {
4356
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4357
0
        }
4358
4359
0
        ret = 0;
4360
4361
242k
    } else  {
4362
242k
    err_recover:
4363
242k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4364
242k
        if (ret < 0) return ret;
4365
4366
236k
        ret = sam_parse1(&fp->line, h, b);
4367
236k
        fp->line.l = 0;
4368
236k
        if (ret < 0) {
4369
4.49k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4370
4.49k
            if (h && h->ignore_sam_err) goto err_recover;
4371
4.49k
        }
4372
236k
    }
4373
4374
236k
    return ret;
4375
242k
}
4376
4377
// Returns 0 on success,
4378
//        -1 on EOF,
4379
//       <-1 on error
4380
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4381
4.58M
{
4382
4.58M
    int ret, pass_filter;
4383
4384
4.58M
    do {
4385
4.58M
        switch (fp->format.format) {
4386
3.47k
        case bam:
4387
3.47k
            ret = sam_read1_bam(fp, h, b);
4388
3.47k
            break;
4389
4390
5.33k
        case cram:
4391
5.33k
            ret = sam_read1_cram(fp, h, &b);
4392
5.33k
            break;
4393
4394
242k
        case sam:
4395
242k
            ret = sam_read1_sam(fp, h, b);
4396
242k
            break;
4397
4398
4.32M
        case fasta_format:
4399
4.32M
        case fastq_format: {
4400
4.32M
            fastq_state *x = (fastq_state *)fp->state;
4401
4.32M
            if (!x) {
4402
7.27k
                if (!(fp->state = fastq_state_init(fp->format.format
4403
7.27k
                                                   == fastq_format ? '@' : '>')))
4404
0
                    return -2;
4405
7.27k
            }
4406
4407
4.32M
            return fastq_parse1(fp, b);
4408
4.32M
        }
4409
4410
0
        case empty_format:
4411
0
            errno = EPIPE;
4412
0
            return -3;
4413
4414
0
        default:
4415
0
            errno = EFTYPE;
4416
0
            return -3;
4417
4.58M
        }
4418
4419
251k
        pass_filter = (ret >= 0 && fp->filter)
4420
251k
            ? sam_passes_filter(h, b, fp->filter)
4421
251k
            : 1;
4422
251k
    } while (pass_filter == 0);
4423
4424
251k
    return pass_filter < 0 ? -2 : ret;
4425
4.58M
}
4426
4427
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4428
// this code isn't vectorised and runs far slower than is necessary (even
4429
// with the restrict keyword being used).
4430
static inline void HTS_OPT3
4431
525
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4432
525
    uint32_t i;
4433
179k
    for (i = 0; i < len; i++)
4434
178k
        a[i] = b[i]+33;
4435
525
}
4436
4437
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4438
1.51M
{
4439
1.51M
    int i, r = 0;
4440
1.51M
    uint8_t *s, *end;
4441
1.51M
    const bam1_core_t *c = &b->core;
4442
4443
1.51M
    if (c->l_qname == 0)
4444
0
        return -1;
4445
1.51M
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
4446
1.51M
    r |= kputc_('\t', str); // query name
4447
1.51M
    r |= kputw(c->flag, str); r |= kputc_('\t', str); // flag
4448
1.51M
    if (c->tid >= 0) { // chr
4449
56.4k
        r |= kputs(h->target_name[c->tid] , str);
4450
56.4k
        r |= kputc_('\t', str);
4451
1.46M
    } else r |= kputsn_("*\t", 2, str);
4452
1.51M
    r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos
4453
1.51M
    r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual
4454
1.51M
    if (c->n_cigar) { // cigar
4455
73.0k
        uint32_t *cigar = bam_get_cigar(b);
4456
2.04M
        for (i = 0; i < c->n_cigar; ++i) {
4457
1.97M
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
4458
1.97M
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
4459
1.97M
        }
4460
1.44M
    } else r |= kputc_('*', str);
4461
1.51M
    r |= kputc_('\t', str);
4462
1.51M
    if (c->mtid < 0) r |= kputsn_("*\t", 2, str); // mate chr
4463
2.00k
    else if (c->mtid == c->tid) r |= kputsn_("=\t", 2, str);
4464
1.50k
    else {
4465
1.50k
        r |= kputs(h->target_name[c->mtid], str);
4466
1.50k
        r |= kputc_('\t', str);
4467
1.50k
    }
4468
1.51M
    r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos
4469
1.51M
    r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len
4470
1.51M
    if (c->l_qseq) { // seq and qual
4471
185k
        uint8_t *s = bam_get_seq(b);
4472
185k
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
4473
185k
        char *cp = str->s + str->l;
4474
4475
        // Sequence, 2 bases at a time
4476
185k
        nibble2base(s, cp, c->l_qseq);
4477
185k
        cp[c->l_qseq] = '\t';
4478
185k
        cp += c->l_qseq+1;
4479
4480
        // Quality
4481
185k
        s = bam_get_qual(b);
4482
185k
        i = 0;
4483
185k
        if (s[0] == 0xff) {
4484
184k
            cp[i++] = '*';
4485
184k
        } else {
4486
525
            add33((uint8_t *)cp, s, c->l_qseq); // cp[i] = s[i]+33;
4487
525
            i = c->l_qseq;
4488
525
        }
4489
185k
        cp[i] = 0;
4490
185k
        cp += i;
4491
185k
        str->l = cp - str->s;
4492
1.33M
    } else r |= kputsn_("*\t*", 3, str);
4493
4494
1.51M
    s = bam_get_aux(b); // aux
4495
1.51M
    end = b->data + b->l_data;
4496
4497
2.34M
    while (end - s >= 4) {
4498
822k
        r |= kputc_('\t', str);
4499
822k
        if ((s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)) == NULL)
4500
322
            goto bad_aux;
4501
822k
    }
4502
1.51M
    r |= kputsn("", 0, str); // nul terminate
4503
1.51M
    if (r < 0) goto mem_err;
4504
4505
1.51M
    return str->l;
4506
4507
322
 bad_aux:
4508
322
    hts_log_error("Corrupted aux data for read %.*s flag %d",
4509
322
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
4510
322
    errno = EINVAL;
4511
322
    return -1;
4512
4513
0
 mem_err:
4514
0
    hts_log_error("Out of memory");
4515
0
    errno = ENOMEM;
4516
0
    return -1;
4517
1.51M
}
4518
4519
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
4520
1.51M
{
4521
1.51M
    str->l = 0;
4522
1.51M
    return sam_format1_append(h, b, str);
4523
1.51M
}
4524
4525
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4526
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4527
0
{
4528
0
    unsigned flag = b->core.flag;
4529
0
    int i, e = 0, len = b->core.l_qseq;
4530
0
    uint8_t *seq, *qual;
4531
4532
0
    str->l = 0;
4533
4534
    // Name
4535
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4536
0
        return -1;
4537
4538
    // /1 or /2 suffix
4539
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4540
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4541
0
        if (r12 == BAM_FREAD1) {
4542
0
            if (kputs("/1", str) == EOF)
4543
0
                return -1;
4544
0
        } else if (r12 == BAM_FREAD2) {
4545
0
            if (kputs("/2", str) == EOF)
4546
0
                return -1;
4547
0
        }
4548
0
    }
4549
4550
    // Illumina CASAVA tag.
4551
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4552
0
    if (x && x->casava) {
4553
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4554
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4555
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4556
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4557
0
                     bc ? (char *)bc+1 : "0") < 0)
4558
0
            return -1;
4559
4560
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4561
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4562
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4563
0
            str->s[str->l-1] = '0';
4564
0
            str->s[str->l] = 0;
4565
0
            bc = NULL;
4566
0
        }
4567
4568
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4569
0
        if (bc) {
4570
0
            int l = strlen((char *)bc+1);
4571
0
            char *c = (char *)str->s + str->l - l;
4572
0
            for (i = 0; i < l; i++) {
4573
0
                if (!isalpha_c(c[i]))
4574
0
                    c[i] = '+';
4575
0
                else if (islower_c(c[i]))
4576
0
                    c[i] = toupper_c(c[i]);
4577
0
            }
4578
0
        }
4579
0
    }
4580
4581
    // Aux tags
4582
0
    if (x && x->aux) {
4583
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4584
0
        while (s && end - s >= 4) {
4585
0
            int tt = s[0]*256 + s[1];
4586
0
            if (x->tags == NULL ||
4587
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4588
0
                e |= kputc_('\t', str) < 0;
4589
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4590
0
                    return -1;
4591
0
            } else {
4592
0
                s = skip_aux(s+2, end);
4593
0
            }
4594
0
        }
4595
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4596
0
    }
4597
4598
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4599
0
    e |= kputc_('\n', str) < 0;
4600
4601
    // Seq line
4602
0
    seq = bam_get_seq(b);
4603
0
    if (flag & BAM_FREVERSE)
4604
0
        for (i = len-1; i >= 0; i--)
4605
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4606
0
    else
4607
0
        for (i = 0; i < len; i++)
4608
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4609
4610
4611
    // Qual line
4612
0
    if (x->nprefix == '@') {
4613
0
        kputsn("\n+\n", 3, str);
4614
0
        qual = bam_get_qual(b);
4615
0
        if (qual[0] == 0xff)
4616
0
            for (i = 0; i < len; i++)
4617
0
                e |= kputc_('B', str) < 0;
4618
0
        else if (flag & BAM_FREVERSE)
4619
0
            for (i = len-1; i >= 0; i--)
4620
0
                e |= kputc_(33 + qual[i], str) < 0;
4621
0
        else
4622
0
            for (i = 0; i < len; i++)
4623
0
                e |= kputc_(33 + qual[i], str) < 0;
4624
4625
0
    }
4626
0
    e |= kputc('\n', str) < 0;
4627
4628
0
    return e ? -1 : str->l;
4629
0
}
4630
4631
// Sadly we need to be able to modify the bam_hdr here so we can
4632
// reference count the structure.
4633
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4634
4.55M
{
4635
4.55M
    switch (fp->format.format) {
4636
0
    case binary_format:
4637
0
        fp->format.category = sequence_data;
4638
0
        fp->format.format = bam;
4639
        /* fall-through */
4640
1.51M
    case bam:
4641
1.51M
        return bam_write_idx1(fp, h, b);
4642
4643
1.51M
    case cram:
4644
1.51M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4645
4646
0
    case text_format:
4647
0
        fp->format.category = sequence_data;
4648
0
        fp->format.format = sam;
4649
        /* fall-through */
4650
1.51M
    case sam:
4651
1.51M
        if (fp->state) {
4652
0
            SAM_state *fd = (SAM_state *)fp->state;
4653
4654
            // Threaded output
4655
0
            if (!fd->h) {
4656
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4657
                // just data pointed to by it (which is a bit weasely still),
4658
                // but out cached pointer must be non-const as we want to
4659
                // destroy it later on and sam_hdr_destroy takes non-const.
4660
                //
4661
                // We do this because some tools do sam_hdr_destroy; sam_close
4662
                // while others do sam_close; sam_hdr_destroy.  The former is
4663
                // an issue as we need the header still when flushing.
4664
0
                fd->h = (sam_hdr_t *)h;
4665
0
                fd->h->ref_count++;
4666
4667
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4668
0
                                   fp) != 0)
4669
0
                    return -2;
4670
0
                fd->dispatcher_set = 1;
4671
0
            }
4672
4673
0
            if (fd->h != h) {
4674
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4675
0
                return -2;
4676
0
            }
4677
4678
            // Find a suitable BAM array to copy to
4679
0
            sp_bams *gb = fd->curr_bam;
4680
0
            if (!gb) {
4681
0
                pthread_mutex_lock(&fd->lines_m);
4682
0
                if (fd->bams) {
4683
0
                    fd->curr_bam = gb = fd->bams;
4684
0
                    fd->bams = gb->next;
4685
0
                    gb->next = NULL;
4686
0
                    gb->nbams = 0;
4687
0
                    gb->bam_mem = 0;
4688
0
                    pthread_mutex_unlock(&fd->lines_m);
4689
0
                } else {
4690
0
                    pthread_mutex_unlock(&fd->lines_m);
4691
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4692
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4693
0
                        free(gb);
4694
0
                        return -1;
4695
0
                    }
4696
0
                    gb->nbams = 0;
4697
0
                    gb->abams = SAM_NBAM;
4698
0
                    gb->bam_mem = 0;
4699
0
                    gb->fd = fd;
4700
0
                    fd->curr_idx = 0;
4701
0
                    fd->curr_bam = gb;
4702
0
                }
4703
0
            }
4704
4705
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4706
0
                return -2;
4707
0
            gb->bam_mem += b->l_data + sizeof(*b);
4708
4709
            // Dispatch if full
4710
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4711
0
                gb->serial = fd->serial++;
4712
0
                pthread_mutex_lock(&fd->command_m);
4713
0
                if (fd->errcode != 0) {
4714
0
                    pthread_mutex_unlock(&fd->command_m);
4715
0
                    return -fd->errcode;
4716
0
                }
4717
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4718
0
                                        cleanup_sp_bams,
4719
0
                                        cleanup_sp_lines, 0) < 0) {
4720
0
                    pthread_mutex_unlock(&fd->command_m);
4721
0
                    return -1;
4722
0
                }
4723
0
                pthread_mutex_unlock(&fd->command_m);
4724
0
                fd->curr_bam = NULL;
4725
0
            }
4726
4727
            // Dummy value as we don't know how long it really is.
4728
            // We could track file sizes via a SAM_state field, but I don't think
4729
            // it is necessary.
4730
0
            return 1;
4731
1.51M
        } else {
4732
1.51M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4733
1.51M
            kputc('\n', &fp->line);
4734
1.51M
            if (fp->is_bgzf) {
4735
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4736
0
                    return -1;
4737
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4738
1.51M
            } else {
4739
1.51M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4740
1.51M
            }
4741
4742
1.51M
            if (fp->idx) {
4743
0
                if (fp->format.compression == bgzf) {
4744
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4745
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4746
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4747
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4748
0
                        return -1;
4749
0
                    }
4750
0
                } else {
4751
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4752
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4753
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4754
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4755
0
                        return -1;
4756
0
                    }
4757
0
                }
4758
0
            }
4759
4760
1.51M
            return fp->line.l;
4761
1.51M
        }
4762
4763
4764
0
    case fasta_format:
4765
0
    case fastq_format: {
4766
0
        fastq_state *x = (fastq_state *)fp->state;
4767
0
        if (!x) {
4768
0
            if (!(fp->state = fastq_state_init(fp->format.format
4769
0
                                               == fastq_format ? '@' : '>')))
4770
0
                return -2;
4771
0
        }
4772
4773
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4774
0
            return -1;
4775
0
        if (fp->is_bgzf) {
4776
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4777
0
                return -1;
4778
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4779
0
                return -1;
4780
0
        } else {
4781
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4782
0
                return -1;
4783
0
        }
4784
0
        return fp->line.l;
4785
0
    }
4786
4787
0
    default:
4788
0
        errno = EBADF;
4789
0
        return -1;
4790
4.55M
    }
4791
4.55M
}
4792
4793
/************************
4794
 *** Auxiliary fields ***
4795
 ************************/
4796
#ifndef HTS_LITTLE_ENDIAN
4797
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
4798
    int tsz = aux_type2size(type);
4799
4800
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;
4801
4802
    switch (tsz) {
4803
        case 'H': case 'Z': case 1:  // Trivial
4804
            memcpy(out, in, len);
4805
            break;
4806
4807
#define aux_val_to_le(type_t, store_le) do {                            \
4808
        type_t v;                                                       \
4809
        size_t i;                                                       \
4810
        for (i = 0; i < len; i += sizeof(type_t), out += sizeof(type_t)) { \
4811
            memcpy(&v, in + i, sizeof(type_t));                         \
4812
            store_le(v, out);                                           \
4813
        }                                                               \
4814
    } while (0)
4815
4816
        case 2: aux_val_to_le(uint16_t, u16_to_le); break;
4817
        case 4: aux_val_to_le(uint32_t, u32_to_le); break;
4818
        case 8: aux_val_to_le(uint64_t, u64_to_le); break;
4819
4820
#undef aux_val_to_le
4821
4822
        case 'B': { // Recurse!
4823
            uint32_t n;
4824
            if (len < 5) return -1;
4825
            memcpy(&n, in + 1, 4);
4826
            out[0] = in[0];
4827
            u32_to_le(n, out + 1);
4828
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
4829
        }
4830
4831
        default: // Unknown type code
4832
            return -1;
4833
    }
4834
4835
4836
4837
    return 0;
4838
}
4839
#endif
4840
4841
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
4842
0
{
4843
0
    uint32_t new_len;
4844
4845
0
    assert(b->l_data >= 0);
4846
0
    new_len = b->l_data + 3 + len;
4847
0
    if (new_len > INT32_MAX || new_len < b->l_data) goto nomem;
4848
4849
0
    if (realloc_bam_data(b, new_len) < 0) return -1;
4850
4851
0
    b->data[b->l_data] = tag[0];
4852
0
    b->data[b->l_data + 1] = tag[1];
4853
0
    b->data[b->l_data + 2] = type;
4854
4855
0
#ifdef HTS_LITTLE_ENDIAN
4856
0
    memcpy(b->data + b->l_data + 3, data, len);
4857
#else
4858
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
4859
        errno = EINVAL;
4860
        return -1;
4861
    }
4862
#endif
4863
4864
0
    b->l_data = new_len;
4865
4866
0
    return 0;
4867
4868
0
 nomem:
4869
0
    errno = ENOMEM;
4870
0
    return -1;
4871
0
}
4872
4873
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
4874
1.67M
{
4875
1.67M
    int size;
4876
1.67M
    uint32_t n;
4877
1.67M
    if (s >= end) return end;
4878
1.67M
    size = aux_type2size(*s); ++s; // skip type
4879
1.67M
    switch (size) {
4880
209k
    case 'Z':
4881
212k
    case 'H':
4882
212k
        s = memchr(s, 0, end-s);
4883
212k
        return s ? s+1 : end;
4884
79.8k
    case 'B':
4885
79.8k
        if (end - s < 5) return NULL;
4886
79.8k
        size = aux_type2size(*s); ++s;
4887
79.8k
        n = le_to_u32(s);
4888
79.8k
        s += 4;
4889
79.8k
        if (size == 0 || end - s < size * n) return NULL;
4890
79.7k
        return s + size * n;
4891
550
    case 0:
4892
550
        return NULL;
4893
1.38M
    default:
4894
1.38M
        if (end - s < size) return NULL;
4895
1.38M
        return s + size;
4896
1.67M
    }
4897
1.67M
}
4898
4899
uint8_t *bam_aux_first(const bam1_t *b)
4900
1.65M
{
4901
1.65M
    uint8_t *s = bam_get_aux(b);
4902
1.65M
    uint8_t *end = b->data + b->l_data;
4903
1.65M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4904
196k
    return s+2;
4905
1.65M
}
4906
4907
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4908
1.59M
{
4909
1.59M
    uint8_t *end = b->data + b->l_data;
4910
1.59M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4911
1.59M
    if (next == NULL) goto bad_aux;
4912
1.59M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4913
1.48M
    return next+2;
4914
4915
630
 bad_aux:
4916
630
    hts_log_error("Corrupted aux data for read %s flag %d",
4917
630
                  bam_get_qname(b), b->core.flag);
4918
630
    errno = EINVAL;
4919
630
    return NULL;
4920
1.59M
}
4921
4922
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4923
1.65M
{
4924
1.65M
    uint8_t *s;
4925
3.25M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4926
1.67M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4927
            // Check the tag value is valid and complete
4928
79.6k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4929
79.6k
            if (e == NULL) goto bad_aux;
4930
79.6k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4931
4932
79.6k
            return s;
4933
79.6k
        }
4934
4935
    // errno now as set by bam_aux_first()/bam_aux_next()
4936
1.57M
    return NULL;
4937
4938
7
 bad_aux:
4939
7
    hts_log_error("Corrupted aux data for read %s flag %d",
4940
7
                  bam_get_qname(b), b->core.flag);
4941
7
    errno = EINVAL;
4942
7
    return NULL;
4943
1.65M
}
4944
4945
int bam_aux_del(bam1_t *b, uint8_t *s)
4946
0
{
4947
0
    s = bam_aux_remove(b, s);
4948
0
    return (s || errno == ENOENT)? 0 : -1;
4949
0
}
4950
4951
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4952
0
{
4953
0
    uint8_t *end = b->data + b->l_data;
4954
0
    uint8_t *next = skip_aux(s, end);
4955
0
    if (next == NULL) goto bad_aux;
4956
4957
0
    b->l_data -= next - (s-2);
4958
0
    if (next >= end) { errno = ENOENT; return NULL; }
4959
4960
0
    memmove(s-2, next, end - next);
4961
0
    return s;
4962
4963
0
 bad_aux:
4964
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4965
0
                  bam_get_qname(b), b->core.flag);
4966
0
    errno = EINVAL;
4967
0
    return NULL;
4968
0
}
4969
4970
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
4971
0
{
4972
    // FIXME: This is not at all efficient!
4973
0
    size_t ln = len >= 0 ? len : strlen(data) + 1;
4974
0
    size_t old_ln = 0;
4975
0
    int need_nul = ln == 0 || data[ln - 1] != '\0';
4976
0
    int save_errno = errno;
4977
0
    int new_tag = 0;
4978
0
    uint8_t *s = bam_aux_get(b,tag), *e;
4979
4980
0
    if (s) {  // Replacing existing tag
4981
0
        char type = *s;
4982
0
        if (type != 'Z') {
4983
0
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
4984
0
            errno = EINVAL;
4985
0
            return -1;
4986
0
        }
4987
0
        s++;
4988
0
        e = memchr(s, '\0', b->data + b->l_data - s);
4989
0
        old_ln = (e ? e - s : b->data + b->l_data - s) + 1;
4990
0
        s -= 3;
4991
0
    } else {
4992
0
        if (errno != ENOENT) { // Invalid aux data, give up
4993
0
            return -1;
4994
0
        } else { // Tag doesn't exist - put it on the end
4995
0
            errno = save_errno;
4996
0
            s = b->data + b->l_data;
4997
0
            new_tag = 3;
4998
0
        }
4999
0
    }
5000
5001
0
    if (old_ln < ln + need_nul + new_tag) {
5002
0
        ptrdiff_t s_offset = s - b->data;
5003
0
        if (possibly_expand_bam_data(b, ln + need_nul + new_tag - old_ln) < 0)
5004
0
            return -1;
5005
0
        s = b->data + s_offset;
5006
0
    }
5007
0
    if (!new_tag) {
5008
0
        memmove(s + 3 + ln + need_nul,
5009
0
                s + 3 + old_ln,
5010
0
                b->l_data - (s + 3 - b->data) - old_ln);
5011
0
    }
5012
0
    b->l_data += new_tag + ln + need_nul - old_ln;
5013
5014
0
    s[0] = tag[0];
5015
0
    s[1] = tag[1];
5016
0
    s[2] = 'Z';
5017
0
    memmove(s+3,data,ln);
5018
0
    if (need_nul) s[3 + ln] = '\0';
5019
0
    return 0;
5020
0
}
5021
5022
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
5023
0
{
5024
0
    uint32_t sz, old_sz = 0, new = 0;
5025
0
    uint8_t *s, type;
5026
5027
0
    if (val < INT32_MIN || val > UINT32_MAX) {
5028
0
        errno = EOVERFLOW;
5029
0
        return -1;
5030
0
    }
5031
0
    if (val < INT16_MIN)       { type = 'i'; sz = 4; }
5032
0
    else if (val < INT8_MIN)   { type = 's'; sz = 2; }
5033
0
    else if (val < 0)          { type = 'c'; sz = 1; }
5034
0
    else if (val < UINT8_MAX)  { type = 'C'; sz = 1; }
5035
0
    else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
5036
0
    else                       { type = 'I'; sz = 4; }
5037
5038
0
    s = bam_aux_get(b, tag);
5039
0
    if (s) {  // Tag present - how big was the old one?
5040
0
        switch (*s) {
5041
0
            case 'c': case 'C': old_sz = 1; break;
5042
0
            case 's': case 'S': old_sz = 2; break;
5043
0
            case 'i': case 'I': old_sz = 4; break;
5044
0
            default: errno = EINVAL; return -1;  // Not an integer
5045
0
        }
5046
0
    } else {
5047
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5048
0
            s = b->data + b->l_data;
5049
0
            new = 1;
5050
0
        }  else { // Invalid aux data, give up.
5051
0
            return -1;
5052
0
        }
5053
0
    }
5054
5055
0
    if (new || old_sz < sz) {
5056
        // Make room for new tag
5057
0
        ptrdiff_t s_offset = s - b->data;
5058
0
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
5059
0
            return -1;
5060
0
        s =  b->data + s_offset;
5061
0
        if (new) { // Add tag id
5062
0
            *s++ = tag[0];
5063
0
            *s++ = tag[1];
5064
0
        } else {   // Shift following data so we have space
5065
0
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
5066
0
        }
5067
0
    } else {
5068
        // Reuse old space.  Data value may be bigger than necessary but
5069
        // we avoid having to move everything else
5070
0
        sz = old_sz;
5071
0
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
5072
0
        assert(type > 0);
5073
0
    }
5074
0
    *s++ = type;
5075
0
#ifdef HTS_LITTLE_ENDIAN
5076
0
    memcpy(s, &val, sz);
5077
#else
5078
    switch (sz) {
5079
        case 4:  u32_to_le(val, s); break;
5080
        case 2:  u16_to_le(val, s); break;
5081
        default: *s = val; break;
5082
    }
5083
#endif
5084
0
    b->l_data += (new ? 3 : 0) + sz - old_sz;
5085
0
    return 0;
5086
0
}
5087
5088
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
5089
0
{
5090
0
    uint8_t *s = bam_aux_get(b, tag);
5091
0
    int shrink = 0, new = 0;
5092
5093
0
    if (s) { // Tag present - what was it?
5094
0
        switch (*s) {
5095
0
            case 'f': break;
5096
0
            case 'd': shrink = 1; break;
5097
0
            default: errno = EINVAL; return -1;  // Not a float
5098
0
        }
5099
0
    } else {
5100
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5101
0
            new = 1;
5102
0
        }  else { // Invalid aux data, give up.
5103
0
            return -1;
5104
0
        }
5105
0
    }
5106
5107
0
    if (new) { // Ensure there's room
5108
0
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
5109
0
            return -1;
5110
0
        s = b->data + b->l_data;
5111
0
        *s++ = tag[0];
5112
0
        *s++ = tag[1];
5113
0
    } else if (shrink) { // Convert non-standard double tag to float
5114
0
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
5115
0
        b->l_data -= 4;
5116
0
    }
5117
0
    *s++ = 'f';
5118
0
    float_to_le(val, s);
5119
0
    if (new) b->l_data += 7;
5120
5121
0
    return 0;
5122
0
}
5123
5124
int bam_aux_update_array(bam1_t *b, const char tag[2],
5125
                         uint8_t type, uint32_t items, void *data)
5126
0
{
5127
0
    uint8_t *s = bam_aux_get(b, tag);
5128
0
    size_t old_sz = 0, new_sz;
5129
0
    int new = 0;
5130
5131
0
    if (s) { // Tag present
5132
0
        if (*s != 'B') { errno = EINVAL; return -1; }
5133
0
        old_sz = aux_type2size(s[1]);
5134
0
        if (old_sz < 1 || old_sz > 4) { errno = EINVAL; return -1; }
5135
0
        old_sz *= le_to_u32(s + 2);
5136
0
    } else {
5137
0
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
5138
0
            s = b->data + b->l_data;
5139
0
            new = 1;
5140
0
        }  else { // Invalid aux data, give up.
5141
0
            return -1;
5142
0
        }
5143
0
    }
5144
5145
0
    new_sz = aux_type2size(type);
5146
0
    if (new_sz < 1 || new_sz > 4) { errno = EINVAL; return -1; }
5147
0
    if (items > INT32_MAX / new_sz) { errno = ENOMEM; return -1; }
5148
0
    new_sz *= items;
5149
5150
0
    if (new || old_sz < new_sz) {
5151
        // Make room for new tag
5152
0
        ptrdiff_t s_offset = s - b->data;
5153
0
        if (possibly_expand_bam_data(b, (new ? 8 : 0) + new_sz - old_sz) < 0)
5154
0
            return -1;
5155
0
        s =  b->data + s_offset;
5156
0
    }
5157
0
    if (new) { // Add tag id and type
5158
0
        *s++ = tag[0];
5159
0
        *s++ = tag[1];
5160
0
        *s = 'B';
5161
0
        b->l_data += 8 + new_sz;
5162
0
    } else if (old_sz != new_sz) { // shift following data if necessary
5163
0
        memmove(s + 6 + new_sz, s + 6 + old_sz,
5164
0
                b->l_data - ((s + 6 + old_sz) - b->data));
5165
0
        b->l_data -= old_sz;
5166
0
        b->l_data += new_sz;
5167
0
    }
5168
5169
0
    s[1] = type;
5170
0
    u32_to_le(items, s + 2);
5171
0
    if (new_sz > 0) {
5172
0
#ifdef HTS_LITTLE_ENDIAN
5173
0
        memcpy(s + 6, data, new_sz);
5174
#else
5175
        return aux_to_le(type, s + 6, data, new_sz);
5176
#endif
5177
0
    }
5178
0
    return 0;
5179
0
}
5180
5181
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
5182
                                      uint32_t idx)
5183
0
{
5184
0
    switch (type) {
5185
0
        case 'c': return le_to_i8(s + idx);
5186
0
        case 'C': return s[idx];
5187
0
        case 's': return le_to_i16(s + 2 * idx);
5188
0
        case 'S': return le_to_u16(s + 2 * idx);
5189
0
        case 'i': return le_to_i32(s + 4 * idx);
5190
0
        case 'I': return le_to_u32(s + 4 * idx);
5191
0
        default:
5192
0
            errno = EINVAL;
5193
0
            return 0;
5194
0
    }
5195
0
}
5196
5197
int64_t bam_aux2i(const uint8_t *s)
5198
0
{
5199
0
    int type;
5200
0
    type = *s++;
5201
0
    return get_int_aux_val(type, s, 0);
5202
0
}
5203
5204
double bam_aux2f(const uint8_t *s)
5205
0
{
5206
0
    int type;
5207
0
    type = *s++;
5208
0
    if (type == 'd') return le_to_double(s);
5209
0
    else if (type == 'f') return le_to_float(s);
5210
0
    else return get_int_aux_val(type, s, 0);
5211
0
}
5212
5213
char bam_aux2A(const uint8_t *s)
5214
0
{
5215
0
    int type;
5216
0
    type = *s++;
5217
0
    if (type == 'A') return *(char*)s;
5218
0
    errno = EINVAL;
5219
0
    return 0;
5220
0
}
5221
5222
char *bam_aux2Z(const uint8_t *s)
5223
0
{
5224
0
    int type;
5225
0
    type = *s++;
5226
0
    if (type == 'Z' || type == 'H') return (char*)s;
5227
0
    errno = EINVAL;
5228
0
    return 0;
5229
0
}
5230
5231
uint32_t bam_auxB_len(const uint8_t *s)
5232
0
{
5233
0
    if (s[0] != 'B') {
5234
0
        errno = EINVAL;
5235
0
        return 0;
5236
0
    }
5237
0
    return le_to_u32(s + 2);
5238
0
}
5239
5240
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
5241
0
{
5242
0
    uint32_t len = bam_auxB_len(s);
5243
0
    if (idx >= len) {
5244
0
        errno = ERANGE;
5245
0
        return 0;
5246
0
    }
5247
0
    return get_int_aux_val(s[1], s + 6, idx);
5248
0
}
5249
5250
double bam_auxB2f(const uint8_t *s, uint32_t idx)
5251
0
{
5252
0
    uint32_t len = bam_auxB_len(s);
5253
0
    if (idx >= len) {
5254
0
        errno = ERANGE;
5255
0
        return 0.0;
5256
0
    }
5257
0
    if (s[1] == 'f') return le_to_float(s + 6 + 4 * idx);
5258
0
    else return get_int_aux_val(s[1], s + 6, idx);
5259
0
}
5260
5261
int sam_open_mode(char *mode, const char *fn, const char *format)
5262
0
{
5263
    // TODO Parse "bam5" etc for compression level
5264
0
    if (format == NULL) {
5265
        // Try to pick a format based on the filename extension
5266
0
        char extension[HTS_MAX_EXT_LEN];
5267
0
        if (find_file_extension(fn, extension) < 0) return -1;
5268
0
        return sam_open_mode(mode, fn, extension);
5269
0
    }
5270
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5271
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5272
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5273
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5274
0
    else if (strcasecmp(format, "fastq") == 0 ||
5275
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5276
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5277
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5278
0
    else if (strcasecmp(format, "fasta") == 0 ||
5279
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5280
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5281
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5282
0
    else return -1;
5283
5284
0
    return 0;
5285
0
}
5286
5287
// A version of sam_open_mode that can handle ,key=value options.
5288
// The format string is allocated and returned, to be freed by the caller.
5289
// Prefix should be "r" or "w",
5290
char *sam_open_mode_opts(const char *fn,
5291
                         const char *mode,
5292
                         const char *format)
5293
0
{
5294
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5295
0
                             (mode   ? strlen(mode)   : 1) + 12);
5296
0
    char *opts, *cp;
5297
0
    int format_len;
5298
5299
0
    if (!mode_opts)
5300
0
        return NULL;
5301
5302
0
    strcpy(mode_opts, mode ? mode : "r");
5303
0
    cp = mode_opts + strlen(mode_opts);
5304
5305
0
    if (format == NULL) {
5306
        // Try to pick a format based on the filename extension
5307
0
        char extension[HTS_MAX_EXT_LEN];
5308
0
        if (find_file_extension(fn, extension) < 0) {
5309
0
            free(mode_opts);
5310
0
            return NULL;
5311
0
        }
5312
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5313
0
            return mode_opts;
5314
0
        } else {
5315
0
            free(mode_opts);
5316
0
            return NULL;
5317
0
        }
5318
0
    }
5319
5320
0
    if ((opts = strchr(format, ','))) {
5321
0
        format_len = opts-format;
5322
0
    } else {
5323
0
        opts="";
5324
0
        format_len = strlen(format);
5325
0
    }
5326
5327
0
    if (strncmp(format, "bam", format_len) == 0) {
5328
0
        *cp++ = 'b';
5329
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5330
0
        *cp++ = 'c';
5331
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5332
0
        *cp++ = 'c';
5333
0
        strcpy(cp, ",VERSION=2.1");
5334
0
        cp += 12;
5335
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5336
0
        *cp++ = 'c';
5337
0
        strcpy(cp, ",VERSION=3.0");
5338
0
        cp += 12;
5339
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5340
0
        ; // format mode=""
5341
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5342
0
        *cp++ = 'z';
5343
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5344
0
               strncmp(format, "fq", format_len) == 0) {
5345
0
        *cp++ = 'f';
5346
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5347
0
               strncmp(format, "fq.gz", format_len) == 0) {
5348
0
        *cp++ = 'f';
5349
0
        *cp++ = 'z';
5350
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5351
0
               strncmp(format, "fa", format_len) == 0) {
5352
0
        *cp++ = 'F';
5353
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5354
0
               strncmp(format, "fa", format_len) == 0) {
5355
0
        *cp++ = 'F';
5356
0
        *cp++ = 'z';
5357
0
    } else {
5358
0
        free(mode_opts);
5359
0
        return NULL;
5360
0
    }
5361
5362
0
    strcpy(cp, opts);
5363
5364
0
    return mode_opts;
5365
0
}
5366
5367
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5368
int bam_str2flag(const char *str)
5369
0
{
5370
0
    char *end, *beg = (char*) str;
5371
0
    long int flag = strtol(str, &end, 0);
5372
0
    if ( end!=str ) return flag;    // the conversion was successful
5373
0
    flag = 0;
5374
0
    while ( *str )
5375
0
    {
5376
0
        end = beg;
5377
0
        while ( *end && *end!=',' ) end++;
5378
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5379
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5380
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5381
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5382
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5383
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5384
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5385
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5386
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5387
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5388
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5389
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5390
0
        else return -1;
5391
0
        if ( !*end ) break;
5392
0
        beg = end + 1;
5393
0
    }
5394
0
    return flag;
5395
0
}
5396
5397
char *bam_flag2str(int flag)
5398
0
{
5399
0
    kstring_t str = {0,0,0};
5400
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5401
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5402
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5403
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5404
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5405
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5406
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5407
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5408
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5409
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5410
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5411
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5412
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5413
0
    return str.s;
5414
0
}
5415
5416
5417
/**************************
5418
 *** Pileup and Mpileup ***
5419
 **************************/
5420
5421
#if !defined(BAM_NO_PILEUP)
5422
5423
#include <assert.h>
5424
5425
/*******************
5426
 *** Memory pool ***
5427
 *******************/
5428
5429
typedef struct {
5430
    int k, y;
5431
    hts_pos_t x, end;
5432
} cstate_t;
5433
5434
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5435
5436
typedef struct __linkbuf_t {
5437
    bam1_t b;
5438
    hts_pos_t beg, end;
5439
    cstate_t s;
5440
    struct __linkbuf_t *next;
5441
    bam_pileup_cd cd;
5442
} lbnode_t;
5443
5444
typedef struct {
5445
    int cnt, n, max;
5446
    lbnode_t **buf;
5447
} mempool_t;
5448
5449
static mempool_t *mp_init(void)
5450
0
{
5451
0
    mempool_t *mp;
5452
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5453
0
    return mp;
5454
0
}
5455
static void mp_destroy(mempool_t *mp)
5456
0
{
5457
0
    int k;
5458
0
    for (k = 0; k < mp->n; ++k) {
5459
0
        free(mp->buf[k]->b.data);
5460
0
        free(mp->buf[k]);
5461
0
    }
5462
0
    free(mp->buf);
5463
0
    free(mp);
5464
0
}
5465
static inline lbnode_t *mp_alloc(mempool_t *mp)
5466
0
{
5467
0
    ++mp->cnt;
5468
0
    if (mp->n == 0) return (lbnode_t*)calloc(1, sizeof(lbnode_t));
5469
0
    else return mp->buf[--mp->n];
5470
0
}
5471
static inline void mp_free(mempool_t *mp, lbnode_t *p)
5472
0
{
5473
0
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
5474
0
    if (mp->n == mp->max) {
5475
0
        mp->max = mp->max? mp->max<<1 : 256;
5476
0
        mp->buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * mp->max);
5477
0
    }
5478
0
    mp->buf[mp->n++] = p;
5479
0
}
5480
5481
/**********************
5482
 *** CIGAR resolver ***
5483
 **********************/
5484
5485
/* s->k: the index of the CIGAR operator that has just been processed.
5486
   s->x: the reference coordinate of the start of s->k
5487
   s->y: the query coordinate of the start of s->k
5488
 */
5489
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5490
0
{
5491
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5492
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5493
5494
0
    bam1_t *b = p->b;
5495
0
    bam1_core_t *c = &b->core;
5496
0
    uint32_t *cigar = bam_get_cigar(b);
5497
0
    int k;
5498
    // determine the current CIGAR operation
5499
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5500
0
    if (s->k == -1) { // never processed
5501
0
        p->qpos = 0;
5502
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5503
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5504
0
        } else { // find the first match or deletion
5505
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5506
0
                int op = _cop(cigar[k]);
5507
0
                int l = _cln(cigar[k]);
5508
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5509
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5510
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5511
0
            }
5512
0
            assert(k < c->n_cigar);
5513
0
            s->k = k;
5514
0
        }
5515
0
    } else { // the read has been processed before
5516
0
        int op, l = _cln(cigar[s->k]);
5517
0
        if (pos - s->x >= l) { // jump to the next operation
5518
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5519
0
            op = _cop(cigar[s->k+1]);
5520
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5521
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5522
0
                s->x += l;
5523
0
                ++s->k;
5524
0
            } else { // find the next M/D/N/=/X
5525
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5526
0
                s->x += l;
5527
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5528
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5529
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5530
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5531
0
                }
5532
0
                s->k = k;
5533
0
            }
5534
0
            assert(s->k < c->n_cigar); // otherwise a bug
5535
0
        } // else, do nothing
5536
0
    }
5537
0
    { // collect pileup information
5538
0
        int op, l;
5539
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5540
0
        p->is_del = p->indel = p->is_refskip = 0;
5541
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5542
0
            int op2 = _cop(cigar[s->k+1]);
5543
0
            int l2 = _cln(cigar[s->k+1]);
5544
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5545
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5546
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5547
                // and rely on is_del=1 as we would for 3D.
5548
0
                p->indel = -(int)l2;
5549
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5550
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5551
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5552
0
                    else break;
5553
0
                }
5554
0
            } else if (op2 == BAM_CINS) {
5555
0
                p->indel = l2;
5556
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5557
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5558
0
                    if (op2 == BAM_CINS) p->indel += l2;
5559
0
                    else if (op2 != BAM_CPAD) break;
5560
0
                }
5561
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5562
0
                int l3 = 0;
5563
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5564
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5565
0
                    if (op2 == BAM_CINS) l3 += l2;
5566
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5567
0
                }
5568
0
                if (l3 > 0) p->indel = l3;
5569
0
            }
5570
0
        }
5571
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5572
0
            p->qpos = s->y + (pos - s->x);
5573
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5574
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5575
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5576
0
        } // cannot be other operations; otherwise a bug
5577
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5578
0
    }
5579
0
    p->cigar_ind = s->k;
5580
0
    return 1;
5581
0
}
5582
5583
/*******************************
5584
 *** Expansion of insertions ***
5585
 *******************************/
5586
5587
/*
5588
 * Fills out the kstring with the padded insertion sequence for the current
5589
 * location in 'p'.  If this is not an insertion site, the string is blank.
5590
 *
5591
 * This variant handles base modifications, but only when "m" is non-NULL.
5592
 *
5593
 * Returns the number of inserted base on success, with string length being
5594
 *        accessable via ins->l;
5595
 *        -1 on failure.
5596
 */
5597
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5598
                          hts_base_mod_state *m,
5599
0
                          kstring_t *ins, int *del_len) {
5600
0
    int j, k, indel, nb = 0;
5601
0
    uint32_t *cigar;
5602
5603
0
    if (p->indel <= 0) {
5604
0
        if (ks_resize(ins, 1) < 0)
5605
0
            return -1;
5606
0
        ins->l = 0;
5607
0
        ins->s[0] = '\0';
5608
0
        return 0;
5609
0
    }
5610
5611
0
    if (del_len)
5612
0
        *del_len = 0;
5613
5614
    // Measure indel length including pads
5615
0
    indel = 0;
5616
0
    k = p->cigar_ind+1;
5617
0
    cigar = bam_get_cigar(p->b);
5618
0
    while (k < p->b->core.n_cigar) {
5619
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5620
0
        case BAM_CPAD:
5621
0
        case BAM_CINS:
5622
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5623
0
            break;
5624
0
        default:
5625
0
            k = p->b->core.n_cigar;
5626
0
            break;
5627
0
        }
5628
0
        k++;
5629
0
    }
5630
0
    nb = ins->l = indel;
5631
5632
    // Produce sequence
5633
0
    if (ks_resize(ins, indel+1) < 0)
5634
0
        return -1;
5635
0
    indel = 0;
5636
0
    k = p->cigar_ind+1;
5637
0
    j = 1;
5638
0
    while (k < p->b->core.n_cigar) {
5639
0
        int l, c;
5640
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5641
0
        case BAM_CPAD:
5642
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5643
0
                ins->s[indel++] = '*';
5644
0
            break;
5645
0
        case BAM_CINS:
5646
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5647
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5648
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5649
0
                                            p->qpos + j - p->is_del)]
5650
0
                    : 'N';
5651
0
                ins->s[indel++] = c;
5652
0
                int nm;
5653
0
                hts_base_mod mod[256];
5654
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5655
0
                                                m, mod, 256)) > 0) {
5656
0
                    int o_indel = indel;
5657
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5658
0
                        return -1;
5659
0
                    ins->s[indel++] = '[';
5660
0
                    int j;
5661
0
                    for (j = 0; j < nm; j++) {
5662
0
                        char qual[20];
5663
0
                        if (mod[j].qual >= 0)
5664
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5665
0
                        else
5666
0
                            *qual=0;
5667
0
                        if (mod[j].modified_base < 0)
5668
                            // ChEBI
5669
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5670
0
                                              "%c(%d)%s",
5671
0
                                              "+-"[mod[j].strand],
5672
0
                                              -mod[j].modified_base,
5673
0
                                              qual);
5674
0
                        else
5675
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5676
0
                                              "%c%c%s",
5677
0
                                              "+-"[mod[j].strand],
5678
0
                                              mod[j].modified_base,
5679
0
                                              qual);
5680
0
                    }
5681
0
                    ins->s[indel++] = ']';
5682
0
                    ins->l += indel - o_indel; // grow by amount we used
5683
0
                }
5684
0
            }
5685
0
            break;
5686
0
        case BAM_CDEL:
5687
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5688
0
            if (del_len)
5689
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5690
            // fall through
5691
0
        default:
5692
0
            k = p->b->core.n_cigar;
5693
0
            break;
5694
0
        }
5695
0
        k++;
5696
0
    }
5697
0
    ins->s[indel] = '\0';
5698
0
    ins->l = indel; // string length
5699
5700
0
    return nb;      // base length
5701
0
}
5702
5703
/*
5704
 * Fills out the kstring with the padded insertion sequence for the current
5705
 * location in 'p'.  If this is not an insertion site, the string is blank.
5706
 *
5707
 * This is the original interface with no capability for reporting base
5708
 * modifications.
5709
 *
5710
 * Returns the length of insertion string on success;
5711
 *        -1 on failure.
5712
 */
5713
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5714
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5715
0
}
5716
5717
/***********************
5718
 *** Pileup iterator ***
5719
 ***********************/
5720
5721
// Dictionary of overlapping reads
5722
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5723
typedef khash_t(olap_hash) olap_hash_t;
5724
5725
struct bam_plp_s {
5726
    mempool_t *mp;
5727
    lbnode_t *head, *tail;
5728
    int32_t tid, max_tid;
5729
    hts_pos_t pos, max_pos;
5730
    int is_eof, max_plp, error, maxcnt;
5731
    uint64_t id;
5732
    bam_pileup1_t *plp;
5733
    // for the "auto" interface only
5734
    bam1_t *b;
5735
    bam_plp_auto_f func;
5736
    void *data;
5737
    olap_hash_t *overlaps;
5738
5739
    // For notification of creation and destruction events
5740
    // and associated client-owned pointer.
5741
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5742
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5743
};
5744
5745
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
5746
0
{
5747
0
    bam_plp_t iter;
5748
0
    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
5749
0
    iter->mp = mp_init();
5750
0
    iter->head = iter->tail = mp_alloc(iter->mp);
5751
0
    iter->max_tid = iter->max_pos = -1;
5752
0
    iter->maxcnt = 8000;
5753
0
    if (func) {
5754
0
        iter->func = func;
5755
0
        iter->data = data;
5756
0
        iter->b = bam_init1();
5757
0
    }
5758
0
    return iter;
5759
0
}
5760
5761
int bam_plp_init_overlaps(bam_plp_t iter)
5762
0
{
5763
0
    iter->overlaps = kh_init(olap_hash);  // hash for tweaking quality of bases in overlapping reads
5764
0
    return iter->overlaps ? 0 : -1;
5765
0
}
5766
5767
void bam_plp_destroy(bam_plp_t iter)
5768
0
{
5769
0
    lbnode_t *p, *pnext;
5770
0
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
5771
0
    for (p = iter->head; p != NULL; p = pnext) {
5772
0
        if (iter->plp_destruct && p != iter->tail)
5773
0
            iter->plp_destruct(iter->data, &p->b, &p->cd);
5774
0
        pnext = p->next;
5775
0
        mp_free(iter->mp, p);
5776
0
    }
5777
0
    mp_destroy(iter->mp);
5778
0
    if (iter->b) bam_destroy1(iter->b);
5779
0
    free(iter->plp);
5780
0
    free(iter);
5781
0
}
5782
5783
void bam_plp_constructor(bam_plp_t plp,
5784
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5785
0
    plp->plp_construct = func;
5786
0
}
5787
5788
void bam_plp_destructor(bam_plp_t plp,
5789
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5790
0
    plp->plp_destruct = func;
5791
0
}
5792
5793
//---------------------------------
5794
//---  Tweak overlapping reads
5795
//---------------------------------
5796
5797
/**
5798
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5799
 *  cigar_iref2iseq_next() - get the next CMATCH base
5800
 *  @cigar:       pointer to current cigar block (rw)
5801
 *  @cigar_max:   pointer just beyond the last cigar block
5802
 *  @icig:        position within the current cigar block (rw)
5803
 *  @iseq:        position in the sequence (rw)
5804
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5805
 *
5806
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5807
 *  or -2 on error.
5808
 */
5809
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5810
                                      const uint32_t *cigar_max,
5811
                                      hts_pos_t *icig,
5812
                                      hts_pos_t *iseq,
5813
                                      hts_pos_t *iref)
5814
0
{
5815
0
    hts_pos_t pos = *iref;
5816
0
    if ( pos < 0 ) return -1;
5817
0
    *icig = 0;
5818
0
    *iseq = 0;
5819
0
    *iref = 0;
5820
0
    while ( *cigar<cigar_max )
5821
0
    {
5822
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5823
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5824
5825
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5826
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5827
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5828
0
        {
5829
0
            pos -= ncig;
5830
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5831
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5832
0
            continue;
5833
0
        }
5834
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5835
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5836
0
        {
5837
0
            pos -= ncig;
5838
0
            if ( pos<0 ) pos = 0;
5839
0
            (*cigar)++; *icig = 0; *iref += ncig;
5840
0
            continue;
5841
0
        }
5842
0
        hts_log_error("Unexpected cigar %d", cig);
5843
0
        return -2;
5844
0
    }
5845
0
    *iseq = -1;
5846
0
    return -1;
5847
0
}
5848
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5849
                                       const uint32_t *cigar_max,
5850
                                       hts_pos_t *icig,
5851
                                       hts_pos_t *iseq,
5852
                                       hts_pos_t *iref)
5853
0
{
5854
0
    while ( *cigar < cigar_max )
5855
0
    {
5856
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5857
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5858
5859
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5860
0
        {
5861
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5862
0
            (*iseq)++; (*icig)++; (*iref)++;
5863
0
            return BAM_CMATCH;
5864
0
        }
5865
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5866
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5867
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5868
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5869
0
        hts_log_error("Unexpected cigar %d", cig);
5870
0
        return -2;
5871
0
    }
5872
0
    *iseq = -1;
5873
0
    *iref = -1;
5874
0
    return -1;
5875
0
}
5876
5877
// Given overlapping read 'a' (left) and 'b' (right) on the same
5878
// template, adjust quality values to zero for either a or b.
5879
// Note versions 1.12 and earlier always removed quality from 'b' for
5880
// matching bases.  Now we select a or b semi-randomly based on name hash.
5881
// Returns 0 on success,
5882
//        -1 on failure
5883
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5884
0
{
5885
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5886
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5887
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5888
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5889
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5890
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5891
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5892
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5893
5894
0
    hts_pos_t iref   = b->core.pos;
5895
0
    hts_pos_t a_iref = iref - a->core.pos;
5896
0
    hts_pos_t b_iref = iref - b->core.pos;
5897
5898
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5899
0
                                    &a_icig, &a_iseq, &a_iref);
5900
0
    if ( a_ret<0 )
5901
        // no overlap or error
5902
0
        return a_ret<-1 ? -1:0;
5903
5904
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5905
0
                                    &b_icig, &b_iseq, &b_iref);
5906
0
    if ( b_ret<0 )
5907
        // no overlap or error
5908
0
        return b_ret<-1 ? -1:0;
5909
5910
    // Determine which seq is the one getting modified qualities.
5911
0
    uint8_t amul, bmul;
5912
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5913
0
        amul = 1;
5914
0
        bmul = 0;
5915
0
    } else {
5916
0
        amul = 0;
5917
0
        bmul = 1;
5918
0
    }
5919
5920
    // Loop over the overlapping region nulling qualities in either
5921
    // seq a or b.
5922
0
    int err = 0;
5923
0
    while ( 1 ) {
5924
        // Step to next matching reference position in a and b
5925
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5926
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5927
0
                                         &a_icig, &a_iseq, &a_iref);
5928
0
        if ( a_ret<0 ) { // done
5929
0
            err = a_ret<-1?-1:0;
5930
0
            break;
5931
0
        }
5932
5933
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5934
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5935
0
                                         &b_iseq, &b_iref);
5936
0
        if ( b_ret<0 ) { // done
5937
0
            err = b_ret<-1?-1:0;
5938
0
            break;
5939
0
        }
5940
5941
0
        if ( iref < a_iref + a->core.pos )
5942
0
            iref = a_iref + a->core.pos;
5943
5944
0
        if ( iref < b_iref + b->core.pos )
5945
0
            iref = b_iref + b->core.pos;
5946
5947
0
        iref++;
5948
5949
        // If A or B has a deletion then we catch up the other to this point.
5950
        // We also amend quality values using the same rules for mismatch.
5951
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5952
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5953
0
                && b_cigar > bam_get_cigar(b)
5954
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5955
                // Del in B means it's moved on further than A
5956
0
                do {
5957
0
                    a_qual[a_iseq] = amul
5958
0
                        ? a_qual[a_iseq]*0.8
5959
0
                        : 0;
5960
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5961
0
                                                 &a_icig, &a_iseq, &a_iref);
5962
0
                    if (a_ret < 0)
5963
0
                        return -(a_ret<-1); // 0 or -1
5964
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5965
0
            } else if (a_cigar > bam_get_cigar(a)
5966
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5967
                // Del in A means it's moved on further than B
5968
0
                do {
5969
0
                    b_qual[b_iseq] = bmul
5970
0
                        ? b_qual[b_iseq]*0.8
5971
0
                        : 0;
5972
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5973
0
                                                 &b_icig, &b_iseq, &b_iref);
5974
0
                    if (b_ret < 0)
5975
0
                        return -(b_ret<-1); // 0 or -1
5976
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5977
0
            } else {
5978
                // Anything else, eg ref-skip, we don't support here
5979
0
                continue;
5980
0
            }
5981
0
        }
5982
5983
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5984
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5985
        //         a_cigar-bam_get_cigar(a), a_icig,
5986
        //         b_cigar-bam_get_cigar(b), b_icig,
5987
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5988
        //         a_iseq, b_iseq);
5989
5990
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5991
            // Fell off end of sequence, bad CIGAR?
5992
0
            return -1;
5993
5994
        // We're finally at the same ref base in both a and b.
5995
        // Check if the bases match (confident) or mismatch
5996
        // (not so confident).
5997
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5998
            // We are very confident about this base.  Use sum of quals
5999
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
6000
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
6001
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
6002
0
        } else {
6003
            // Not so confident about anymore given the mismatch.
6004
            // Reduce qual for lowest quality base.
6005
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
6006
                // A highest qual base; keep
6007
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
6008
0
                b_qual[b_iseq] = 0;
6009
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
6010
                // B highest qual base; keep
6011
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
6012
0
                a_qual[a_iseq] = 0;
6013
0
            } else {
6014
                // Both equal, so pick randomly
6015
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
6016
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
6017
0
            }
6018
0
        }
6019
0
    }
6020
6021
0
    return err;
6022
0
}
6023
6024
// Fix overlapping reads. Simple soft-clipping did not give good results.
6025
// Lowering qualities of unwanted bases is more selective and works better.
6026
//
6027
// Returns 0 on success, -1 on failure
6028
static int overlap_push(bam_plp_t iter, lbnode_t *node)
6029
0
{
6030
0
    if ( !iter->overlaps ) return 0;
6031
6032
    // mapped mates and paired reads only
6033
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
6034
6035
    // no overlap possible, unless some wild cigar
6036
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
6037
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
6038
0
         && node->b.core.mpos >= node->end) // for those wild cigars
6039
0
       ) return 0;
6040
6041
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
6042
0
    if ( kitr==kh_end(iter->overlaps) )
6043
0
    {
6044
        // Only add reads where the mate is still to arrive
6045
0
        if (node->b.core.mpos >= node->b.core.pos ||
6046
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
6047
0
            int ret;
6048
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
6049
0
            if (ret < 0) return -1;
6050
0
            kh_value(iter->overlaps, kitr) = node;
6051
0
        }
6052
0
    }
6053
0
    else
6054
0
    {
6055
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
6056
0
        int err = tweak_overlap_quality(&a->b, &node->b);
6057
0
        kh_del(olap_hash, iter->overlaps, kitr);
6058
0
        assert(a->end-1 == a->s.end);
6059
0
        return err;
6060
0
    }
6061
0
    return 0;
6062
0
}
6063
6064
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
6065
0
{
6066
0
    if ( !iter->overlaps ) return;
6067
6068
0
    khiter_t kitr;
6069
0
    if ( b )
6070
0
    {
6071
0
        kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
6072
0
        if ( kitr!=kh_end(iter->overlaps) )
6073
0
            kh_del(olap_hash, iter->overlaps, kitr);
6074
0
    }
6075
0
    else
6076
0
    {
6077
        // remove all
6078
0
        for (kitr = kh_begin(iter->overlaps); kitr<kh_end(iter->overlaps); kitr++)
6079
0
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
6080
0
    }
6081
0
}
6082
6083
6084
6085
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
6086
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
6087
// buffer yet (the current position is still the maximum position across all buffered reads).
6088
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6089
0
{
6090
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6091
0
    *_n_plp = 0;
6092
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6093
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6094
0
        int n_plp = 0;
6095
        // write iter->plp at iter->pos
6096
0
        lbnode_t **pptr = &iter->head;
6097
0
        while (*pptr != iter->tail) {
6098
0
            lbnode_t *p = *pptr;
6099
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6100
0
                overlap_remove(iter, &p->b);
6101
0
                if (iter->plp_destruct)
6102
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6103
0
                *pptr = p->next; mp_free(iter->mp, p);
6104
0
            }
6105
0
            else {
6106
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6107
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6108
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6109
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6110
0
                    }
6111
0
                    iter->plp[n_plp].b = &p->b;
6112
0
                    iter->plp[n_plp].cd = p->cd;
6113
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6114
0
                }
6115
0
                pptr = &(*pptr)->next;
6116
0
            }
6117
0
        }
6118
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6119
        // update iter->tid and iter->pos
6120
0
        if (iter->head != iter->tail) {
6121
0
            if (iter->tid > iter->head->b.core.tid) {
6122
0
                hts_log_error("Unsorted input. Pileup aborts");
6123
0
                iter->error = 1;
6124
0
                *_n_plp = -1;
6125
0
                return NULL;
6126
0
            }
6127
0
        }
6128
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6129
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6130
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6131
0
            iter->pos = iter->head->beg; // jump to the next position
6132
0
        } else ++iter->pos; // scan contiguously
6133
        // return
6134
0
        if (n_plp) return iter->plp;
6135
0
        if (iter->is_eof && iter->head == iter->tail) break;
6136
0
    }
6137
0
    return NULL;
6138
0
}
6139
6140
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6141
0
{
6142
0
    hts_pos_t pos64 = 0;
6143
0
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);
6144
0
    if (pos64 < INT_MAX) {
6145
0
        *_pos = pos64;
6146
0
    } else {
6147
0
        hts_log_error("Position %"PRId64" too large", pos64);
6148
0
        *_pos = INT_MAX;
6149
0
        iter->error = 1;
6150
0
        *_n_plp = -1;
6151
0
        return NULL;
6152
0
    }
6153
0
    return p;
6154
0
}
6155
6156
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
6157
0
{
6158
0
    if (iter->error) return -1;
6159
0
    if (b) {
6160
0
        if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
6161
        // Skip only unmapped reads here, any additional filtering must be done in iter->func
6162
0
        if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
6163
0
        if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
6164
0
        {
6165
0
            overlap_remove(iter, b);
6166
0
            return 0;
6167
0
        }
6168
0
        if (bam_copy1(&iter->tail->b, b) == NULL)
6169
0
            return -1;
6170
0
        iter->tail->b.id = iter->id++;
6171
0
        iter->tail->beg = b->core.pos;
6172
        // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
6173
0
        iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
6174
0
        iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
6175
0
        if (b->core.tid < iter->max_tid) {
6176
0
            hts_log_error("The input is not sorted (chromosomes out of order)");
6177
0
            iter->error = 1;
6178
0
            return -1;
6179
0
        }
6180
0
        if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
6181
0
            hts_log_error("The input is not sorted (reads out of order)");
6182
0
            iter->error = 1;
6183
0
            return -1;
6184
0
        }
6185
0
        iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
6186
0
        if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
6187
0
            lbnode_t *next = mp_alloc(iter->mp);
6188
0
            if (!next) {
6189
0
                iter->error = 1;
6190
0
                return -1;
6191
0
            }
6192
0
            if (iter->plp_construct) {
6193
0
                if (iter->plp_construct(iter->data, &iter->tail->b,
6194
0
                                        &iter->tail->cd) < 0) {
6195
0
                    mp_free(iter->mp, next);
6196
0
                    iter->error = 1;
6197
0
                    return -1;
6198
0
                }
6199
0
            }
6200
0
            if (overlap_push(iter, iter->tail) < 0) {
6201
0
                mp_free(iter->mp, next);
6202
0
                iter->error = 1;
6203
0
                return -1;
6204
0
            }
6205
0
            iter->tail->next = next;
6206
0
            iter->tail = iter->tail->next;
6207
0
        }
6208
0
    } else iter->is_eof = 1;
6209
0
    return 0;
6210
0
}
6211
6212
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6213
0
{
6214
0
    const bam_pileup1_t *plp;
6215
0
    if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; }
6216
0
    if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6217
0
    else { // no pileup line can be obtained; read alignments
6218
0
        *_n_plp = 0;
6219
0
        if (iter->is_eof) return 0;
6220
0
        int ret;
6221
0
        while ( (ret=iter->func(iter->data, iter->b)) >= 0) {
6222
0
            if (bam_plp_push(iter, iter->b) < 0) {
6223
0
                *_n_plp = -1;
6224
0
                return 0;
6225
0
            }
6226
0
            if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6227
            // otherwise no pileup line can be returned; read the next alignment.
6228
0
        }
6229
0
        if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; }
6230
0
        if (bam_plp_push(iter, 0) < 0) {
6231
0
            *_n_plp = -1;
6232
0
            return 0;
6233
0
        }
6234
0
        if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp;
6235
0
        return 0;
6236
0
    }
6237
0
}
6238
6239
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
6240
0
{
6241
0
    hts_pos_t pos64 = 0;
6242
0
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);
6243
0
    if (pos64 < INT_MAX) {
6244
0
        *_pos = pos64;
6245
0
    } else {
6246
0
        hts_log_error("Position %"PRId64" too large", pos64);
6247
0
        *_pos = INT_MAX;
6248
0
        iter->error = 1;
6249
0
        *_n_plp = -1;
6250
0
        return NULL;
6251
0
    }
6252
0
    return p;
6253
0
}
6254
6255
void bam_plp_reset(bam_plp_t iter)
6256
0
{
6257
0
    overlap_remove(iter, NULL);
6258
0
    iter->max_tid = iter->max_pos = -1;
6259
0
    iter->tid = iter->pos = 0;
6260
0
    iter->is_eof = 0;
6261
0
    while (iter->head != iter->tail) {
6262
0
        lbnode_t *p = iter->head;
6263
0
        iter->head = p->next;
6264
0
        mp_free(iter->mp, p);
6265
0
    }
6266
0
}
6267
6268
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
6269
0
{
6270
0
    iter->maxcnt = maxcnt;
6271
0
}
6272
6273
/************************
6274
 *** Mpileup iterator ***
6275
 ************************/
6276
6277
struct bam_mplp_s {
6278
    int n;
6279
    int32_t min_tid, *tid;
6280
    hts_pos_t min_pos, *pos;
6281
    bam_plp_t *iter;
6282
    int *n_plp;
6283
    const bam_pileup1_t **plp;
6284
};
6285
6286
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
6287
0
{
6288
0
    int i;
6289
0
    bam_mplp_t iter;
6290
0
    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
6291
0
    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
6292
0
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
6293
0
    iter->n_plp = (int*)calloc(n, sizeof(int));
6294
0
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
6295
0
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
6296
0
    iter->n = n;
6297
0
    iter->min_pos = HTS_POS_MAX;
6298
0
    iter->min_tid = (uint32_t)-1;
6299
0
    for (i = 0; i < n; ++i) {
6300
0
        iter->iter[i] = bam_plp_init(func, data[i]);
6301
0
        iter->pos[i] = iter->min_pos;
6302
0
        iter->tid[i] = iter->min_tid;
6303
0
    }
6304
0
    return iter;
6305
0
}
6306
6307
int bam_mplp_init_overlaps(bam_mplp_t iter)
6308
0
{
6309
0
    int i, r = 0;
6310
0
    for (i = 0; i < iter->n; ++i)
6311
0
        r |= bam_plp_init_overlaps(iter->iter[i]);
6312
0
    return r == 0 ? 0 : -1;
6313
0
}
6314
6315
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
6316
0
{
6317
0
    int i;
6318
0
    for (i = 0; i < iter->n; ++i)
6319
0
        iter->iter[i]->maxcnt = maxcnt;
6320
0
}
6321
6322
void bam_mplp_destroy(bam_mplp_t iter)
6323
0
{
6324
0
    int i;
6325
0
    for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]);
6326
0
    free(iter->iter); free(iter->pos); free(iter->tid);
6327
0
    free(iter->n_plp); free(iter->plp);
6328
0
    free(iter);
6329
0
}
6330
6331
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
6332
0
{
6333
0
    int i, ret = 0;
6334
0
    hts_pos_t new_min_pos = HTS_POS_MAX;
6335
0
    uint32_t new_min_tid = (uint32_t)-1;
6336
0
    for (i = 0; i < iter->n; ++i) {
6337
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6338
0
            int tid;
6339
0
            hts_pos_t pos;
6340
0
            iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]);
6341
0
            if ( iter->iter[i]->error ) return -1;
6342
0
            if (iter->plp[i]) {
6343
0
                iter->tid[i] = tid;
6344
0
                iter->pos[i] = pos;
6345
0
            } else {
6346
0
                iter->tid[i] = 0;
6347
0
                iter->pos[i] = 0;
6348
0
            }
6349
0
        }
6350
0
        if (iter->plp[i]) {
6351
0
            if (iter->tid[i] < new_min_tid) {
6352
0
                new_min_tid = iter->tid[i];
6353
0
                new_min_pos = iter->pos[i];
6354
0
            } else if (iter->tid[i] == new_min_tid && iter->pos[i] < new_min_pos) {
6355
0
                new_min_pos = iter->pos[i];
6356
0
            }
6357
0
        }
6358
0
    }
6359
0
    iter->min_pos = new_min_pos;
6360
0
    iter->min_tid = new_min_tid;
6361
0
    if (new_min_pos == HTS_POS_MAX) return 0;
6362
0
    *_tid = new_min_tid; *_pos = new_min_pos;
6363
0
    for (i = 0; i < iter->n; ++i) {
6364
0
        if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) {
6365
0
            n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i];
6366
0
            ++ret;
6367
0
        } else n_plp[i] = 0, plp[i] = 0;
6368
0
    }
6369
0
    return ret;
6370
0
}
6371
6372
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
6373
0
{
6374
0
    hts_pos_t pos64 = 0;
6375
0
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);
6376
0
    if (ret >= 0) {
6377
0
        if (pos64 < INT_MAX) {
6378
0
            *_pos = pos64;
6379
0
        } else {
6380
0
            hts_log_error("Position %"PRId64" too large", pos64);
6381
0
            *_pos = INT_MAX;
6382
0
            return -1;
6383
0
        }
6384
0
    }
6385
0
    return ret;
6386
0
}
6387
6388
void bam_mplp_reset(bam_mplp_t iter)
6389
0
{
6390
0
    int i;
6391
0
    iter->min_pos = HTS_POS_MAX;
6392
0
    iter->min_tid = (uint32_t)-1;
6393
0
    for (i = 0; i < iter->n; ++i) {
6394
0
        bam_plp_reset(iter->iter[i]);
6395
0
        iter->pos[i] = HTS_POS_MAX;
6396
0
        iter->tid[i] = (uint32_t)-1;
6397
0
        iter->n_plp[i] = 0;
6398
0
        iter->plp[i] = NULL;
6399
0
    }
6400
0
}
6401
6402
void bam_mplp_constructor(bam_mplp_t iter,
6403
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6404
0
    int i;
6405
0
    for (i = 0; i < iter->n; ++i)
6406
0
        bam_plp_constructor(iter->iter[i], func);
6407
0
}
6408
6409
void bam_mplp_destructor(bam_mplp_t iter,
6410
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6411
0
    int i;
6412
0
    for (i = 0; i < iter->n; ++i)
6413
0
        bam_plp_destructor(iter->iter[i], func);
6414
0
}
6415
6416
#endif // ~!defined(BAM_NO_PILEUP)