Coverage Report

Created: 2025-08-05 06:55

/src/htslib/sam.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2025 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
40
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
41
#include "fuzz_settings.h"
42
#endif
43
44
// Suppress deprecation message for cigar_tab, which we initialise
45
#include "htslib/hts_defs.h"
46
#undef HTS_DEPRECATED
47
#define HTS_DEPRECATED(message)
48
49
#include "htslib/sam.h"
50
#include "htslib/bgzf.h"
51
#include "cram/cram.h"
52
#include "hts_internal.h"
53
#include "sam_internal.h"
54
#include "htslib/hfile.h"
55
#include "htslib/hts_endian.h"
56
#include "htslib/hts_expr.h"
57
#include "header.h"
58
59
#include "htslib/khash.h"
60
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
61
KHASH_SET_INIT_INT(tag)
62
63
#ifndef EFTYPE
64
0
#define EFTYPE ENOEXEC
65
#endif
66
#ifndef EOVERFLOW
67
#define EOVERFLOW ERANGE
68
#endif
69
70
/**********************
71
 *** BAM header I/O ***
72
 **********************/
73
74
/*
 * Lookup table indexed by an unsigned byte value (0..255).
 * Every entry is -1 except those at the positions of the characters
 * '=', 'B', 'D', 'H', 'I', 'M', 'N', 'P', 'S' and 'X', which hold the
 * corresponding BAM_C* operator constant.  sam_hdr_init() stores a pointer
 * to this table in sam_hdr_t::cigar_tab.
 */
HTSLIB_EXPORT
75
const int8_t bam_cigar_table[256] = {
76
    // 0 .. 47
77
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
79
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
80
81
    // 48 .. 63  (including =)
82
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
83
84
    // 64 .. 79  (including MIDNHB)
85
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
86
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
87
88
    // 80 .. 95  (including SPX)
89
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
90
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
91
92
    // 96 .. 127
93
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
96
    // 128 .. 255
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
101
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
102
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
103
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
104
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
105
};
106
107
sam_hdr_t *sam_hdr_init(void)
108
6.94k
{
109
6.94k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
110
6.94k
    if (bh == NULL) return NULL;
111
112
6.94k
    bh->cigar_tab = bam_cigar_table;
113
6.94k
    return bh;
114
6.94k
}
115
116
/*
 * Release a header.
 *
 * A NULL header is ignored.  If the header's ref_count is above zero the
 * count is decremented and nothing is freed; otherwise the target arrays,
 * text, parsed records (hrecs), the sdict hash and the structure itself
 * are released.
 */
void sam_hdr_destroy(sam_hdr_t *bh)
{
    int32_t i;

    if (bh == NULL) return;

    if (bh->ref_count > 0) {
        --bh->ref_count;
        return;
    }

    if (bh->target_name) {
        for (i = 0; i < bh->n_targets; ++i)
            free(bh->target_name[i]);
        free(bh->target_name);
    }
    // Released independently of target_name: a partially built header may
    // own target_len without target_name (e.g. when the second of the two
    // allocations failed), and it must not be leaked in that case.
    free(bh->target_len);
    free(bh->text);
    if (bh->hrecs)
        sam_hrecs_free(bh->hrecs);
    if (bh->sdict)
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
    free(bh);
}
140
141
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
142
// references before sam_hdr_t::hrecs is populated
143
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
144
37
{
145
37
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
146
37
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
147
37
    int i;
148
37
    if (!dest_long_refs) return -1;
149
150
1.32k
    for (i = 0; i < h->n_targets; i++) {
151
1.28k
        int ret;
152
1.28k
        khiter_t ksrc, kdest;
153
1.28k
        if (h->target_len[i] < UINT32_MAX) continue;
154
411
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
155
411
        if (ksrc == kh_end(src_long_refs)) continue;
156
411
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
157
411
        if (ret < 0) {
158
0
            kh_destroy(s2i, dest_long_refs);
159
0
            return -1;
160
0
        }
161
411
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
162
411
    }
163
164
37
    h->sdict = dest_long_refs;
165
37
    return 0;
166
37
}
167
168
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
169
2.18k
{
170
2.18k
    if (h0 == NULL) return NULL;
171
2.18k
    sam_hdr_t *h;
172
2.18k
    if ((h = sam_hdr_init()) == NULL) return NULL;
173
    // copy the simple data
174
2.18k
    h->n_targets = 0;
175
2.18k
    h->ignore_sam_err = h0->ignore_sam_err;
176
2.18k
    h->l_text = 0;
177
178
    // Then the pointery stuff
179
180
2.18k
    if (!h0->hrecs) {
181
86
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
182
86
        if (!h->target_len) goto fail;
183
86
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
184
86
        if (!h->target_name) goto fail;
185
186
86
        int i;
187
1.44k
        for (i = 0; i < h0->n_targets; ++i) {
188
1.36k
            h->target_len[i] = h0->target_len[i];
189
1.36k
            h->target_name[i] = strdup(h0->target_name[i]);
190
1.36k
            if (!h->target_name[i]) break;
191
1.36k
        }
192
86
        h->n_targets = i;
193
86
        if (i < h0->n_targets) goto fail;
194
195
86
        if (h0->sdict) {
196
37
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
197
37
        }
198
86
    }
199
200
2.18k
    if (h0->hrecs) {
201
2.09k
        kstring_t tmp = { 0, 0, NULL };
202
2.09k
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
203
0
            free(ks_release(&tmp));
204
0
            goto fail;
205
0
        }
206
207
2.09k
        h->l_text = tmp.l;
208
2.09k
        h->text   = ks_release(&tmp);
209
210
2.09k
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
211
0
            goto fail;
212
2.09k
    } else {
213
86
        h->l_text = h0->text ? h0->l_text : 0;
214
86
        h->text = malloc(h->l_text + 1);
215
86
        if (!h->text) goto fail;
216
86
        if (h0->text)
217
86
            memcpy(h->text, h0->text, h->l_text);
218
86
        h->text[h->l_text] = '\0';
219
86
    }
220
221
2.18k
    return h;
222
223
0
 fail:
224
0
    sam_hdr_destroy(h);
225
0
    return NULL;
226
2.18k
}
227
228
/*
 * Read a binary BAM header from fp.
 *
 * Reads the "BAM\1" magic, the header text, the number of targets and then
 * each target's name and length.  A target name lacking its trailing NUL is
 * extended by one byte and terminated.
 *
 * Returns a newly allocated header, or NULL on error (an error message is
 * logged).  A missing BGZF EOF marker only produces a warning.
 */
sam_hdr_t *bam_hdr_read(BGZF *fp)
{
    sam_hdr_t *hdr = NULL;
    uint8_t word[4];
    int eof_state;
    int32_t idx, l_name, names_allocated = 0;
    size_t text_alloc;
    ssize_t got;

    // Check for the EOF marker; its absence is not fatal
    eof_state = bgzf_check_EOF(fp);
    if (eof_state < 0) {
        perror("[W::bam_hdr_read] bgzf_check_EOF");
    } else if (eof_state == 0) {
        hts_log_warning("EOF marker is absent. The input is probably truncated");
    }

    // Magic number "BAM1"
    got = bgzf_read(fp, word, 4);
    if (got != 4 || memcmp(word, "BAM\1", 4) != 0) {
        hts_log_error("Invalid BAM binary header");
        return NULL;
    }

    hdr = sam_hdr_init();
    if (hdr == NULL) goto nomem;

    // Length of the plain text, followed by the text itself
    got = bgzf_read(fp, word, 4);
    if (got != 4) goto read_err;
    hdr->l_text = le_to_u32(word);

    text_alloc = hdr->l_text + 1;
    if (text_alloc < hdr->l_text) goto nomem; // adding 1 wrapped around
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (text_alloc > FUZZ_ALLOC_LIMIT) goto nomem;
#endif
    hdr->text = (char*)malloc(text_alloc);
    if (hdr->text == NULL) goto nomem;
    hdr->text[hdr->l_text] = 0; // terminate before filling in the content
    got = bgzf_read(fp, hdr->text, hdr->l_text);
    if (got != hdr->l_text) goto read_err;

    // Number of reference sequences
    got = bgzf_read(fp, &hdr->n_targets, 4);
    if (got != 4) goto read_err;
    if (fp->is_be) ed_swap_4p(&hdr->n_targets);

    if (hdr->n_targets < 0) goto invalid;

    // Storage for reference sequence names and lengths
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (hdr->n_targets > (FUZZ_ALLOC_LIMIT - text_alloc)/(sizeof(char*)+sizeof(uint32_t)))
        goto nomem;
#endif
    if (hdr->n_targets > 0) {
        hdr->target_name = (char**)calloc(hdr->n_targets, sizeof(char*));
        if (hdr->target_name == NULL) goto nomem;
        hdr->target_len = (uint32_t*)calloc(hdr->n_targets, sizeof(uint32_t));
        if (hdr->target_len == NULL) goto nomem;
    }
    else {
        hdr->target_name = NULL;
        hdr->target_len = NULL;
    }

    for (idx = 0; idx != hdr->n_targets; ++idx) {
        // Name length (includes the terminating NUL in a well-formed file)
        got = bgzf_read(fp, &l_name, 4);
        if (got != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&l_name);
        if (l_name <= 0) goto invalid;

        hdr->target_name[idx] = (char*)malloc(l_name);
        if (hdr->target_name[idx] == NULL) goto nomem;
        names_allocated++;

        got = bgzf_read(fp, hdr->target_name[idx], l_name);
        if (got != l_name) goto read_err;

        if (hdr->target_name[idx][l_name - 1] != '\0') {
            /* The name was stored without a terminator: grow the buffer by
               one byte and add it rather than rejecting the file. */
            char *longer;
            if (l_name == INT32_MAX) goto invalid;
            longer = realloc(hdr->target_name[idx], l_name + 1);
            if (longer == NULL) goto nomem;
            longer[l_name] = '\0';
            hdr->target_name[idx] = longer;
        }

        // Reference length
        got = bgzf_read(fp, &hdr->target_len[idx], 4);
        if (got != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&hdr->target_len[idx]);
    }
    return hdr;

 nomem:
    hts_log_error("Out of memory");
    goto clean;

 read_err:
    if (got < 0) {
        hts_log_error("Error reading BGZF stream");
    } else {
        hts_log_error("Truncated BAM header");
    }
    goto clean;

 invalid:
    hts_log_error("Invalid BAM binary header");

 clean:
    if (hdr != NULL) {
        // Only the names allocated so far may be freed
        hdr->n_targets = names_allocated;
        sam_hdr_destroy(hdr);
    }
    return NULL;
}
342
343
/*
 * Write header h to fp in binary BAM format.
 *
 * The header text is rebuilt from h->hrecs when present, otherwise h->text
 * is written as-is.  Text longer than UINT32_MAX is rejected; text longer
 * than INT32_MAX is written with a portability warning.
 *
 * Returns 0 on success, -1 on failure (including h == NULL).
 */
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
{
    int32_t i, name_len, x;
    kstring_t hdr_ks = { 0, 0, NULL }; // owns the rebuilt text, if any
    char *text;
    uint32_t l_text;

    if (!h) return -1;

    if (h->hrecs) {
        // On failure hdr_ks may already hold a partially built buffer,
        // so it has to go through the common cleanup path.
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) goto fail;
        if (hdr_ks.l > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            goto fail;
        } else if (hdr_ks.l > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = hdr_ks.s;
        l_text = hdr_ks.l;
    } else {
        if (h->l_text > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            return -1;
        } else if (h->l_text > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = h->text;
        l_text = h->l_text;
    }
    // write "BAM1"
    if (bgzf_write(fp, "BAM\1", 4) < 0) goto fail;
    // write plain text and the number of reference sequences
    if (fp->is_be) {
        x = ed_swap_4(l_text);
        if (bgzf_write(fp, &x, 4) < 0) goto fail;
        if (l_text) {
            if (bgzf_write(fp, text, l_text) < 0) goto fail;
        }
        x = ed_swap_4(h->n_targets);
        if (bgzf_write(fp, &x, 4) < 0) goto fail;
    } else {
        if (bgzf_write(fp, &l_text, 4) < 0) goto fail;
        if (l_text) {
            if (bgzf_write(fp, text, l_text) < 0) goto fail;
        }
        if (bgzf_write(fp, &h->n_targets, 4) < 0) goto fail;
    }
    // The text is no longer needed; clear the pointer so that the shared
    // failure path below cannot free it twice.
    free(hdr_ks.s);
    hdr_ks.s = NULL;
    // write sequence names and lengths
    for (i = 0; i != h->n_targets; ++i) {
        char *p = h->target_name[i];
        name_len = strlen(p) + 1; // stored length includes the NUL
        if (fp->is_be) {
            x = ed_swap_4(name_len);
            if (bgzf_write(fp, &x, 4) < 0) goto fail;
        } else {
            if (bgzf_write(fp, &name_len, 4) < 0) goto fail;
        }
        if (bgzf_write(fp, p, name_len) < 0) goto fail;
        if (fp->is_be) {
            x = ed_swap_4(h->target_len[i]);
            if (bgzf_write(fp, &x, 4) < 0) goto fail;
        } else {
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) goto fail;
        }
    }
    if (bgzf_flush(fp) < 0) goto fail;
    return 0;

 fail:
    free(hdr_ks.s);
    return -1;
}
415
416
/*
 * Parse region string s by delegating to hts_parse_region(), with
 * bam_name2id and header h supplied as the name-to-id callback and its
 * data argument.  Returns whatever hts_parse_region() returns.
 */
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
    hts_name2id_f name_to_tid = (hts_name2id_f)bam_name2id;

    return hts_parse_region(s, tid, beg, end, name_to_tid, h, flags);
}
420
421
/*************************
422
 *** BAM alignment I/O ***
423
 *************************/
424
425
bam1_t *bam_init1(void)
426
752k
{
427
752k
    return (bam1_t*)calloc(1, sizeof(bam1_t));
428
752k
}
429
430
/*
 * Give b a data buffer able to hold at least `desired` bytes.
 *
 * The new capacity is `desired` rounded by kroundup32() plus 32 bytes.  If
 * that does not fit in 32 bits, -1 is returned with errno set to ENOMEM.
 * When the BAM_USER_OWNS_DATA policy bit is set, a fresh buffer is
 * malloc()ed, the existing content copied over and the bit cleared;
 * otherwise the current buffer is realloc()ed.
 *
 * Returns 0 on success, -1 on failure (b is left unchanged).
 */
int sam_realloc_bam_data(bam1_t *b, size_t desired)
{
    uint32_t capacity = desired;
    uint8_t *buf;

    kroundup32(capacity); // next power of 2
    capacity += 32; // reduces malloc arena migrations?
    if (capacity < desired) {
        errno = ENOMEM; // Not strictly true but we can't store the size
        return -1;
    }
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
    if (capacity > FUZZ_ALLOC_LIMIT) {
        errno = ENOMEM;
        return -1;
    }
#endif
    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0) {
        // The caller owns the current buffer, so it cannot be realloc()ed;
        // move the content into memory owned by the record instead.
        buf = malloc(capacity);
        if (buf != NULL) {
            if (b->l_data > 0) {
                size_t keep = b->l_data < b->m_data ? b->l_data : b->m_data;
                memcpy(buf, b->data, keep);
            }
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
        }
    } else {
        buf = realloc(b->data, capacity);
    }

    if (buf == NULL)
        return -1;

    b->data = buf;
    b->m_data = capacity;
    return 0;
}
462
463
/*
 * Release an alignment record according to its memory policy.
 *
 * The data buffer is freed unless BAM_USER_OWNS_DATA is set, and the
 * structure is freed unless BAM_USER_OWNS_STRUCT is set.  When the data is
 * freed but the structure is kept, the data fields are reset so the
 * structure can be reused.  NULL is ignored.
 */
void bam_destroy1(bam1_t *b)
{
    if (b == NULL)
        return;

    const int keep_data   = (bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0;
    const int keep_struct = (bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0;

    if (!keep_data) {
        free(b->data);
        if (keep_struct) {
            // In case of reuse
            b->data = NULL;
            b->m_data = 0;
            b->l_data = 0;
        }
    }

    if (!keep_struct)
        free(b);
}
479
480
/*
 * Copy alignment record bsrc into bdst.
 *
 * bdst's data buffer is resized as needed, then the variable-length data,
 * the core fields, l_data and id are copied.
 *
 * Returns bdst on success, or NULL if the buffer could not be resized.
 */
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
{
    // Copying a record onto itself is a no-op; handle it up front because
    // memcpy() must not be given overlapping source and destination.
    if (bdst == bsrc) return bdst;

    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
    // An empty record may have no data buffer at all (bam_init1() leaves
    // data NULL), and passing NULL to memcpy() is undefined even for a
    // zero length.
    if (bsrc->l_data > 0)
        memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
    bdst->l_data = bsrc->l_data;
    bdst->id = bsrc->id;
    return bdst;
}
489
490
/*
 * Create a newly allocated copy of bsrc.
 * Returns NULL if bsrc is NULL or if allocation or copying fails.
 */
bam1_t *bam_dup1(const bam1_t *bsrc)
{
    bam1_t *copy;

    if (!bsrc)
        return NULL;

    copy = bam_init1();
    if (!copy)
        return NULL;

    if (bam_copy1(copy, bsrc) != NULL)
        return copy;

    bam_destroy1(copy);
    return NULL;
}
501
502
/*
 * Sum the operation lengths of a CIGAR in a single pass.
 *
 * For each operation, bam_cigar_type() bit 1 selects whether its length is
 * added to *qlen and bit 2 whether it is added to *rlen.  Both outputs are
 * zero when n_cigar is not positive.
 */
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
                             hts_pos_t *rlen, hts_pos_t *qlen)
{
    hts_pos_t ref_total = 0, query_total = 0;
    int op;

    for (op = 0; op < n_cigar; ++op) {
        int kind = bam_cigar_type(bam_cigar_op(cigar[op]));
        int span = bam_cigar_oplen(cigar[op]);

        if (kind & 1)
            query_total += span;
        if (kind & 2)
            ref_total += span;
    }

    *qlen = query_total;
    *rlen = ref_total;
}
514
515
/*
 * Take `length` away from the remaining budget in *limit.
 *
 * Returns 0 after reducing *limit, or -1 (leaving *limit untouched) when
 * `length` exceeds what is left.
 */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
524
525
/*
 * Fill in an alignment record from its individual fields.
 *
 * The data buffer is sized for the query name (NUL-padded to a multiple of
 * four bytes), CIGAR, 4-bit packed sequence, qualities and l_aux further
 * bytes.  Everything except the aux area is written; l_data is set to the
 * size without the aux area.  A zero l_qname stores the name "*", and a
 * NULL qual fills the qualities with 0xff.
 *
 * Returns the number of data bytes written (excluding aux), or -1 with
 * errno set to EINVAL on invalid parameters, or -1 if the buffer could not
 * be allocated.
 */
int bam_set1(bam1_t *bam,
             size_t l_qname, const char *qname,
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
             size_t n_cigar, const uint32_t *cigar,
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
             size_t l_seq, const char *seq, const char *qual,
             size_t l_aux)
{
    const int is_mapped = !(flag & BAM_FUNMAP);
    hts_pos_t rlen = 0, qlen = 0;
    size_t qname_nuls, padded_qname, cigar_bytes, packed_seq_bytes;
    size_t data_len, limit;
    uint8_t *cp;
    int i;

    // use a default qname "*" if none is provided
    if (l_qname == 0) {
        l_qname = 1;
        qname = "*";
    }

    // The qname is stored NUL terminated and padded: between one and four
    // NULs follow it so that the total is a multiple of four.
    qname_nuls = 4 - l_qname % 4;

    // The alignment length, needed for bam_reg2bin(), is calculated as in
    // bam_endpos(), which cannot be called yet as the record is not set up.
    if (is_mapped)
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
    if (rlen == 0)
        rlen = 1;

    // validate parameters
    if (l_qname > 254) {
        hts_log_error("Query name too long");
        errno = EINVAL;
        return -1;
    }
    if (HTS_POS_MAX - rlen <= pos) {
        hts_log_error("Read ends beyond highest supported position");
        errno = EINVAL;
        return -1;
    }
    if (is_mapped && l_seq > 0) {
        if (n_cigar == 0) {
            hts_log_error("Mapped query must have a CIGAR");
            errno = EINVAL;
            return -1;
        }
        if (l_seq != qlen) {
            hts_log_error("CIGAR and query sequence are of different length");
            errno = EINVAL;
            return -1;
        }
    }

    // Every component must fit within a running budget of INT32_MAX bytes
    padded_qname     = l_qname + qname_nuls;
    cigar_bytes      = n_cigar * 4;
    packed_seq_bytes = (l_seq + 1) / 2;

    limit = INT32_MAX;
    if (subtract_check_underflow(padded_qname, &limit) < 0
        || subtract_check_underflow(cigar_bytes, &limit) < 0
        || subtract_check_underflow(packed_seq_bytes, &limit) < 0
        || subtract_check_underflow(l_seq, &limit) < 0
        || subtract_check_underflow(l_aux, &limit) < 0) {
        hts_log_error("Size overflow");
        errno = EINVAL;
        return -1;
    }

    // re-allocate the data buffer as needed.
    data_len = padded_qname + cigar_bytes + packed_seq_bytes + l_seq;
    if (realloc_bam_data(bam, data_len + l_aux) < 0)
        return -1;

    bam->l_data = (int)data_len;

    bam->core.tid = tid;
    bam->core.pos = pos;
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
    bam->core.qual = mapq;
    bam->core.flag = flag;
    bam->core.l_qname = (uint16_t)padded_qname;
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
    bam->core.n_cigar = (uint32_t)n_cigar;
    bam->core.l_qseq = (int32_t)l_seq;
    bam->core.mtid = mtid;
    bam->core.mpos = mpos;
    bam->core.isize = isize;

    // Query name followed by its NUL padding
    cp = bam->data;
    strncpy((char *)cp, qname, l_qname);
    memset(cp + l_qname, '\0', qname_nuls);
    cp += padded_qname;

    // CIGAR
    if (n_cigar > 0)
        memcpy(cp, cigar, cigar_bytes);
    cp += cigar_bytes;

    // Sequence, two bases per byte: whole blocks of NN bases first, then
    // the remaining pairs, then a final unpaired base in the high nibble.
#define NN 16
    const uint8_t *useq = (uint8_t *)seq;
    for (i = 0; i + NN < l_seq; i += NN) {
        const uint8_t *chunk = useq + i;
        int j;
        for (j = 0; j < NN; j += 2)
            *cp++ = (seq_nt16_table[chunk[j]] << 4) | seq_nt16_table[chunk[j + 1]];
    }
    while (i + 1 < l_seq) {
        *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]];
        i += 2;
    }
    if (i < l_seq) {
        *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4;
        i++;
    }

    // Qualities, or 0xff throughout when none were supplied
    if (qual != NULL)
        memcpy(cp, qual, l_seq);
    else
        memset(cp, 0xff, l_seq);

    return (int)data_len;
}
646
647
/*
 * Total length of the CIGAR operations for which bam_cigar_type() has
 * bit 1 set.  Returns 0 when n_cigar is not positive.
 */
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int op;

    for (op = 0; op < n_cigar; ++op) {
        if (bam_cigar_type(bam_cigar_op(cigar[op])) & 1)
            total += bam_cigar_oplen(cigar[op]);
    }

    return total;
}
656
657
/*
 * Total length of the CIGAR operations for which bam_cigar_type() has
 * bit 2 set.  Returns 0 when n_cigar is not positive.
 */
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int op;

    for (op = 0; op < n_cigar; ++op) {
        if (bam_cigar_type(bam_cigar_op(cigar[op])) & 2)
            total += bam_cigar_oplen(cigar[op]);
    }

    return total;
}
666
667
hts_pos_t bam_endpos(const bam1_t *b)
668
1.05k
{
669
1.05k
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
670
1.05k
    if (rlen == 0) rlen = 1;
671
1.05k
    return b->core.pos + rlen;
672
1.05k
}
673
674
/*
 * Move a real CIGAR held in a CG:B,I aux tag back into the CIGAR field.
 *
 * The record is only touched when its first CIGAR operation is a soft clip
 * spanning l_qseq bases, tid and pos are non-negative, and a CG tag of
 * type B,I (or B,i) is present whose element count is at least the current
 * n_cigar and below 2^29.
 *
 * recal_bin:    if non-zero, recompute core.bin afterwards
 * give_warning: if non-zero, log a warning naming the record
 *
 * Returns 0 if CIGAR is untouched; 1 if CIGAR is updated with CG;
 * -1 on bad aux data or if the record could not be expanded.
 */
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning)
{
    bam1_core_t *c = &b->core;

    // Bail out as fast as possible for the easy case.
    // l_qseq is converted to unsigned before shifting: left-shifting a
    // signed value far enough to overflow is undefined behaviour.
    uint32_t test_CG = BAM_CSOFT_CLIP | ((uint32_t) c->l_qseq << BAM_CIGAR_SHIFT);
    if (c->n_cigar == 0 || test_CG != *bam_get_cigar(b))
        return 0;

    // The above isn't fool proof - we may have old CIGAR tags that aren't used,
    // but this is much less likely so do as a secondary check.
    if (c->tid < 0 || c->pos < 0)
        return 0;

    // Do we have a CG tag?
    uint8_t *CG = bam_aux_get(b, "CG");
    int saved_errno = errno;
    if (!CG) {
        if (errno != ENOENT) return -1;  // Bad aux data
        errno = saved_errno; // restore errno on expected no-CG-tag case
        return 0;
    }

    // Now we start with the serious work migrating CG to CIGAR
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data,
        *cigar0, CG_len, fake_bytes;
    cigar0 = bam_get_cigar(b);
    fake_bytes = c->n_cigar * 4;
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
        return 0; // not of type B,I
    CG_len = le_to_u32(CG + 2);
    // don't move if the real CIGAR length is shorter than the fake cigar length
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0;

    // move from the CG tag to the right position
    cigar_st = (uint8_t*)cigar0 - b->data;
    c->n_cigar = CG_len;
    n_cigar4 = c->n_cigar * 4;
    CG_st = CG - b->data - 2; // CG points just past the two-byte tag name
    CG_en = CG_st + 8 + n_cigar4;
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
    // we need n_cigar4-fake_bytes bytes to swap CIGAR to the right place
    b->l_data = b->l_data - fake_bytes + n_cigar4;
    // insert n_cigar4-fake_bytes empty space to make room
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes));
    // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4);
    if (ori_len > CG_en) // move data after the CG tag
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
    if (recal_bin)
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
    if (give_warning)
        hts_log_warning("%s encodes a CIGAR with %" PRIu32 " operators at the CG tag", bam_get_qname(b), c->n_cigar);
    return 1;
}
730
731
/*
 * Map an aux type code to a size.
 *
 * Returns 1 for 'A', 'c', 'C'; 2 for 's', 'S'; 4 for 'i', 'I', 'f';
 * 8 for 'd'.  For 'Z', 'H' and 'B' the type code itself is returned, and
 * 0 for any other code.
 */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'A' || type == 'c' || type == 'C')
        return 1;
    if (type == 's' || type == 'S')
        return 2;
    if (type == 'i' || type == 'I' || type == 'f')
        return 4;
    if (type == 'd')
        return 8;
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;
    return 0;
}
748
749
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
750
0
{
751
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
752
0
    uint32_t i;
753
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
754
0
}
755
756
// Fix bad records where qname is not terminated correctly.
757
185
static int fixup_missing_qname_nul(bam1_t *b) {
758
185
    bam1_core_t *c = &b->core;
759
760
    // Note this is called before c->l_extranul is added to c->l_qname
761
185
    if (c->l_extranul > 0) {
762
178
        b->data[c->l_qname++] = '\0';
763
178
        c->l_extranul--;
764
178
    } else {
765
7
        if (b->l_data > INT_MAX - 4) return -1;
766
7
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
767
7
        b->l_data += 4;
768
7
        b->data[c->l_qname++] = '\0';
769
7
        c->l_extranul = 3;
770
7
    }
771
185
    return 0;
772
185
}
773
774
/*
775
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
776
 * in multi-threaded handling.  This may be worth considering for htslib2.
777
 */
778
int bam_read1(BGZF *fp, bam1_t *b)
779
321
{
780
321
    bam1_core_t *c = &b->core;
781
321
    int32_t block_len, ret, i;
782
321
    uint32_t new_l_data;
783
321
    uint8_t tmp[32], *x;
784
785
321
    b->l_data = 0;
786
787
321
    if ((ret = bgzf_read_small(fp, &block_len, 4)) != 4) {
788
0
        if (ret == 0) return -1; // normal end-of-file
789
0
        else return -2; // truncated
790
0
    }
791
321
    if (fp->is_be)
792
0
        ed_swap_4p(&block_len);
793
321
    if (block_len < 32) return -4;  // block_len includes core data
794
319
    if (fp->block_length - fp->block_offset > 32) {
795
        // Avoid bgzf_read and a temporary copy to a local buffer
796
317
        x = (uint8_t *)fp->uncompressed_block + fp->block_offset;
797
317
        fp->block_offset += 32;
798
317
    } else {
799
2
        x = tmp;
800
2
        if (bgzf_read(fp, x, 32) != 32) return -3;
801
2
    }
802
803
317
    c->tid        = le_to_u32(x);
804
317
    c->pos        = le_to_i32(x+4);
805
317
    uint32_t x2   = le_to_u32(x+8);
806
317
    c->bin        = x2>>16;
807
317
    c->qual       = x2>>8&0xff;
808
317
    c->l_qname    = x2&0xff;
809
317
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
810
317
    uint32_t x3   = le_to_u32(x+12);
811
317
    c->flag       = x3>>16;
812
317
    c->n_cigar    = x3&0xffff;
813
317
    c->l_qseq     = le_to_u32(x+16);
814
317
    c->mtid       = le_to_u32(x+20);
815
317
    c->mpos       = le_to_i32(x+24);
816
317
    c->isize      = le_to_i32(x+28);
817
818
317
    new_l_data = block_len - 32 + c->l_extranul;
819
317
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
820
315
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
821
315
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
822
7
        return -4;
823
308
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
824
305
    b->l_data = new_l_data;
825
826
305
    if (bgzf_read_small(fp, b->data, c->l_qname) != c->l_qname) return -4;
827
301
    if (b->data[c->l_qname - 1] != '\0') { // try to fix missing nul termination
828
185
        if (fixup_missing_qname_nul(b) < 0) return -4;
829
185
    }
830
487
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
831
301
    c->l_qname += c->l_extranul;
832
301
    if (b->l_data < c->l_qname ||
833
301
        bgzf_read_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
834
2
        return -4;
835
299
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
836
299
    if (bam_tag2cigar(b, 0, 0) < 0)
837
0
        return -4;
838
839
    // TODO: consider making this conditional
840
299
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
841
296
        hts_pos_t rlen, qlen;
842
296
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
843
296
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
844
296
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
845
        // Sanity check for broken CIGAR alignments
846
296
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
847
0
            hts_log_error("CIGAR and query sequence lengths differ for %s",
848
0
                    bam_get_qname(b));
849
0
            return -4;
850
0
        }
851
296
    }
852
853
299
    return 4 + block_len;
854
299
}
855
856
/*
 * Write one alignment record to a BAM stream.
 *
 * The record is emitted as a 4-byte block length, eight 32-bit core fields,
 * the query name (without the in-memory padding NULs counted by
 * l_extranul), and then the remaining record data.
 *
 * Records with more than 0xffff CIGAR operations cannot store the count in
 * the 16-bit core field.  For these a two-operation placeholder CIGAR
 * (<l_qseq>S<ref_length>N) is written in place of the real one and the real
 * CIGAR is appended as a "CG:B,I" tag, which adds 16 bytes to the block.
 *
 * All validation happens before the first byte is written (and before any
 * in-place byte swapping on fp->is_be streams), so a rejected record leaves
 * neither a truncated record in the output nor modified data in b.
 *
 * Returns the number of bytes making up the record (4 + block length) on
 * success, or -1 on failure.  errno is set to EOVERFLOW when the query name
 * is too long.
 */
int bam_write1(BGZF *fp, const bam1_t *b)
{
    const bam1_core_t *c = &b->core;
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
    int i, ok;
    const int long_cigar = (c->n_cigar > 0xffff);
    hts_pos_t cigreflen = 0;

    if (c->l_qname - c->l_extranul > 255) {
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
        errno = EOVERFLOW;
        return -1;
    }
    if (c->pos > INT_MAX ||
        c->mpos > INT_MAX ||
        c->isize < INT_MIN || c->isize > INT_MAX) {
        hts_log_error("Positional data is too large for BAM format");
        return -1;
    }
    if (long_cigar) {
        // Measure the reference span now, while the CIGAR is still in its
        // in-memory byte order and nothing has been written yet.
        cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
        if (cigreflen >= (1<<28)) {
            // Length of reference covered is greater than the biggest
            // CIGAR operation currently allowed.
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
                          bam_get_qname(b), c->n_cigar, cigreflen);
            return -1;
        }
        block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
    }

    x[0] = c->tid;
    x[1] = c->pos;
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
    if (long_cigar) x[3] = (uint32_t)c->flag << 16 | 2;
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
    x[4] = c->l_qseq;
    x[5] = c->mtid;
    x[6] = c->mpos;
    x[7] = c->isize;

    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
    if (fp->is_be) {
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
        y = block_len;
        if (ok) ok = (bgzf_write_small(fp, ed_swap_4p(&y), 4) >= 0);
        // Swapped in place here; undone by the matching call before return.
        swap_data(c, b->l_data, b->data, 1);
    } else {
        if (ok) ok = (bgzf_write_small(fp, &block_len, 4) >= 0);
    }
    if (ok) ok = (bgzf_write_small(fp, x, 32) >= 0);
    if (ok) ok = (bgzf_write_small(fp, b->data, c->l_qname - c->l_extranul) >= 0);

    if (!long_cigar) { // no long CIGAR; write normally
        if (ok) ok = (bgzf_write_small(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
        uint8_t buf[8];
        uint32_t cigar_st, cigar_en, cigar[2];

        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
        cigar_en = cigar_st + c->n_cigar * 4;
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
        u32_to_le(cigar[0], buf);
        u32_to_le(cigar[1], buf + 4);
        if (ok) ok = (bgzf_write_small(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
        if (ok) ok = (bgzf_write_small(fp, "CGBI", 4) >= 0); // write CG:B,I
        u32_to_le(c->n_cigar, buf);
        if (ok) ok = (bgzf_write_small(fp, buf, 4) >= 0); // write the true CIGAR length
        if (ok) ok = (bgzf_write_small(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
    }

    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    return ok? 4 + block_len : -1;
}
923
924
/*
925
 * Write a BAM file and append to the in-memory index simultaneously.
926
 */
927
2.34M
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
    BGZF *bgz = fp->fp.bgzf;

    // No index under construction: this is a plain record write.
    if (fp->idx == NULL)
        return bam_write1(bgz, b);

    // Size of the record body as bam_write1() computes it for records
    // without a long CIGAR; the extra 4 is the block-length word.
    uint32_t rec_len = b->l_data - b->core.l_extranul + 32;
    if (bgzf_flush_try(bgz, 4 + rec_len) < 0)
        return -1;

    if (!bgz->mt)
        hts_idx_amend_last(fp->idx, bgzf_tell(bgz));

    int written = bam_write1(bgz, b);
    if (written < 0)
        return -1;

    int pushed = bgzf_idx_push(bgz, fp->idx, b->core.tid, b->core.pos,
                               bam_endpos(b), bgzf_tell(bgz),
                               !(b->core.flag&BAM_FUNMAP));
    if (pushed >= 0)
        return written;

    hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
            bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
    return -1;
}
951
952
/*
953
 * Set the qname in a BAM record
954
 */
955
int bam_set_qname(bam1_t *rec, const char *qname)
{
    // Reject a missing record and a missing or empty name.
    if (rec == NULL) return -1;
    if (qname == NULL || *qname == '\0') return -1;

    size_t prev_len = rec->core.l_qname;
    size_t name_len = strlen(qname) + 1;        // includes the terminating NUL
    if (name_len < 1 || name_len > 255) return -1;

    // Number of extra NULs needed to round the name up to a multiple of 4.
    int pad = (name_len%4 != 0) ? (4 - name_len%4) : 0;

    size_t total_len = rec->l_data - prev_len + name_len + pad;
    if (realloc_bam_data(rec, total_len) < 0) return -1;

    // Shift everything after the old name so it follows the new, padded name.
    if (name_len + pad != rec->core.l_qname) {
        memmove(rec->data + name_len + pad,
                rec->data + rec->core.l_qname,
                rec->l_data - rec->core.l_qname);
    }

    // Store the name, then the padding.
    memcpy(rec->data, qname, name_len);
    int k = 0;
    while (k < pad) {
        rec->data[name_len + k] = '\0';
        k++;
    }

    rec->core.l_extranul = pad;
    rec->core.l_qname = name_len + pad;
    rec->l_data = total_len;

    return 0;
}
983
984
/********************
985
 *** BAM indexing ***
986
 ********************/
987
988
/*
 * Build an in-memory index by reading every record from fp.
 *
 * The header is read from fp first.  With min_shift > 0 a CSI index is
 * built and the number of levels is chosen so the largest reference length
 * (plus 256) fits; otherwise a BAI index (min_shift 14, 5 levels) is built.
 *
 * Returns the finished index, or NULL on failure (header read error,
 * allocation failure, a record that cannot be indexed, a read error, or a
 * failure to finish the index).  The header and the scratch record are
 * released on every path.
 */
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
{
    int n_lvls, i, fmt, ret;
    bam1_t *b = NULL;
    hts_idx_t *idx = NULL;
    sam_hdr_t *h;

    h = sam_hdr_read(fp);
    if (h == NULL) return NULL;

    if (min_shift > 0) {
        hts_pos_t max_len = 0, s;
        for (i = 0; i < h->n_targets; ++i) {
            hts_pos_t len = sam_hdr_tid2len(h, i);
            if (max_len < len) max_len = len;
        }
        max_len += 256;
        // Shift in hts_pos_t so that min_shift values of 31 or more do not
        // overflow an int.
        for (n_lvls = 0, s = (hts_pos_t) 1 << min_shift; max_len > s; ++n_lvls, s <<= 3);
        fmt = HTS_FMT_CSI;
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;

    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
    if (idx == NULL) goto err;
    b = bam_init1();
    if (b == NULL) goto err;

    while ((ret = sam_read1(fp, h, b)) >= 0) {
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
        if (ret < 0) { // unsorted or doesn't fit
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
            goto err;
        }
    }
    if (ret < -1) goto err; // corrupted BAM file

    if (hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)) < 0) goto err;
    sam_hdr_destroy(h);
    bam_destroy1(b);
    return idx;

err:
    sam_hdr_destroy(h);
    if (b) bam_destroy1(b);
    if (idx) hts_idx_destroy(idx);
    return NULL;
}
1027
1028
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
1029
0
{
1030
0
    hts_idx_t *idx;
1031
0
    htsFile *fp;
1032
0
    int ret = 0;
1033
1034
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
1035
0
    if (nthreads)
1036
0
        hts_set_threads(fp, nthreads);
1037
1038
0
    switch (fp->format.format) {
1039
0
    case cram:
1040
1041
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
1042
0
        break;
1043
1044
0
    case bam:
1045
0
    case sam:
1046
0
        if (fp->format.compression != bgzf) {
1047
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
1048
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
1049
0
            ret = -1;
1050
0
            break;
1051
0
        }
1052
0
        idx = sam_index(fp, min_shift);
1053
0
        if (idx) {
1054
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
1055
0
            if (ret < 0) ret = -4;
1056
0
            hts_idx_destroy(idx);
1057
0
        }
1058
0
        else ret = -1;
1059
0
        break;
1060
1061
0
    default:
1062
0
        ret = -3;
1063
0
        break;
1064
0
    }
1065
0
    hts_close(fp);
1066
1067
0
    return ret;
1068
0
}
1069
1070
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    // Same as sam_index_build3() with a thread count of 0.
    const int no_threads = 0;
    return sam_index_build3(fn, fnidx, min_shift, no_threads);
}
1074
1075
int sam_index_build(const char *fn, int min_shift)
{
    // No explicit index file name and a thread count of 0.
    const char *default_fnidx = NULL;
    return sam_index_build3(fn, default_fnidx, min_shift, 0);
}
1079
1080
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1081
#undef bam_index_build
1082
int bam_index_build(const char *fn, int min_shift)
{
    // Forwards to sam_index_build2() with no explicit index file name.
    const char *default_fnidx = NULL;
    return sam_index_build2(fn, default_fnidx, min_shift);
}
1086
1087
// Initialise fp->idx for the current format type.
1088
// This must be called after the header has been written but no other data.
1089
0
/*
 * Prepare fp for writing an index alongside the data.
 *
 * fnidx is stored in fp->fnidx.  For BAM, BCF and BGZF-compressed SAM an
 * in-memory index is created in fp->idx: CSI when min_shift > 0, with the
 * number of levels sized from the longest reference (plus 256), otherwise
 * BAI (min_shift 14, 5 levels).  For CRAM the index file fnidx is opened
 * for writing instead.
 *
 * Reference lengths are obtained through sam_hdr_tid2len(), matching
 * sam_index(), rather than by reading h->target_len[] directly.
 * NOTE(review): this assumes sam_hdr_tid2len() reports at least the value
 * held in target_len[] for every valid tid - confirm against the header API.
 *
 * Returns 0 on success, -1 on failure or for any other format.
 */
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
    fp->fnidx = fnidx;
    if (fp->format.format == bam || fp->format.format == bcf ||
        (fp->format.format == sam && fp->format.compression == bgzf)) {
        int n_lvls, fmt = HTS_FMT_CSI;
        if (min_shift > 0) {
            hts_pos_t max_len = 0, s;
            int i;
            for (i = 0; i < h->n_targets; ++i) {
                hts_pos_t len = sam_hdr_tid2len(h, i);
                if (max_len < len) max_len = len;
            }
            max_len += 256;
            // Shift in hts_pos_t so that min_shift values of 31 or more do
            // not overflow an int.
            for (n_lvls = 0, s = (hts_pos_t) 1 << min_shift; max_len > s; ++n_lvls, s <<= 3);

        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;

        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
        return fp->idx ? 0 : -1;
    }

    if (fp->format.format == cram) {
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
        return fp->fp.cram->idxfp ? 0 : -1;
    }

    return -1;
}
1115
1116
// Finishes an index. Call after the last record has been written.
1117
// Returns 0 on success, <0 on failure.
1118
0
int sam_idx_save(htsFile *fp) {
    int has_mem_index = (fp->format.format == bam || fp->format.format == bcf ||
                         fp->format.format == vcf || fp->format.format == sam);

    // Nothing to do here for other formats; for CRAM the index is
    // flushed and closed by cram_close.
    if (!has_mem_index)
        return 0;

    int rc = sam_state_destroy(fp);
    if (rc < 0) {
        // sam_state_destroy() reports failure as a negated errno value.
        errno = -rc;
        return -1;
    }

    if (!fp->is_bgzf)
        return -1;
    if (bgzf_flush(fp->fp.bgzf) < 0)
        return -1;

    hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));

    if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
        return -1;

    return hts_idx_save_but_not_close(fp->idx, fp->fnidx, hts_idx_fmt(fp->idx));
}
1141
1142
/*
 * Record-reading callback: reads the next record from the htsFile passed as
 * fpv into the bam1_t passed as bv and, on success, reports its reference
 * id, start and end through tid/beg/end.  The BGZF argument is unused.
 * Returns whatever sam_read1() returned.
 */
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = (htsFile *)fpv;
    bam1_t *rec = bv;

    file->line.l = 0;
    int status = sam_read1(file, file->bam_header, rec);
    if (status < 0)
        return status;

    *tid = rec->core.tid;
    *beg = rec->core.pos;
    *end = bam_endpos(rec);
    return status;
}
1155
1156
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1157
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    // tid/beg/end are deliberately left untouched; only the record is read.
    htsFile *file = (htsFile *)fpv;
    bam1_t *rec = bv;

    file->line.l = 0;
    return sam_read1(file, file->bam_header, rec);
}
1165
1166
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1167
// samtools/bam.c.
1168
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1169
0
{
1170
0
    const char *rg;
1171
0
    kstring_t lib = { 0, 0, NULL };
1172
0
    rg = (char *)bam_aux_get(b, "RG");
1173
1174
0
    if (!rg)
1175
0
        return NULL;
1176
0
    else
1177
0
        rg++;
1178
1179
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1180
0
        return NULL;
1181
1182
0
    static char LB_text[1024];
1183
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1184
1185
0
    memcpy(LB_text, lib.s, len);
1186
0
    LB_text[len] = 0;
1187
1188
0
    free(lib.s);
1189
1190
0
    return LB_text;
1191
0
}
1192
1193
1194
// Bam record pointer and SAM header combined
1195
typedef struct {
1196
    const sam_hdr_t *h;
1197
    const bam1_t *b;
1198
} hb_pair;
1199
1200
// Looks up variable names in str and replaces them with their value.
1201
// Also supports aux tags.
1202
//
1203
// Note the expression parser deliberately overallocates str size so it
1204
// is safe to use memcmp over strcmp.
1205
static int bam_sym_lookup(void *data, char *str, char **end,
1206
0
                          hts_expr_val_t *res) {
1207
0
    hb_pair *hb = (hb_pair *)data;
1208
0
    const bam1_t *b = hb->b;
1209
1210
0
    res->is_str = 0;
1211
0
    switch(*str) {
1212
0
    case 'c':
1213
0
        if (memcmp(str, "cigar", 5) == 0) {
1214
0
            *end = str+5;
1215
0
            res->is_str = 1;
1216
0
            ks_clear(&res->s);
1217
0
            uint32_t *cigar = bam_get_cigar(b);
1218
0
            int i, n = b->core.n_cigar, r = 0;
1219
0
            if (n) {
1220
0
                for (i = 0; i < n; i++) {
1221
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1222
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1223
0
                }
1224
0
                r |= kputs("", &res->s) < 0;
1225
0
            } else {
1226
0
                r |= kputs("*", &res->s) < 0;
1227
0
            }
1228
0
            return r ? -1 : 0;
1229
0
        }
1230
0
        break;
1231
1232
0
    case 'e':
1233
0
        if (memcmp(str, "endpos", 6) == 0) {
1234
0
            *end = str+6;
1235
0
            res->d = bam_endpos(b);
1236
0
            return 0;
1237
0
        }
1238
0
        break;
1239
1240
0
    case 'f':
1241
0
        if (memcmp(str, "flag", 4) == 0) {
1242
0
            str = *end = str+4;
1243
0
            if (*str != '.') {
1244
0
                res->d = b->core.flag;
1245
0
                return 0;
1246
0
            } else {
1247
0
                str++;
1248
0
                if (!memcmp(str, "paired", 6)) {
1249
0
                    *end = str+6;
1250
0
                    res->d = b->core.flag & BAM_FPAIRED;
1251
0
                    return 0;
1252
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1253
0
                    *end = str+11;
1254
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1255
0
                    return 0;
1256
0
                } else if (!memcmp(str, "unmap", 5)) {
1257
0
                    *end = str+5;
1258
0
                    res->d = b->core.flag & BAM_FUNMAP;
1259
0
                    return 0;
1260
0
                } else if (!memcmp(str, "munmap", 6)) {
1261
0
                    *end = str+6;
1262
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1263
0
                    return 0;
1264
0
                } else if (!memcmp(str, "reverse", 7)) {
1265
0
                    *end = str+7;
1266
0
                    res->d = b->core.flag & BAM_FREVERSE;
1267
0
                    return 0;
1268
0
                } else if (!memcmp(str, "mreverse", 8)) {
1269
0
                    *end = str+8;
1270
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1271
0
                    return 0;
1272
0
                } else if (!memcmp(str, "read1", 5)) {
1273
0
                    *end = str+5;
1274
0
                    res->d = b->core.flag & BAM_FREAD1;
1275
0
                    return 0;
1276
0
                } else if (!memcmp(str, "read2", 5)) {
1277
0
                    *end = str+5;
1278
0
                    res->d = b->core.flag & BAM_FREAD2;
1279
0
                    return 0;
1280
0
                } else if (!memcmp(str, "secondary", 9)) {
1281
0
                    *end = str+9;
1282
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1283
0
                    return 0;
1284
0
                } else if (!memcmp(str, "qcfail", 6)) {
1285
0
                    *end = str+6;
1286
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1287
0
                    return 0;
1288
0
                } else if (!memcmp(str, "dup", 3)) {
1289
0
                    *end = str+3;
1290
0
                    res->d = b->core.flag & BAM_FDUP;
1291
0
                    return 0;
1292
0
                } else if (!memcmp(str, "supplementary", 13)) {
1293
0
                    *end = str+13;
1294
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1295
0
                    return 0;
1296
0
                } else {
1297
0
                    hts_log_error("Unrecognised flag string");
1298
0
                    return -1;
1299
0
                }
1300
0
            }
1301
0
        }
1302
0
        break;
1303
1304
0
    case 'h':
1305
0
        if (memcmp(str, "hclen", 5) == 0) {
1306
0
            int hclen = 0;
1307
0
            uint32_t *cigar = bam_get_cigar(b);
1308
0
            uint32_t ncigar = b->core.n_cigar;
1309
1310
            // left
1311
0
            if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP)
1312
0
                hclen = bam_cigar_oplen(cigar[0]);
1313
1314
            // right
1315
0
            if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP)
1316
0
                hclen += bam_cigar_oplen(cigar[ncigar-1]);
1317
1318
0
            *end = str+5;
1319
0
            res->d = hclen;
1320
0
            return 0;
1321
0
        }
1322
0
        break;
1323
1324
0
    case 'l':
1325
0
        if (memcmp(str, "library", 7) == 0) {
1326
0
            *end = str+7;
1327
0
            res->is_str = 1;
1328
0
            const char *lib = bam_get_library(hb->h, b);
1329
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1330
0
            return 0;
1331
0
        }
1332
0
        break;
1333
1334
0
    case 'm':
1335
0
        if (memcmp(str, "mapq", 4) == 0) {
1336
0
            *end = str+4;
1337
0
            res->d = b->core.qual;
1338
0
            return 0;
1339
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1340
0
            *end = str+4;
1341
0
            res->d = b->core.mpos+1;
1342
0
            return 0;
1343
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1344
0
            *end = str+6;
1345
0
            res->is_str = 1;
1346
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1347
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1348
0
            return 0;
1349
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1350
0
            *end = str+6;
1351
0
            res->d = b->core.mtid;
1352
0
            return 0;
1353
0
        }
1354
0
        break;
1355
1356
0
    case 'n':
1357
0
        if (memcmp(str, "ncigar", 6) == 0) {
1358
0
            *end = str+6;
1359
0
            res->d = b->core.n_cigar;
1360
0
            return 0;
1361
0
        }
1362
0
        break;
1363
1364
0
    case 'p':
1365
0
        if (memcmp(str, "pos", 3) == 0) {
1366
0
            *end = str+3;
1367
0
            res->d = b->core.pos+1;
1368
0
            return 0;
1369
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1370
0
            *end = str+5;
1371
0
            res->d = b->core.mpos+1;
1372
0
            return 0;
1373
0
        }
1374
0
        break;
1375
1376
0
    case 'q':
1377
0
        if (memcmp(str, "qlen", 4) == 0) {
1378
0
            *end = str+4;
1379
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1380
0
            return 0;
1381
0
        } else if (memcmp(str, "qname", 5) == 0) {
1382
0
            *end = str+5;
1383
0
            res->is_str = 1;
1384
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1385
0
            return 0;
1386
0
        } else if (memcmp(str, "qual", 4) == 0) {
1387
0
            *end = str+4;
1388
0
            ks_clear(&res->s);
1389
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1390
0
                return -1;
1391
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1392
0
            res->s.l = b->core.l_qseq;
1393
0
            res->is_str = 1;
1394
0
            return 0;
1395
0
        }
1396
0
        break;
1397
1398
0
    case 'r':
1399
0
        if (memcmp(str, "rlen", 4) == 0) {
1400
0
            *end = str+4;
1401
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1402
0
            return 0;
1403
0
        } else if (memcmp(str, "rname", 5) == 0) {
1404
0
            *end = str+5;
1405
0
            res->is_str = 1;
1406
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1407
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1408
0
            return 0;
1409
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1410
0
            *end = str+5;
1411
0
            res->is_str = 1;
1412
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1413
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1414
0
            return 0;
1415
0
        } else if (memcmp(str, "refid", 5) == 0) {
1416
0
            *end = str+5;
1417
0
            res->d = b->core.tid;
1418
0
            return 0;
1419
0
        }
1420
0
        break;
1421
1422
0
    case 's':
1423
0
        if (memcmp(str, "seq", 3) == 0) {
1424
0
            *end = str+3;
1425
0
            ks_clear(&res->s);
1426
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1427
0
                return -1;
1428
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1429
0
            res->s.s[b->core.l_qseq] = 0;
1430
0
            res->s.l = b->core.l_qseq;
1431
0
            res->is_str = 1;
1432
0
            return 0;
1433
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1434
0
            int sclen = 0;
1435
0
            uint32_t *cigar = bam_get_cigar(b);
1436
0
            int ncigar = b->core.n_cigar;
1437
0
            int left = 0;
1438
1439
            // left
1440
0
            if (ncigar > 0
1441
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1442
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1443
0
            else if (ncigar > 1
1444
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1445
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1446
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1447
1448
            // right
1449
0
            if (ncigar-1 > left
1450
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1451
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1452
0
            else if (ncigar-2 > left
1453
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1454
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1455
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1456
1457
0
            *end = str+5;
1458
0
            res->d = sclen;
1459
0
            return 0;
1460
0
        }
1461
0
        break;
1462
1463
0
    case 't':
1464
0
        if (memcmp(str, "tlen", 4) == 0) {
1465
0
            *end = str+4;
1466
0
            res->d = b->core.isize;
1467
0
            return 0;
1468
0
        }
1469
0
        break;
1470
1471
0
    case '[':
1472
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1473
            /* aux tags */
1474
0
            *end = str+4;
1475
1476
0
            uint8_t *aux = bam_aux_get(b, str+1);
1477
0
            if (aux) {
1478
                // we define the truth of a tag to be its presence, even if 0.
1479
0
                res->is_true = 1;
1480
0
                switch (*aux) {
1481
0
                case 'Z':
1482
0
                case 'H':
1483
0
                    res->is_str = 1;
1484
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1485
0
                    break;
1486
1487
0
                case 'A':
1488
0
                    res->is_str = 1;
1489
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1490
0
                    break;
1491
1492
0
                case 'i': case 'I':
1493
0
                case 's': case 'S':
1494
0
                case 'c': case 'C':
1495
0
                    res->is_str = 0;
1496
0
                    res->d = bam_aux2i(aux);
1497
0
                    break;
1498
1499
0
                case 'f':
1500
0
                case 'd':
1501
0
                    res->is_str = 0;
1502
0
                    res->d = bam_aux2f(aux);
1503
0
                    break;
1504
1505
0
                default:
1506
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1507
0
                                  *aux);
1508
0
                    return -1;
1509
0
                }
1510
0
                return 0;
1511
1512
0
            } else {
1513
                // hence absent tags are always false (and strings)
1514
0
                res->is_str = 1;
1515
0
                res->s.l = 0;
1516
0
                res->d = 0;
1517
0
                res->is_true = 0;
1518
0
                return 0;
1519
0
            }
1520
0
        }
1521
0
        break;
1522
0
    }
1523
1524
    // All successful matches in switch should return 0.
1525
    // So if we didn't match, it's a parse error.
1526
0
    return -1;
1527
0
}
1528
1529
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1530
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
{
    hb_pair ctx = {h, b};
    hts_expr_val_t value = HTS_EXPR_VAL_INIT;
    int verdict;

    if (hts_filter_eval2(filt, &ctx, bam_sym_lookup, &value)) {
        hts_log_error("Couldn't process filter expression");
        verdict = -1;
    } else {
        // The filter's truth value is the answer.
        verdict = value.is_true;
    }

    hts_expr_val_free(&value);
    return verdict;
}
1545
1546
/*
 * Record-reading callback for CRAM: fetches records until one passes
 * fp->filter (or the first record when no filter is set) and reports its
 * reference id, start and end through tid/beg/end.
 *
 * Returns the cram_get_bam_seq() result for the accepted record, -1 when
 * reading failed at end of file, and -2 for any other failure.
 */
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = fpv;
    bam1_t *rec = bv;

    for (;;) {
        int status = cram_get_bam_seq(file->fp.cram, &rec);
        if (status < 0)
            return cram_eof(file->fp.cram) ? -1 : -2;

        if (bam_tag2cigar(rec, 1, 1) < 0)
            return -2;

        *tid = rec->core.tid;
        *beg = rec->core.pos;
        *end = bam_endpos(rec);

        if (!file->filter)
            return status;

        int accepted = sam_passes_filter(file->bam_header, rec, file->filter);
        if (accepted < 0)
            return -2;
        if (accepted != 0)
            return status;
        // Rejected by the filter: read the next record.
    }
}
1575
1576
static int cram_pseek(void *fp, int64_t offset, int whence)
1577
0
{
1578
0
    cram_fd *fd =  (cram_fd *)fp;
1579
1580
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1581
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1582
0
        return -1;
1583
1584
0
    fd->curr_position = offset;
1585
1586
0
    if (fd->ctr) {
1587
0
        cram_free_container(fd->ctr);
1588
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1589
0
            cram_free_container(fd->ctr_mt);
1590
1591
0
        fd->ctr = NULL;
1592
0
        fd->ctr_mt = NULL;
1593
0
        fd->ooc = 0;
1594
0
    }
1595
1596
0
    return 0;
1597
0
}
1598
1599
/*
1600
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1601
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1602
 *   container previously fetched. It was designed like this to integrate with the functionality
1603
 *   of the iterator stepping logic.
1604
 */
1605
1606
static int64_t cram_ptell(void *fp)
1607
0
{
1608
0
    cram_fd *fd = (cram_fd *)fp;
1609
0
    cram_container *c;
1610
0
    cram_slice *s;
1611
0
    int64_t ret = -1L;
1612
1613
0
    if (fd) {
1614
0
        if ((c = fd->ctr) != NULL) {
1615
0
            if ((s = c->slice) != NULL && s->max_rec) {
1616
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1617
0
                    fd->curr_position += c->offset + c->length;
1618
0
            }
1619
0
        }
1620
0
        ret = fd->curr_position;
1621
0
    }
1622
1623
0
    return ret;
1624
0
}
1625
1626
static int bam_pseek(void *fp, int64_t offset, int whence)
1627
0
{
1628
0
    BGZF *fd = (BGZF *)fp;
1629
1630
0
    return bgzf_seek(fd, offset, whence);
1631
0
}
1632
1633
static int64_t bam_ptell(void *fp)
1634
0
{
1635
0
    BGZF *fd = (BGZF *)fp;
1636
0
    if (!fd)
1637
0
        return -1L;
1638
1639
0
    return bgzf_tell(fd);
1640
0
}
1641
1642
1643
1644
/*
 * Load the index for fp.  BAM and SAM use hts_idx_load3(); CRAM loads its
 * index into the cram_fd and returns a small stand-in object pointing at
 * it.  Returns NULL on failure and for every other format.
 */
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    if (fp->format.format == bam || fp->format.format == sam)
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);

    if (fp->format.format != cram)
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t

    if (cram_index_load(fp->fp.cram, fn, fnidx) < 0)
        return NULL;

    // Cons up a fake "index" just pointing at the associated cram_fd:
    hts_cram_idx_t *stub = malloc(sizeof (hts_cram_idx_t));
    if (!stub)
        return NULL;

    stub->fmt = HTS_FMT_CRAI;
    stub->cram = fp->fp.cram;
    return (hts_idx_t *) stub;
}
1666
1667
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    // Public entry point with caller-supplied flags.
    hts_idx_t *loaded = index_load(fp, fn, fnidx, flags);
    return loaded;
}
1671
1672
0
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx) {
    // Explicit index file name; flags fixed to HTS_IDX_SAVE_REMOTE.
    const int flags = HTS_IDX_SAVE_REMOTE;
    return index_load(fp, fn, fnidx, flags);
}
1675
1676
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
{
    // No explicit index file name; flags fixed to HTS_IDX_SAVE_REMOTE.
    const char *default_fnidx = NULL;
    return index_load(fp, fn, default_fnidx, HTS_IDX_SAVE_REMOTE);
}
1680
1681
// Build an iterator for a single-region query on a CRAM file.
//
// idx is really an hts_cram_idx_t stub (see index_load()) whose cram member
// is the cram_fd to query.  The region is applied to that cram_fd via
// cram_set_option(CRAM_OPT_RANGE); the returned iterator is a dummy whose
// only job is to make hts_itr_next() call readrec.
//
// tid may be a reference id (>= 0), HTS_IDX_NOCOOR, HTS_IDX_START,
// HTS_IDX_REST or HTS_IDX_NONE.
//
// Returns the new iterator, or NULL on allocation failure, on an error from
// cram_set_option(), or when tid is a special value that is not supported
// for CRAM.  (The unsupported-tid case used to abort() the process; a
// library should report the error to its caller instead.)
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    hts_itr_t *iter = calloc(1, sizeof(*iter));
    if (iter == NULL) return NULL;

    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
    // the readrec function:
    iter->is_cram = 1;
    iter->read_rest = 1;
    iter->off = NULL;
    iter->bins.a = NULL;
    iter->readrec = readrec;

    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
        // cram_range takes beg+1 where this function is given beg.
        cram_range r = { tid, beg+1, end };
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);

        iter->curr_off = 0;
        // The following fields are not required by hts_itr_next(), but are
        // filled in in case user code wants to look at them.
        iter->tid = tid;
        iter->beg = beg;
        iter->end = end;

        switch (ret) {
        case 0:
            break;

        case -2:
            // No data vs this ref, so mark iterator as completed.
            // Same as HTS_IDX_NONE.
            iter->finished = 1;
            break;

        default:
            free(iter);
            return NULL;
        }
    }
    else switch (tid) {
    case HTS_IDX_REST:
        iter->curr_off = 0;
        break;
    case HTS_IDX_NONE:
        iter->curr_off = 0;
        iter->finished = 1;
        break;
    default:
        // Unsupported special tid: report it and fail the query rather
        // than terminating the whole program.
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
        free(iter);
        return NULL;
    }

    return iter;
}
1737
1738
// Create an iterator for the region tid:beg-end.
//
// With no index the query is delegated to hts_itr_query() with the
// sam_readrec_rest reader.  With a CRAM index stub (fmt == HTS_FMT_CRAI)
// cram_itr_query() is used; any other index goes to hts_itr_query() with
// the sam_readrec reader.
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
{
    if (idx == NULL)
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);

    if (((const hts_cram_idx_t *) idx)->fmt == HTS_FMT_CRAI)
        return cram_itr_query(idx, tid, beg, end, sam_readrec);

    return hts_itr_query(idx, tid, beg, end, sam_readrec);
}
1748
1749
static int cram_name2id(void *fdv, const char *ref)
1750
0
{
1751
0
    cram_fd *fd = (cram_fd *) fdv;
1752
0
    return sam_hdr_name2tid(fd->header, ref);
1753
0
}
1754
1755
// Create an iterator from a textual region specification.
//
// idx     Index (or CRAM index stub) for the file; must not be NULL.
// hdr     Header passed to bam_name2id() to resolve reference names.
// region  Region string; must not be NULL.
//
// The work is done by hts_itr_querys(), which is given cram_itr_query as
// the per-region query function for CRAM index stubs and hts_itr_query
// otherwise.
//
// Returns the iterator, or NULL on failure.  A NULL idx or region is
// rejected up front: the index format has to be inspected to choose the
// query function, so a NULL idx would otherwise be dereferenced here.
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    if (!cidx || !region)
        return NULL;

    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
                          sam_readrec);
}
1762
1763
// Create a multi-region iterator from an array of region strings.
//
// The strings are first converted to a region list with
// hts_reglist_create(), resolving names through the CRAM header for CRAM
// index stubs and through hdr otherwise; hts_itr_regions() then builds the
// iterator with the matching set of CRAM or BAM callbacks.
//
// Returns the iterator, or NULL if idx or hdr is NULL, if the region list
// cannot be built, or if iterator creation fails (in which case the region
// list created here is freed again).
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    hts_reglist_t *regions = NULL;
    hts_itr_t *result = NULL;
    int n_regions = 0;
    int is_cram;

    if (!cidx || !hdr)
        return NULL;

    is_cram = (cidx->fmt == HTS_FMT_CRAI);

    if (is_cram)
        regions = hts_reglist_create(regarray, regcount, &n_regions, cidx->cram, cram_name2id);
    else
        regions = hts_reglist_create(regarray, regcount, &n_regions, hdr, (hts_name2id_f)(bam_name2id));

    if (!regions)
        return NULL;

    if (is_cram)
        result = hts_itr_regions(idx, regions, n_regions, cram_name2id, cidx->cram,
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
    else
        result = hts_itr_regions(idx, regions, n_regions, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);

    if (!result)
        hts_reglist_free(regions, n_regions);

    return result;
}
1792
1793
// Create a multi-region iterator from an already-built region list.
//
// CRAM index stubs use the cram_* callbacks with the stub's cram_fd as the
// name-lookup handle; every other index uses the BAM callbacks with hdr.
// Returns NULL if idx, hdr or reglist is NULL, otherwise whatever
// hts_itr_regions() returns.
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    if (cidx == NULL || hdr == NULL || reglist == NULL)
        return NULL;

    if (cidx->fmt != HTS_FMT_CRAI) {
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
    }

    return hts_itr_regions(idx, reglist, regcount, cram_name2id, cidx->cram,
               hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
}
1807
1808
/**********************
1809
 *** SAM header I/O ***
1810
 **********************/
1811
1812
#include "htslib/kseq.h"
1813
#include "htslib/kstring.h"
1814
1815
// Build a new header from l_text bytes of header text.
// Returns the header, or NULL if allocation fails or the text is rejected
// by sam_hdr_add_lines() (the partially built header is destroyed).
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
{
    sam_hdr_t *hdr = sam_hdr_init();

    if (hdr != NULL && sam_hdr_add_lines(hdr, text, l_text) != 0) {
        sam_hdr_destroy(hdr);
        hdr = NULL;
    }

    return hdr;
}
1827
1828
191k
// Check that a header line starts with a recognised record type.
//
// Accepts "@HD", "@SQ", "@RG" and "@PG" when immediately followed by a tab,
// and "@CO" regardless of what follows.  s must be NUL terminated; bytes
// past the first mismatch are never read.
//
// Returns non-zero if the line type is valid, 0 otherwise.
static int valid_sam_header_type(const char *s) {
    char second;

    if (s[0] != '@')
        return 0;

    switch (s[1]) {
    case 'H':
        second = 'D';
        break;
    case 'S':
        second = 'Q';
        break;
    case 'R':
    case 'P':
        second = 'G';
        break;
    case 'C':
        // Comment lines are not required to have a tab after the type.
        return s[2] == 'O';
    default:
        return 0;
    }

    return s[2] == second && s[3] == '\t';
}
1843
1844
// Minimal sanitisation of a header to ensure.
1845
// - null terminated string.
1846
// - all lines start with @ (also implies no blank lines).
1847
//
1848
// Much more could be done, but currently is not, including:
1849
// - checking header types are known (HD, SQ, etc).
1850
// - syntax (eg checking tab separated fields).
1851
// - validating n_targets matches @SQ records.
1852
// - validating target lengths against @SQ records.
1853
3.44k
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1854
3.44k
    if (!h)
1855
30
        return NULL;
1856
1857
    // Special case for empty headers.
1858
3.41k
    if (h->l_text == 0)
1859
639
        return h;
1860
1861
2.77k
    size_t i;
1862
2.77k
    unsigned int lnum = 0;
1863
2.77k
    char *cp = h->text, last = '\n';
1864
24.2M
    for (i = 0; i < h->l_text; i++) {
1865
        // NB: l_text excludes terminating nul.  This finds early ones.
1866
24.2M
        if (cp[i] == 0)
1867
1.52k
            break;
1868
1869
        // Error on \n[^@], including duplicate newlines
1870
24.2M
        if (last == '\n') {
1871
111k
            lnum++;
1872
111k
            if (cp[i] != '@') {
1873
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1874
0
                sam_hdr_destroy(h);
1875
0
                return NULL;
1876
0
            }
1877
111k
        }
1878
1879
24.2M
        last = cp[i];
1880
24.2M
    }
1881
1882
2.77k
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1883
1.52k
        size_t j = i;
1884
11.0k
        while (j < h->l_text && cp[j] == '\0') j++;
1885
1.52k
        if (j < h->l_text)
1886
1.51k
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1887
1.52k
    }
1888
1889
    // Add trailing newline and/or trailing nul if required.
1890
2.77k
    if (last != '\n') {
1891
1.51k
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1892
1893
1.51k
        if (h->l_text < 2 || i >= h->l_text - 2) {
1894
105
            if (h->l_text >= SIZE_MAX - 2) {
1895
0
                hts_log_error("No room for extra newline");
1896
0
                sam_hdr_destroy(h);
1897
0
                return NULL;
1898
0
            }
1899
1900
105
            cp = realloc(h->text, (size_t) h->l_text+2);
1901
105
            if (!cp) {
1902
0
                sam_hdr_destroy(h);
1903
0
                return NULL;
1904
0
            }
1905
105
            h->text = cp;
1906
105
        }
1907
1.51k
        cp[i++] = '\n';
1908
1909
        // l_text may be larger already due to multiple nul padding
1910
1.51k
        if (h->l_text < i)
1911
0
            h->l_text = i;
1912
1.51k
        cp[h->l_text] = '\0';
1913
1.51k
    }
1914
1915
2.77k
    return h;
1916
2.77k
}
1917
1918
1.00k
// Emit the pair of warnings used when a SAM file turns out to contain a
// known tool's stderr output: first what was found, then advice on how to
// run the tool so that it does not happen.
static void known_stderr(const char *tool, const char *advice)
{
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
    hts_log_warning("%s", advice);
}
1922
1923
14.0k
// Look for fragments of well-known log messages in a header line and, for
// the first one found, warn via known_stderr().  Lines matching none of
// the fragments produce no output.
static void warn_if_known_stderr(const char *line) {
    static const struct {
        const char *fragment;   // text to look for in the line
        const char *tool;       // tool blamed in the warning
        const char *advice;     // how to avoid the problem
    } known[] = {
        { "M::bwa_idx_load_from_disk", "bwa",
          "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`" },
        { "M::mem_pestat", "bwa",
          "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`" },
        { "loaded/built the index", "minimap2",
          "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`" },
    };
    size_t i;

    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
        if (strstr(line, known[i].fragment) != NULL) {
            known_stderr(known[i].tool, known[i].advice);
            return;
        }
    }
}
1931
1932
2.73k
// Read the header of a SAM text file and build a sam_hdr_t from it.
//
// Lines are read with hts_getline() for as long as they start with '@'.
// Each line is appended to the header text; @SQ lines are additionally
// parsed for their SN: and LN: tags to build the target name and length
// arrays.  Lengths that do not fit in the 32-bit target_len array are
// stored as UINT32_MAX there, with the real value kept in a separate hash
// that ends up in h->sdict.
//
// If the file has no @SQ lines and fp->fn_aux is set, the targets are taken
// from that file instead: its first tab-separated column is used as the
// name and the number following the first tab as the length, and matching
// @SQ lines are appended to the header text.
//
// On success the header is stored in fp->bam_header with a ref_count of 1
// and returned.  On failure NULL is returned and everything allocated here
// is released.
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
    kstring_t str = { 0, 0, NULL };
    khint_t k;
    sam_hdr_t* h = sam_hdr_init();
    const char *q, *r;
    char* sn = NULL;
    // d maps sequence name -> (target index << 32 | length)
    khash_t(s2i) *d = kh_init(s2i);
    khash_t(s2i) *long_refs = NULL;
    if (!h || !d)
        goto error;

    int ret, has_SQ = 0;
    int next_c = '@';
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
        if (fp->line.s[0] != '@')
            break;

        if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
            has_SQ = 1;
            hts_pos_t ln = -1;
            // Walk the tab-separated fields looking for SN: and LN:
            for (q = fp->line.s + 4;; ++q) {
                if (strncmp(q, "SN:", 3) == 0) {
                    q += 3;
                    for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);

                    if (sn) {
                        hts_log_warning("SQ header line has more than one SN: tag");
                        free(sn);
                    }
                    sn = (char*)calloc(r - q + 1, 1);
                    if (!sn)
                        goto error;

                    memcpy(sn, q, r - q);
                    q = r;
                } else {
                    if (strncmp(q, "LN:", 3) == 0) {
                        hts_pos_t tmp = strtoll(q + 3, (char**)&q, 10);
                        if (ln != -1 && ln != tmp) { //duplicate & different LN
                            // sn is still NULL if no SN: tag has been seen
                            // yet; never hand NULL to a %s conversion.
                            hts_log_error("Header includes @SQ line \"%s\" with"
                                " multiple LN: tag with different values.",
                                sn ? sn : "(null)");
                            goto error;
                        } else {
                            ln = tmp;
                        }
                    }
                }

                while (*q != '\t' && *q != '\n' && *q != '\0')
                    ++q;
                if (*q == '\0' || *q == '\n')
                    break;
            }
            if (sn) {
                if (ln >= 0) {
                    int absent;
                    k = kh_put(s2i, d, sn, &absent);
                    if (absent < 0)
                        goto error;

                    if (!absent) {
                        hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
                        free(sn);
                    } else {
                        // The hash now owns the name
                        sn = NULL;
                        if (ln >= UINT32_MAX) {
                            // Stash away ref length that
                            // doesn't fit in target_len array
                            khint_t k2;
                            if (!long_refs) {
                                long_refs = kh_init(s2i);
                                if (!long_refs)
                                    goto error;
                            }
                            k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
                            if (absent < 0)
                                goto error;
                            kh_val(long_refs, k2) = ln;
                            kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
                                            | UINT32_MAX);
                        } else {
                            kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
                        }
                    }
                } else {
                    hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
                    warn_if_known_stderr(fp->line.s);
                    free(sn);
                }
            } else {
                hts_log_warning("Ignored @SQ line with missing SN: tag");
                warn_if_known_stderr(fp->line.s);
            }
            sn = NULL;
        }
        else if (!valid_sam_header_type(fp->line.s)) {
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
            warn_if_known_stderr(fp->line.s);
            goto error;
        }

        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
            goto error;

        if (kputc('\n', &str) < 0)
            goto error;

        // Peek at the next byte to see whether another header line follows
        if (fp->is_bgzf) {
            next_c = bgzf_peek(fp->fp.bgzf);
        } else {
            unsigned char nc;
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
            next_c = pret > 0 ? nc : pret - 1;
        }
        if (next_c < -1)
            goto error;
    }
    // If the loop stopped on a peeked non-'@' byte, the line buffer only
    // holds a consumed header line and is cleared.  Otherwise it is left
    // alone, as it may hold a non-header line that has already been read.
    if (next_c != '@')
        fp->line.l = 0;

    if (ret < -1)
        goto error;

    if (!has_SQ && fp->fn_aux) {
        kstring_t line = { 0, 0, NULL };

        /* The reference index (.fai) is actually needed here */
        char *fai_fn = fp->fn_aux;
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
        if (fn_delim)
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);

        hFILE* f = hopen(fai_fn, "r");
        int e = 0, absent;
        if (f == NULL)
            goto error;

        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
            char* tab = strchr(line.s, '\t');
            hts_pos_t ln;

            if (tab == NULL)
                continue;

            // A negative length must never reach the packed
            // (index << 32 | length) value below: its sign bits would
            // overwrite the index half and later be used to subscript the
            // target arrays.  Skip such entries, as bad LN: tags are
            // skipped when parsing @SQ lines.
            ln = strtoll(tab, NULL, 10);
            if (ln < 0) {
                hts_log_warning("Ignored sequence \"%.*s\" with negative length in the file \"%s\"",
                                (int) (tab - line.s), line.s, fai_fn);
                continue;
            }

            sn = (char*)calloc(tab-line.s+1, 1);
            if (!sn) {
                e = 1;
                break;
            }
            memcpy(sn, line.s, tab-line.s);
            k = kh_put(s2i, d, sn, &absent);
            if (absent < 0) {
                e = 1;
                break;
            }

            if (!absent) {
                hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
                free(sn);
                sn = NULL;
            } else {
                // The hash now owns the name
                sn = NULL;
                if (ln >= UINT32_MAX) {
                    // Stash away ref length that
                    // doesn't fit in target_len array
                    khint_t k2;
                    int long_absent = -1;
                    if (!long_refs) {
                        long_refs = kh_init(s2i);
                        if (!long_refs) {
                            e = 1;
                            break;
                        }
                    }
                    k2 = kh_put(s2i, long_refs, kh_key(d, k), &long_absent);
                    if (long_absent < 0) {
                         e = 1;
                         break;
                    }
                    kh_val(long_refs, k2) = ln;
                    kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
                                    | UINT32_MAX);
                } else {
                    kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
                }
                has_SQ = 1;
            }

            e |= kputs("@SQ\tSN:", &str) < 0;
            e |= kputsn(line.s, tab - line.s, &str) < 0;
            e |= kputs("\tLN:", &str) < 0;
            e |= kputll(ln, &str) < 0;
            e |= kputc('\n', &str) < 0;
            if (e)
                break;
        }

        ks_free(&line);
        if (hclose(f) != 0) {
            hts_log_error("Error on closing %s", fai_fn);
            e = 1;
        }
        if (e)
            goto error;
    }

    if (has_SQ) {
        // Populate the targets array
        h->n_targets = kh_size(d);

        h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
        if (!h->target_name) {
            h->n_targets = 0;
            goto error;
        }

        h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
        if (!h->target_len) {
            h->n_targets = 0;
            goto error;
        }

        for (k = kh_begin(d); k != kh_end(d); ++k) {
            if (!kh_exist(d, k))
                continue;

            h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
            h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
            kh_val(d, k) >>= 32;
        }
    }

    // Make sure the text buffer exists even for an empty header.  This is
    // done while the error path can still clean everything up.
    if (str.l == 0 && kputsn("", 0, &str) < 0)
        goto error;

    // Repurpose sdict to hold any references longer than UINT32_MAX
    h->sdict = long_refs;

    kh_destroy(s2i, d);

    h->l_text = str.l;
    h->text = ks_release(&str);

    // sam_hdr_sanitise() destroys the header itself when it fails, so there
    // is nothing left to free here - but the result must be checked before
    // it is dereferenced.
    fp->bam_header = sam_hdr_sanitise(h);
    if (!fp->bam_header)
        return NULL;
    fp->bam_header->ref_count = 1;

    return fp->bam_header;

 error:
    // Names are owned by the hash until both target arrays exist
    if (h && d && (!h->target_name || !h->target_len)) {
        for (k = kh_begin(d); k != kh_end(d); ++k)
            if (kh_exist(d, k)) free((void *)kh_key(d, k));
    }
    sam_hdr_destroy(h);
    ks_free(&str);
    kh_destroy(s2i, d);
    kh_destroy(s2i, long_refs);
    if (sn) free(sn);
    return NULL;
}
2192
2193
// Read the header from an open file.
//
// BAM and CRAM headers are obtained from bam_hdr_read() and a duplicate of
// the cram_fd's header respectively, then passed through
// sam_hdr_sanitise(); SAM headers are parsed by sam_hdr_create(); FASTA and
// FASTQ files get a freshly initialised header.
//
// Returns the header, or NULL on failure.  errno is set to EINVAL for a
// NULL fp, EPIPE for an empty file and EFTYPE for any other format.
sam_hdr_t *sam_hdr_read(htsFile *fp)
{
    sam_hdr_t *hdr = NULL;

    if (fp == NULL) {
        errno = EINVAL;
        return NULL;
    }

    switch (fp->format.format) {
    case sam:
        hdr = sam_hdr_create(fp);
        break;

    case bam:
        hdr = sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));
        break;

    case cram:
        hdr = sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));
        break;

    case fasta_format:
    case fastq_format:
        hdr = sam_hdr_init();
        break;

    case empty_format:
        errno = EPIPE;
        break;

    default:
        errno = EFTYPE;
        break;
    }

    return hdr;
}
2223
2224
// Write a header to an open file.
//
// binary_format and text_format files are first switched to bam and sam
// respectively.  BAM headers are written by bam_hdr_write(); for CRAM the
// header is installed in the cram_fd (loading the reference named by
// fp->fn_aux if set) and written by cram_write_SAM_hdr(); FASTA / FASTQ
// have no header so nothing is written.
//
// For SAM the text is rebuilt from h->hrecs when present, otherwise h->text
// is written as is.  In the latter case, if the text has no line starting
// with "@SQ\t", one @SQ line per target is synthesised from the target
// name and length.
//
// Returns 0 on success, -1 on failure (errno is EINVAL for NULL arguments
// and EBADF for an unsupported format).
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
{
    if (!fp || !h) {
        errno = EINVAL;
        return -1;
    }

    switch (fp->format.format) {
    case binary_format:
        fp->format.category = sequence_data;
        fp->format.format = bam;
        /* fall-through */
    case bam:
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
        break;

    case cram: {
        cram_fd *fd = fp->fp.cram;
        if (cram_set_header2(fd, h) < 0) return -1;
        // NOTE(review): the result of cram_load_reference() is not checked
        // here - confirm that a failure is meant to be non-fatal.
        if (fp->fn_aux)
            cram_load_reference(fd, fp->fn_aux);
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
        }
        break;

    case text_format:
        fp->format.category = sequence_data;
        fp->format.format = sam;
        /* fall-through */
    case sam: {
        if (!h->hrecs && !h->text)
            return 0;
        char *text;
        kstring_t hdr_ks = { 0, 0, NULL };
        size_t l_text;
        ssize_t bytes;
        int r = 0, no_sq = 0;

        if (h->hrecs) {
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0)
                return -1;
            text = hdr_ks.s;
            l_text = hdr_ks.l;
        } else {
            // Look for "@SQ\t" at the start of any line of the raw text
            const char *p = NULL;
            do {
                const char *q = p == NULL ? h->text : p + 4;
                p = strstr(q, "@SQ\t");
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
            no_sq = p == NULL;
            text = h->text;
            l_text = h->l_text;
        }

        if (fp->is_bgzf) {
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
        } else {
            bytes = hwrite(fp->fp.hfile, text, l_text);
        }
        free(hdr_ks.s);
        if (bytes < 0 || (size_t) bytes != l_text)
            return -1;

        if (no_sq) {
            int i;
            for (i = 0; i < h->n_targets; ++i) {
                fp->line.l = 0;
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
                r |= kputs(h->target_name[i], &fp->line) < 0;
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
                // target_len[] is unsigned 32-bit, so printing it as an int
                // turned lengths of 2^31 and above into negative numbers.
                // sam_hdr_tid2len() returns the length as a 64-bit value
                // (and is expected to resolve lengths too large for
                // target_len[]), which is then printed in full.
                r |= kputll(sam_hdr_tid2len(h, i), &fp->line) < 0;
                r |= kputc('\n', &fp->line) < 0;
                if (r != 0)
                    return -1;

                if (fp->is_bgzf) {
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
                } else {
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
                }
                if (bytes < 0 || (size_t) bytes != fp->line.l)
                    return -1;
            }
        }
        if (fp->is_bgzf) {
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
        } else {
            if (hflush(fp->fp.hfile) != 0) return -1;
        }
        }
        break;

    case fastq_format:
    case fasta_format:
        // Nothing to output; FASTQ has no file headers.
        break;

    default:
        errno = EBADF;
        return -1;
    }
    return 0;
}
2327
2328
// Add, change or remove one tag on the @HD line by editing h->text
// directly.  Used by sam_hdr_change_HD() for headers without parsed
// records.
//
// h    Header to edit; h->text is replaced by a newly allocated string.
// key  Two-character tag name.
// val  New value, or NULL to delete the tag.
//
// If the text does not start with an @HD line, one is created with
// VN:SAM_FORMAT_VERSION (plus key:val when val is given) and put in front
// of the existing text.
//
// Returns 0 on success (including when the tag already has the requested
// value), -1 on failure.  Keys that are not exactly two characters long are
// rejected: the search and the size arithmetic below both assume a
// "\tXX:" prefix of four bytes, and any other length would scan past the
// matched tag or truncate the rebuilt text.
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
    size_t new_l_text;
    if (!h || !key)
        return -1;

    if (key[0] == '\0' || key[1] == '\0' || key[2] != '\0')
        return -1;

    if (h->l_text > 3) {
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
            if ((p = strchr(h->text, '\n')) == 0) return -1;
            *p = '\0'; // for strstr call

            char tmp[5] = { '\t', key[0], key[1], ':', '\0' };

            if ((q = strstr(h->text, tmp)) != 0) { // key exists
                *p = '\n'; // change back

                // mark the key:val
                beg = q;
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
                end = q;

                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
                    && strlen(val) == (size_t) (end - beg - 4))
                     return 0; // val is the same, no need to change

            } else {
                beg = end = p;
                *p = '\n';
            }
        }
    }
    if (beg == NULL) { // no @HD
        // The header may have no text at all yet; never pass NULL to %s.
        const char *old_text = h->text ? h->text : "";

        new_l_text = h->l_text;
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
            return -1;
        // "@HD\tVN:" + version + "\n"
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            // "\tXX:" + val
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val)
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, old_text);
        else
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, old_text);
    } else { // has @HD but different or no key
        // Everything before beg plus everything from end onwards is kept
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val) {
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
                    (int) (beg - h->text), h->text, key, val, end);
        } else { //delete key
            snprintf(newtext, new_l_text + 1, "%.*s%s",
                    (int) (beg - h->text), h->text, end);
        }
    }
    free(h->text);
    h->text = newtext;
    h->l_text = new_l_text;
    return 0;
}
2402
2403
2404
// Set (val != NULL) or remove (val == NULL) the tag "key" on the @HD line.
//
// Headers without parsed records are edited as text by
// old_sam_hdr_change_HD().  Otherwise the change is made with
// sam_hdr_update_line() / sam_hdr_remove_tag_id() and the result of
// sam_hdr_rebuild() is returned.
//
// Returns -1 if h or key is NULL or the update fails.
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    int status;

    if (h == NULL || key == NULL)
        return -1;

    if (h->hrecs == NULL)
        return old_sam_hdr_change_HD(h, key, val);

    if (val != NULL)
        status = sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL);
    else
        status = sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key);

    if (status != 0)
        return -1;

    return sam_hdr_rebuild(h);
}
2421
/**********************
2422
 *** SAM record I/O ***
2423
 **********************/
2424
2425
// The speed of this code can vary considerably depending on minor code
2426
// changes elsewhere as some of the tight loops are particularly prone to
2427
// speed changes when the instruction blocks are split over a 32-byte
2428
// boundary.  To protect against this, we explicitly specify an alignment
2429
// for this function.  If this is insufficient, we may also wish to
2430
// consider alignment of blocks within this function via
2431
// __attribute__((optimize("align-loops=5"))) (gcc) or clang equivalents.
2432
// However it's not very portable.
2433
// Instead we break into separate functions so we can explicitly specify
2434
// use __attribute__((aligned(32))) instead and force consistent loop
2435
// alignment.
2436
129k
// Grow the space reserved for a B-type array by half its current element
// count.
//
// b     Record whose data buffer is expanded.
// n     In/out: allocated element count; increased by *n / 2 on success.
// size  Size in bytes of one array element.
//
// Returns 0 on success, -1 on failure (errno is set to ENOMEM when the
// element count is already too large to grow).
static inline int64_t grow_B_array(bam1_t *b, uint32_t *n, size_t size) {
    const uint32_t extra = *n >> 1;

    // Avoid overflow on 32-bit platforms, but it breaks BAM anyway
    if (*n > INT32_MAX*0.666) {
        errno = ENOMEM;
        return -1;
    }

    if (possibly_expand_bam_data(b, (size_t)size * (size_t)extra) < 0) {
        hts_log_error("Out of memory");
        return -1;
    }

    *n += extra;
    return 0;
}
2452
2453
2454
// Move q forward to the next comma, stopping early at any character that
// does not compare greater than '\t' (such as a tab or the terminating
// NUL).  Using this after reading a number means q ends up at the next
// comma even if the number is followed by junk, which prevents the
// possibility of trying to read more than n items.
// Note that q is evaluated more than once.
#define skip_to_comma_(q) do { for (; *(q) != ',' && *(q) > '\t'; (q)++) {} } while (0)
2458
2459
HTS_ALIGN32
2460
static char *sam_parse_Bc_vals(bam1_t *b, char *q, uint32_t *nused,
2461
17.2k
                               uint32_t *nalloc, int *overflow) {
2462
1.83M
    while (*q == ',') {
2463
1.81M
        if ((*nused)++ >= (*nalloc)) {
2464
111k
            if (grow_B_array(b, nalloc, 1) < 0)
2465
0
                return NULL;
2466
111k
        }
2467
1.81M
        *(b->data + b->l_data) = hts_str2int(q + 1, &q, 8, overflow);
2468
1.81M
        b->l_data++;
2469
1.81M
    }
2470
17.2k
    return q;
2471
17.2k
}
2472
2473
HTS_ALIGN32
2474
static char *sam_parse_BC_vals(bam1_t *b, char *q, uint32_t *nused,
2475
14.3k
                               uint32_t *nalloc, int *overflow) {
2476
315k
    while (*q == ',') {
2477
301k
        if ((*nused)++ >= (*nalloc)) {
2478
897
            if (grow_B_array(b, nalloc, 1) < 0)
2479
0
                return NULL;
2480
897
        }
2481
301k
        if (q[1] != '-') {
2482
293k
            *(b->data + b->l_data) = hts_str2uint(q + 1, &q, 8, overflow);
2483
293k
            b->l_data++;
2484
293k
        } else {
2485
7.93k
            *overflow = 1;
2486
7.93k
            q++;
2487
7.93k
            skip_to_comma_(q);
2488
7.93k
        }
2489
301k
    }
2490
14.3k
    return q;
2491
14.3k
}
2492
2493
HTS_ALIGN32
2494
static char *sam_parse_Bs_vals(bam1_t *b, char *q, uint32_t *nused,
2495
5.84k
                               uint32_t *nalloc, int *overflow) {
2496
180k
    while (*q == ',') {
2497
174k
        if ((*nused)++ >= (*nalloc)) {
2498
1.98k
            if (grow_B_array(b, nalloc, 2) < 0)
2499
0
                return NULL;
2500
1.98k
        }
2501
174k
        i16_to_le(hts_str2int(q + 1, &q, 16, overflow),
2502
174k
                  b->data + b->l_data);
2503
174k
        b->l_data += 2;
2504
174k
    }
2505
5.84k
    return q;
2506
5.84k
}
2507
2508
HTS_ALIGN32
2509
static char *sam_parse_BS_vals(bam1_t *b, char *q, uint32_t *nused,
2510
4.19k
                               uint32_t *nalloc, int *overflow) {
2511
4.47M
    while (*q == ',') {
2512
4.46M
        if ((*nused)++ >= (*nalloc)) {
2513
5.26k
            if (grow_B_array(b, nalloc, 2) < 0)
2514
0
                return NULL;
2515
5.26k
        }
2516
4.46M
        if (q[1] != '-') {
2517
4.40M
            u16_to_le(hts_str2uint(q + 1, &q, 16, overflow),
2518
4.40M
                      b->data + b->l_data);
2519
4.40M
            b->l_data += 2;
2520
4.40M
        } else {
2521
58.8k
            *overflow = 1;
2522
58.8k
            q++;
2523
58.8k
            skip_to_comma_(q);
2524
58.8k
        }
2525
4.46M
    }
2526
4.19k
    return q;
2527
4.19k
}
2528
2529
HTS_ALIGN32
2530
static char *sam_parse_Bi_vals(bam1_t *b, char *q, uint32_t *nused,
2531
7.96k
                               uint32_t *nalloc, int *overflow) {
2532
5.25M
    while (*q == ',') {
2533
5.24M
        if ((*nused)++ >= (*nalloc)) {
2534
258
            if (grow_B_array(b, nalloc, 4) < 0)
2535
0
                return NULL;
2536
258
        }
2537
5.24M
        i32_to_le(hts_str2int(q + 1, &q, 32, overflow),
2538
5.24M
                  b->data + b->l_data);
2539
5.24M
        b->l_data += 4;
2540
5.24M
    }
2541
7.96k
    return q;
2542
7.96k
}
2543
2544
HTS_ALIGN32
2545
static char *sam_parse_BI_vals(bam1_t *b, char *q, uint32_t *nused,
2546
17.8k
                               uint32_t *nalloc, int *overflow) {
2547
1.47M
    while (*q == ',') {
2548
1.45M
        if ((*nused)++ >= (*nalloc)) {
2549
7.42k
            if (grow_B_array(b, nalloc, 4) < 0)
2550
0
                return NULL;
2551
7.42k
        }
2552
1.45M
        if (q[1] != '-') {
2553
1.44M
            u32_to_le(hts_str2uint(q + 1, &q, 32, overflow),
2554
1.44M
                      b->data + b->l_data);
2555
1.44M
            b->l_data += 4;
2556
1.44M
        } else {
2557
4.60k
            *overflow = 1;
2558
4.60k
            q++;
2559
4.60k
            skip_to_comma_(q);
2560
4.60k
        }
2561
1.45M
    }
2562
17.8k
    return q;
2563
17.8k
}
2564
2565
HTS_ALIGN32
2566
static char *sam_parse_Bf_vals(bam1_t *b, char *q, uint32_t *nused,
2567
3.56k
                               uint32_t *nalloc, int *overflow) {
2568
18.0k
    while (*q == ',') {
2569
14.4k
        if ((*nused)++ >= (*nalloc)) {
2570
1.85k
            if (grow_B_array(b, nalloc, 4) < 0)
2571
0
                return NULL;
2572
1.85k
        }
2573
14.4k
        float_to_le(strtod(q + 1, &q), b->data + b->l_data);
2574
14.4k
        b->l_data += 4;
2575
14.4k
    }
2576
3.56k
    return q;
2577
3.56k
}
2578
2579
HTS_ALIGN32
2580
static int sam_parse_B_vals_r(char type, uint32_t nalloc, char *in,
2581
                              char **end, bam1_t *b,
2582
71.0k
                              int *ctr) {
2583
    // Protect against infinite recursion when dealing with invalid input.
2584
    // An example string is "XX:B:C,-".  The lack of a number means min=0,
2585
    // but it overflowed due to "-" and so we repeat ad-infinitum.
2586
    //
2587
    // Loop detection is the safest solution incase there are other
2588
    // strange corner cases with malformed inputs.
2589
71.0k
    if (++(*ctr) > 2) {
2590
0
        hts_log_error("Malformed data in B:%c array", type);
2591
0
        return -1;
2592
0
    }
2593
2594
71.0k
    int orig_l = b->l_data;
2595
71.0k
    char *q = in;
2596
71.0k
    int32_t size;
2597
71.0k
    size_t bytes;
2598
71.0k
    int overflow = 0;
2599
2600
71.0k
    size = aux_type2size(type);
2601
71.0k
    if (size <= 0 || size > 4) {
2602
2
        hts_log_error("Unrecognized type B:%c", type);
2603
2
        return -1;
2604
2
    }
2605
2606
    // Ensure space for type + values.
2607
    // The first pass through here we don't know the number of entries and
2608
    // nalloc == 0.  We start with a small working set and then parse the
2609
    // data, growing as needed.
2610
    //
2611
    // If we have a second pass through we do know the number of entries
2612
    // and nalloc is already known.  We have no need to expand the bam data.
2613
71.0k
    if (!nalloc)
2614
51.7k
         nalloc=7;
2615
2616
    // Ensure allocated memory is big enough (for current nalloc estimate)
2617
71.0k
    bytes = (size_t) nalloc * (size_t) size;
2618
71.0k
    if (bytes / size != nalloc
2619
71.0k
        || possibly_expand_bam_data(b, bytes + 2 + sizeof(uint32_t))) {
2620
0
        hts_log_error("Out of memory");
2621
0
        return -1;
2622
0
    }
2623
2624
71.0k
    uint32_t nused = 0;
2625
2626
71.0k
    b->data[b->l_data++] = 'B';
2627
71.0k
    b->data[b->l_data++] = type;
2628
    // 32-bit B-array length is inserted later once we know it.
2629
71.0k
    int b_len_idx = b->l_data;
2630
71.0k
    b->l_data += sizeof(uint32_t);
2631
2632
71.0k
    if (type == 'c') {
2633
17.2k
        if (!(q = sam_parse_Bc_vals(b, q, &nused, &nalloc, &overflow)))
2634
0
            return -1;
2635
53.7k
    } else if (type == 'C') {
2636
14.3k
        if (!(q = sam_parse_BC_vals(b, q, &nused, &nalloc, &overflow)))
2637
0
            return -1;
2638
39.4k
    } else if (type == 's') {
2639
5.84k
        if (!(q = sam_parse_Bs_vals(b, q, &nused, &nalloc, &overflow)))
2640
0
            return -1;
2641
33.5k
    } else if (type == 'S') {
2642
4.19k
        if (!(q = sam_parse_BS_vals(b, q, &nused, &nalloc, &overflow)))
2643
0
            return -1;
2644
29.3k
    } else if (type == 'i') {
2645
7.96k
        if (!(q = sam_parse_Bi_vals(b, q, &nused, &nalloc, &overflow)))
2646
0
            return -1;
2647
21.4k
    } else if (type == 'I') {
2648
17.8k
        if (!(q = sam_parse_BI_vals(b, q, &nused, &nalloc, &overflow)))
2649
0
            return -1;
2650
17.8k
    } else if (type == 'f') {
2651
3.56k
        if (!(q = sam_parse_Bf_vals(b, q, &nused, &nalloc, &overflow)))
2652
0
            return -1;
2653
3.56k
    }
2654
71.0k
    if (*q != '\t' && *q != '\0') {
2655
        // Unknown B array type or junk in the numbers
2656
36
        hts_log_error("Malformed B:%c", type);
2657
36
        return -1;
2658
36
    }
2659
70.9k
    i32_to_le(nused, b->data + b_len_idx);
2660
2661
70.9k
    if (!overflow) {
2662
51.6k
        *end = q;
2663
51.6k
        return 0;
2664
51.6k
    } else {
2665
19.3k
        int64_t max = 0, min = 0, val;
2666
        // Given type was incorrect.  Try to rescue the situation.
2667
19.3k
        char *r = q;
2668
19.3k
        q = in;
2669
19.3k
        overflow = 0;
2670
19.3k
        b->l_data = orig_l;
2671
        // Find out what range of values is present
2672
6.65M
        while (q < r) {
2673
6.63M
            val = hts_str2int(q + 1, &q, 64, &overflow);
2674
6.63M
            if (max < val) max = val;
2675
6.63M
            if (min > val) min = val;
2676
6.63M
            skip_to_comma_(q);
2677
6.63M
        }
2678
        // Retry with appropriate type
2679
19.3k
        if (!overflow) {
2680
19.2k
            if (min < 0) {
2681
9.31k
                if (min >= INT8_MIN && max <= INT8_MAX) {
2682
912
                    return sam_parse_B_vals_r('c', nalloc, in, end, b, ctr);
2683
8.40k
                } else if (min >= INT16_MIN && max <= INT16_MAX) {
2684
807
                    return sam_parse_B_vals_r('s', nalloc, in, end, b, ctr);
2685
7.59k
                } else if (min >= INT32_MIN && max <= INT32_MAX) {
2686
7.59k
                    return sam_parse_B_vals_r('i', nalloc, in, end, b, ctr);
2687
7.59k
                }
2688
9.98k
            } else {
2689
9.98k
                if (max < UINT8_MAX) {
2690
438
                    return sam_parse_B_vals_r('C', nalloc, in, end, b, ctr);
2691
9.54k
                } else if (max <= UINT16_MAX) {
2692
438
                    return sam_parse_B_vals_r('S', nalloc, in, end, b, ctr);
2693
9.10k
                } else if (max <= UINT32_MAX) {
2694
9.10k
                    return sam_parse_B_vals_r('I', nalloc, in, end, b, ctr);
2695
9.10k
                }
2696
9.98k
            }
2697
19.2k
        }
2698
        // If here then at least one of the values is too big to store
2699
18
        hts_log_error("Numeric value in B array out of allowed range");
2700
18
        return -1;
2701
19.3k
    }
2702
70.9k
#undef skip_to_comma_
2703
70.9k
}
2704
2705
HTS_ALIGN32
2706
static int sam_parse_B_vals(char type, char *in, char **end, bam1_t *b)
2707
51.7k
{
2708
51.7k
    int ctr = 0;
2709
51.7k
    uint32_t nalloc = 0;
2710
51.7k
    return sam_parse_B_vals_r(type, nalloc, in, end, b, &ctr);
2711
51.7k
}
2712
2713
161k
// Parse the FLAG column starting at v.
//
// A value starting with 1-9 is read by hts_str2uint() with a 16-bit limit.
// A lone "0" followed by a tab is handled directly; any other value starting
// with '0' is read by strtoul() with base 0 (so hex and octal are accepted)
// and clamped to 65535 with *overflow set if larger.  Anything else is left
// unconsumed (*rv == v) and yields 0.
//
// *rv receives the position after the consumed characters.
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char first = *v;

    if (first >= '1' && first <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    if (first != '0') {
        // TODO implement symbolic flag letters
        *rv = v;
        return 0;
    }

    // Single-digit "0" needs no conversion.
    if (v[1] == '\t') {
        *rv = v + 1;
        return 0;
    }

    // Otherwise it's hex or octal.
    unsigned long parsed = strtoul(v, rv, 0);
    if (parsed > 65535) {
        *overflow = 1;
        return 65535;
    }
    return parsed;
}
2732
2733
// Parse tag line and append to bam object b.
2734
// Shared by both SAM and FASTQ parsers.
2735
//
2736
// The difference between the two is how lenient we are to recognising
2737
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2738
// non-SAM looking strings.
2739
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2740
160k
                            khash_t(tag) *tag_whitelist) {
2741
160k
    int overflow = 0;
2742
160k
    int checkpoint;
2743
160k
    char logbuf[40];
2744
160k
    char *q = start, *p = end;
2745
2746
160k
#define _parse_err(cond, ...)                   \
2747
4.25M
    do {                                        \
2748
9.03M
        if (cond) {                             \
2749
229
            if (lenient) {                      \
2750
0
                while (q < p && !isspace_c(*q))   \
2751
0
                    q++;                        \
2752
0
                while (q < p && isspace_c(*q))    \
2753
0
                    q++;                        \
2754
0
                b->l_data = checkpoint;         \
2755
0
                goto loop;                      \
2756
229
            } else {                            \
2757
229
                hts_log_error(__VA_ARGS__);     \
2758
229
                goto err_ret;                   \
2759
229
            }                                   \
2760
229
        }                                       \
2761
4.25M
    } while (0)
2762
2763
4.03M
    while (q < p) loop: {
2764
4.03M
        char type;
2765
4.03M
        checkpoint = b->l_data;
2766
4.03M
        if (p - q < 5) {
2767
30
            if (lenient) {
2768
0
                break;
2769
30
            } else {
2770
30
                hts_log_error("Incomplete aux field");
2771
30
                goto err_ret;
2772
30
            }
2773
30
        }
2774
2.01M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2775
2776
2.01M
        if (lenient && (q[2] | q[4]) != ':') {
2777
0
            while (q < p && !isspace_c(*q))
2778
0
                q++;
2779
0
            while (q < p && isspace_c(*q))
2780
0
                q++;
2781
0
            continue;
2782
0
        }
2783
2784
2.01M
        if (tag_whitelist) {
2785
0
            int tt = q[0]*256 + q[1];
2786
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2787
0
                while (q < p && *q != '\t')
2788
0
                    q++;
2789
0
                continue;
2790
0
            }
2791
0
        }
2792
2793
        // Copy over id
2794
2.01M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2795
2.01M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2796
2.01M
        q += 3; type = *q++; ++q; // q points to value
2797
2.01M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2798
1.78M
            _parse_err(*q <= '\t', "incomplete aux field");
2799
2800
        // Ensure enough space for a double + type allocated.
2801
2.01M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2802
2803
2.01M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2804
662k
            b->data[b->l_data++] = 'A';
2805
662k
            b->data[b->l_data++] = *q++;
2806
1.35M
        } else if (type == 'i' || type == 'I') {
2807
1.01M
            if (*q == '-') {
2808
794k
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2809
794k
                if (x >= INT8_MIN) {
2810
402k
                    b->data[b->l_data++] = 'c';
2811
402k
                    b->data[b->l_data++] = x;
2812
402k
                } else if (x >= INT16_MIN) {
2813
117k
                    b->data[b->l_data++] = 's';
2814
117k
                    i16_to_le(x, b->data + b->l_data);
2815
117k
                    b->l_data += 2;
2816
274k
                } else {
2817
274k
                    b->data[b->l_data++] = 'i';
2818
274k
                    i32_to_le(x, b->data + b->l_data);
2819
274k
                    b->l_data += 4;
2820
274k
                }
2821
794k
            } else {
2822
223k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2823
223k
                if (x <= UINT8_MAX) {
2824
138k
                    b->data[b->l_data++] = 'C';
2825
138k
                    b->data[b->l_data++] = x;
2826
138k
                } else if (x <= UINT16_MAX) {
2827
74.4k
                    b->data[b->l_data++] = 'S';
2828
74.4k
                    u16_to_le(x, b->data + b->l_data);
2829
74.4k
                    b->l_data += 2;
2830
74.4k
                } else {
2831
10.3k
                    b->data[b->l_data++] = 'I';
2832
10.3k
                    u32_to_le(x, b->data + b->l_data);
2833
10.3k
                    b->l_data += 4;
2834
10.3k
                }
2835
223k
            }
2836
1.01M
        } else if (type == 'f') {
2837
22.6k
            b->data[b->l_data++] = 'f';
2838
22.6k
            float_to_le(strtod(q, &q), b->data + b->l_data);
2839
22.6k
            b->l_data += sizeof(float);
2840
316k
        } else if (type == 'd') {
2841
27.8k
            b->data[b->l_data++] = 'd';
2842
27.8k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2843
27.8k
            b->l_data += sizeof(double);
2844
288k
        } else if (type == 'Z' || type == 'H') {
2845
236k
            char *end = strchr(q, '\t');
2846
236k
            if (!end) end = q + strlen(q);
2847
236k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2848
236k
                       "hex field does not have an even number of digits");
2849
236k
            b->data[b->l_data++] = type;
2850
236k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2851
236k
            memcpy(b->data + b->l_data, q, end - q);
2852
236k
            b->l_data += end - q;
2853
236k
            b->data[b->l_data++] = '\0';
2854
236k
            q = end;
2855
236k
        } else if (type == 'B') {
2856
51.7k
            type = *q++; // q points to the first ',' following the typing byte
2857
51.7k
            _parse_err(*q && *q != ',' && *q != '\t',
2858
51.7k
                       "B aux field type not followed by ','");
2859
2860
51.7k
            if (sam_parse_B_vals(type, q, &q, b) < 0)
2861
56
                goto err_ret;
2862
51.7k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2863
2864
11.2M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2865
2.01M
        q++;
2866
2.01M
    }
2867
2868
159k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2869
159k
#undef _parse_err
2870
2871
159k
    return 0;
2872
2873
315
err_ret:
2874
315
    return -2;
2875
159k
}
2876
2877
// Parse one SAM text line held in s into the BAM record b, using header h to
// resolve reference names.  The line buffer is modified in place (tabs ending
// string columns are overwritten with NUL).
//
// Returns 0 on success, -2 on any parse or allocation failure.
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
{
// Evaluates to the current token start; then NUL-terminates the token at the
// next tab and advances _p beyond it (or fails if there is no tab).
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)

#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff

// Macro that operates on 64-bits at a time.
#define COPY_MINUS_N(to,from,n,l,failed)                        \
    do {                                                        \
        uint64_u *from8 = (uint64_u *)(from);                   \
        uint64_u *to8 = (uint64_u *)(to);                       \
        uint64_t uflow = 0;                                     \
        size_t l8 = (l)>>3, i;                                  \
        for (i = 0; i < l8; i++) {                              \
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
            uflow |= to8[i];                                    \
        }                                                       \
        for (i<<=3; i < (l); ++i) {                             \
            to[i] = from[i] - (n);                              \
            uflow |= to[i];                                     \
        }                                                       \
        failed = (uflow & 0x8080808080808080UL) > 0;            \
    } while (0)

#else

// Basic version which operates a byte at a time
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
        uint8_t uflow = 0;                                   \
        for (i = 0; i < (l); ++i) {                          \
            (to)[i] = (from)[i] - (n);                       \
            uflow |= (uint8_t) (to)[i];                      \
        }                                                    \
        failed = (uflow & 0x80) > 0;                         \
    } while (0)

#endif

#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)

    bam1_core_t *core = &b->core;
    char *cur = s->s;       // read position within the line
    char *tok;              // start of the column most recently tokenised
    uint8_t *dst;           // destination inside b->data for SEQ / QUAL
    hts_pos_t cigreflen;
    char logbuf[40];
    int overflow = 0;
    int i;

    b->l_data = 0;
    memset(core, 0, 32);

    // ---- QNAME ----
    tok = _read_token(cur);

    // Token size including its terminating NUL.
    const ptrdiff_t qname_sz = cur - tok;

    _parse_warn(qname_sz <= 1, "empty query name");
    _parse_err(qname_sz > 255, "query name too long");
    // Make room for the name plus up to 3 extra NULs of padding.
    if (possibly_expand_bam_data(b, qname_sz + 4) < 0)
        goto err_ret;
    memcpy(b->data + b->l_data, tok, qname_sz);
    b->l_data += qname_sz;

    // Pad so that what follows starts on a 4-byte boundary.
    core->l_extranul = (4 - (b->l_data & 3)) & 3;
    memcpy(b->data + b->l_data, "\0\0\0\0", core->l_extranul);
    b->l_data += core->l_extranul;

    core->l_qname = qname_sz + core->l_extranul;

    // ---- FLAG ----
    core->flag = parse_sam_flag(cur, &cur, &overflow);
    if (*cur++ != '\t')
        goto err_ret; // malformed flag

    // ---- RNAME ----
    tok = _read_token(cur);
    if (strcmp(tok, "*") == 0) {
        core->tid = -1;
    } else {
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
        core->tid = bam_name2id(h, tok);
        _parse_err(core->tid < -1, "failed to parse header");
        _parse_warn(core->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', tok, SIZE_MAX));
    }

    // ---- POS (stored zero-based) ----
    core->pos = hts_str2uint(cur, &cur, 62, &overflow) - 1;
    if (*cur++ != '\t')
        goto err_ret;
    if (core->pos < 0 && core->tid >= 0) {
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
        core->tid = -1;
    }
    if (core->tid < 0)
        core->flag |= BAM_FUNMAP;

    // ---- MAPQ ----
    core->qual = hts_str2uint(cur, &cur, 8, &overflow);
    if (*cur++ != '\t')
        goto err_ret;

    // ---- CIGAR ----
    if (*cur == '*') {
        _parse_warn(!(core->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
        core->flag |= BAM_FUNMAP;
        tok = _read_token(cur);
        cigreflen = 1;
    } else {
        int cigar_off = b->l_data;
        int n_cigar = bam_parse_cigar(cur, &cur, b);
        if (n_cigar < 1 || *cur++ != '\t')
            goto err_ret;
        uint32_t *cigar = (uint32_t *)(b->data + cigar_off);

        // can't use bam_endpos() directly as some fields not yet set up
        if (core->flag & BAM_FUNMAP)
            cigreflen = 1;
        else
            cigreflen = bam_cigar2rlen(core->n_cigar, cigar);
        if (cigreflen == 0)
            cigreflen = 1;
    }
    _parse_err(HTS_POS_MAX - cigreflen <= core->pos,
               "read ends beyond highest supported position");
    core->bin = hts_reg2bin(core->pos, core->pos + cigreflen, 14, 5);

    // ---- RNEXT ----
    tok = _read_token(cur);
    if (strcmp(tok, "=") == 0) {
        core->mtid = core->tid;
    } else if (strcmp(tok, "*") == 0) {
        core->mtid = -1;
    } else {
        core->mtid = bam_name2id(h, tok);
        _parse_err(core->mtid < -1, "failed to parse header");
        _parse_warn(core->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', tok, SIZE_MAX));
    }

    // ---- PNEXT (stored zero-based) ----
    core->mpos = hts_str2uint(cur, &cur, 62, &overflow) - 1;
    if (*cur++ != '\t')
        goto err_ret;
    if (core->mpos < 0 && core->mtid >= 0) {
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
        core->mtid = -1;
    }

    // ---- TLEN ----
    core->isize = hts_str2int(cur, &cur, 63, &overflow);
    if (*cur++ != '\t')
        goto err_ret;
    // Covers every numeric column parsed above.
    _parse_err(overflow, "number outside allowed range");

    // ---- SEQ (packed two bases per byte) ----
    tok = _read_token(cur);
    if (strcmp(tok, "*") == 0) {
        core->l_qseq = 0;
    } else {
        _parse_err(cur - tok - 1 > INT32_MAX, "read sequence is too long");
        core->l_qseq = cur - tok - 1;
        hts_pos_t ql = bam_cigar2qlen(core->n_cigar, (uint32_t*)(b->data + core->l_qname));
        _parse_err(core->n_cigar && ql != core->l_qseq, "CIGAR and query sequence are of different length");
        i = (core->l_qseq + 1) >> 1;
        _get_mem(uint8_t, &dst, b, i);

        // Whole pairs first, then a trailing odd base in the high nibble.
        unsigned int paired = core->l_qseq&~1, k;
        for (k = 0; k < paired; k += 2)
            dst[k>>1] = (seq_nt16_table[(unsigned char)tok[k]] << 4) | seq_nt16_table[(unsigned char)tok[k+1]];
        for (; k < core->l_qseq; ++k)
            dst[k>>1] = seq_nt16_table[(unsigned char)tok[k]] << ((~k&1)<<2);
    }

    // ---- QUAL ----
    _get_mem(uint8_t, &dst, b, core->l_qseq);
    if (cur[0] == '*' && (cur[1] == '\t' || cur[1] == '\0')) {
        // Absent qualities are stored as 0xff bytes.
        memset(dst, 0xff, core->l_qseq);
        cur += 2;
    } else {
        int failed = 0;
        _parse_err(s->l - (cur - s->s) < core->l_qseq
                   || (cur[core->l_qseq] != '\t' && cur[core->l_qseq] != '\0'),
                   "SEQ and QUAL are of different length");
        // Subtract the 33 offset; a set top bit in any result marks a
        // character that was below '!'.
        COPY_MINUS_N(dst, cur, 33, core->l_qseq, failed);
        _parse_err(failed, "invalid QUAL character");
        cur += core->l_qseq + 1;
    }

    // ---- optional fields ----
    if (aux_parse(cur, s->s + s->l, b, 0, NULL) < 0)
        goto err_ret;

    if (bam_tag2cigar(b, 1, 1) < 0)
        return -2;
    return 0;

#undef _parse_warn
#undef _parse_err
#undef _get_mem
#undef _read_token
err_ret:
    return -2;
}
3058
3059
146k
// Count the CIGAR operations in the string at q, which ends at a tab or NUL.
// Every non-digit character is counted as one operation.
// Returns the count, or 0 (after logging an error) if there are none or
// there are 2147483647 or more.
static uint32_t read_ncigar(const char *q) {
    uint32_t n_ops = 0;

    while (*q != '\0' && *q != '\t') {
        if (!isdigit_c(*q))
            n_ops++;
        q++;
    }

    if (n_ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }
    if (n_ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return n_ops;
}
3074
3075
/*! @function
3076
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
3077
 @param  in      [in]  pointer to the source string
3078
 @param  a_cigar [out]  address of the destination uint32_t buffer
3079
 @return         number of processed input characters; 0 on error
3080
 */
3081
145k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
3082
145k
    int i, overflow = 0;
3083
145k
    const char *p = in;
3084
399k
    for (i = 0; i < n_cigar; i++) {
3085
253k
        uint32_t len;
3086
253k
        int op;
3087
253k
        char *q;
3088
253k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
3089
253k
        if (q == p) {
3090
55
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
3091
55
            return 0;
3092
55
        }
3093
253k
        if (overflow) {
3094
15
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
3095
15
            return 0;
3096
15
        }
3097
253k
        p = q;
3098
253k
        op = bam_cigar_table[(unsigned char)*p++];
3099
253k
        if (op < 0) {
3100
105
            hts_log_error("Unrecognized CIGAR operator");
3101
105
            return 0;
3102
105
        }
3103
253k
        a_cigar[i] = len;
3104
253k
        a_cigar[i] |= op;
3105
253k
    }
3106
3107
145k
    return p-in;
3108
145k
}
3109
3110
0
// Parse the CIGAR string at `in` into the caller-owned array *a_cigar, which
// currently has room for *a_mem entries.  The array is enlarged with
// realloc() when too small, in which case *a_cigar and *a_mem are updated.
//
// If `end` is non-NULL it receives the position after the consumed text
// (one past a "*", or `in` itself when nothing was consumed).
//
// Returns the number of operations stored; 0 for "*" or when read_ncigar()
// finds no usable operations; -1 on NULL arguments, allocation failure or a
// malformed CIGAR.
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (!in || !a_cigar || !a_mem) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }
    if (end)
        *end = (char *)in;

    if (*in == '*') {
        if (end)
            *end = (char *)in + 1;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0)
        return 0;

    if (n_ops > *a_mem) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (!grown) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int consumed = parse_cigar(in, *a_cigar, n_ops);
    if (!consumed)
        return -1;
    if (end)
        *end = (char *)in + consumed;

    return n_ops;
}
3142
3143
146k
// Parse the CIGAR string at `in` and store it as the CIGAR of record b.
//
// When the record's CIGAR slot is not at the current end of b->data (i.e. an
// existing record is being modified), the data from the sequence onwards is
// moved so that the new CIGAR fits exactly.  b->l_data and b->core.n_cigar
// are updated to match.
//
// If `end` is non-NULL it receives the position after the consumed text.
//
// Returns the number of operations stored (0 when there is nothing to store
// and the record had no CIGAR), or -1 on NULL arguments, allocation failure
// or a malformed CIGAR.
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
    if (!in || !b) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }
    if (end)
        *end = (char *)in;

    size_t n_ops = 0;
    if (*in != '*')
        n_ops = read_ncigar(in);

    // Nothing to store and nothing to remove.
    if (n_ops == 0 && b->core.n_cigar == 0) {
        if (end)
            *end = (char *)in + 1;
        return 0;
    }

    // Change in the number of operations relative to the existing CIGAR.
    ssize_t delta = n_ops - b->core.n_cigar;
    if (delta > 0 &&
        possibly_expand_bam_data(b, delta * sizeof(uint32_t)) < 0) {
        hts_log_error("Memory allocation error");
        return -1;
    }

    uint32_t *cig = bam_get_cigar(b);
    if ((uint8_t *)cig != b->data + b->l_data) {
        // Modifying an existing BAM record: shift everything that follows
        // the old CIGAR to just after where the new one will end.
        uint8_t *seq = bam_get_seq(b);
        memmove(cig + n_ops, seq, (b->data + b->l_data) - seq);
    }

    int consumed = 1;   // a lone "*" is one character
    if (n_ops) {
        consumed = parse_cigar(in, cig, n_ops);
        if (!consumed)
            return -1;
    }

    b->l_data += delta * sizeof(uint32_t);
    b->core.n_cigar = n_ops;
    if (end)
        *end = (char *)in + consumed;

    return n_ops;
}
3186
3187
/*
3188
 * -----------------------------------------------------------------------------
3189
 * SAM threading
3190
 */
3191
// Size of SAM text block (reading)
3192
0
#define SAM_NBYTES 240000
3193
3194
// Number of BAM records (writing, up to NB_mem in size)
3195
0
#define SAM_NBAM 1000
3196
3197
struct SAM_state;
3198
3199
// Output job - a block of BAM records.
// Released with sam_free_sp_bams(), which frees the data buffer of each of
// the abams entries in bams[], then bams[] and the struct itself.
3200
typedef struct sp_bams {
3201
    // Link to another sp_bams; presumably a list of queued or recycled
    // blocks (usage is outside this part of the file - confirm).
    struct sp_bams *next;
3202
    // NOTE(review): looks like a sequence number for ordering jobs - confirm.
    int serial;
3203
3204
    bam1_t *bams;     // array of records
3205
    int nbams, abams; // number of bams[] entries used and allocated
3206
    size_t bam_mem;   // very approximate total size
3207
3208
    // State this block is associated with (presumably its owner - confirm).
    struct SAM_state *fd;
3209
} sp_bams;
3210
3211
// Input job - a block of SAM text
3212
typedef struct sp_lines {
3213
    // Link to another sp_lines; presumably a list of queued or recycled
    // blocks (usage is outside this part of the file - confirm).
    struct sp_lines *next;
3214
    // NOTE(review): looks like a sequence number for ordering jobs - confirm.
    int serial;
3215
3216
    // Text buffer.  Presumably data_size is the number of bytes in use and
    // alloc the allocated capacity - confirm against the code filling it.
    char *data;
3217
    int data_size;
3218
    int alloc;
3219
3220
    // State this block is associated with (presumably its owner - confirm).
    struct SAM_state *fd;
3221
    // NOTE(review): presumably the BAM records decoded from this text - confirm.
    sp_bams *bams;
3222
} sp_lines;
3223
3224
// Values stored in SAM_state.command.
enum sam_cmd {
3225
    // Zero, so it is the value in a freshly calloc()ed SAM_state.
    SAM_NONE = 0,
3226
    // Set by sam_state_destroy() to notify the dispatcher that we're closing.
    SAM_CLOSE,
3227
    // sam_state_destroy() waits for this value when reading with a
    // dispatcher thread; presumably set by the dispatcher on exit - confirm.
    SAM_CLOSE_DONE,
3228
    // NOTE(review): presumably signals end of input; not used in this part
    // of the file - confirm.
    SAM_AT_EOF,
3229
};
3230
3231
// Per-file state attached to htsFile.state by sam_state_create() and torn
// down by sam_state_destroy().
typedef struct SAM_state {
3232
    // Header.  sam_state_destroy() only talks to the dispatcher when set.
    sam_hdr_t *h;
3233
3234
    // Thread pool that jobs are dispatched to; NULL when not threaded
    // (sam_state_destroy() skips the threading shutdown in that case).
    hts_tpool *p;
3235
    // NOTE(review): presumably non-zero when this state created p and must
    // destroy it - confirm.
    int own_pool;
3236
    // NOTE(review): presumably protects the lines list - confirm.
    pthread_mutex_t lines_m;
3237
    // Process queue used together with p for dispatching and waking.
    hts_tpool_process *q;
3238
    // Dispatcher thread; only meaningful when dispatcher_set is non-zero.
    pthread_t dispatcher;
3239
    int dispatcher_set;
3240
3241
    // NOTE(review): presumably lists of spare/queued job blocks - confirm.
    sp_lines *lines;
3242
    sp_bams *bams;
3243
3244
    // When writing, the block currently being filled; sam_state_destroy()
    // dispatches it if it still holds records.
    sp_bams *curr_bam;
3245
    // NOTE(review): presumably the index of the next record within
    // curr_bam - confirm.
    int curr_idx;
3246
    // NOTE(review): presumably the next job sequence number - confirm.
    int serial;
3247
3248
    // Be warned: moving these mutexes around in this struct can reduce
3249
    // threading performance by up to 70%!
3250
    // command_m is held while reading or writing command and errcode.
    pthread_mutex_t command_m;
3251
    // Signalled by sam_state_destroy() after it requests SAM_CLOSE.
    pthread_cond_t command_c;
3252
    enum sam_cmd command;
3253
3254
    // One of the E* errno codes.  Only the first error is kept (see
    // sam_state_err()); sam_state_destroy() reports it negated.
3255
    int errcode;
3256
3257
    // Back-pointer to the owning file, set by sam_state_create().
    htsFile *fp;
3258
} SAM_state;
3259
3260
// Returns a SAM_state struct from a generic hFILE.
3261
//
3262
// Returns NULL on failure.
3263
0
static SAM_state *sam_state_create(htsFile *fp) {
3264
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3265
    // be a redirect call with an additional 'S' mode.  This in turn would
3266
    // correctly set the designed format to sam instead of a generic
3267
    // text_format.
3268
0
    if (fp->format.format != sam && fp->format.format != text_format)
3269
0
        return NULL;
3270
3271
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3272
0
    if (!fd)
3273
0
        return NULL;
3274
3275
0
    fp->state = fd;
3276
0
    fd->fp = fp;
3277
3278
0
    return fd;
3279
0
}
3280
3281
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3282
static void *sam_format_worker(void *arg);
3283
3284
0
// Record errcode in fd->errcode unless an error has already been recorded,
// so that only the first failure is kept.  The update is made while holding
// fd->command_m.
static void sam_state_err(SAM_state *fd, int errcode) {
    pthread_mutex_lock(&fd->command_m);
    if (fd->errcode == 0) {
        fd->errcode = errcode;
    }
    pthread_mutex_unlock(&fd->command_m);
}
3290
3291
0
// Frees a block of BAM records: the data buffer of every allocated slot
// (all 'abams' of them, not just the 'nbams' in use), the record array,
// and finally the block itself.  A NULL block is ignored.
static void sam_free_sp_bams(sp_bams *b) {
    if (b == NULL)
        return;

    if (b->bams != NULL) {
        int slot = 0;
        while (slot < b->abams) {
            free(b->bams[slot].data);   // free(NULL) is a no-op
            slot++;
        }
        free(b->bams);
    }

    free(b);
}
3305
3306
// Destroys the state produce by sam_state_create.
3307
5.12k
int sam_state_destroy(htsFile *fp) {
3308
5.12k
    int ret = 0;
3309
3310
5.12k
    if (!fp->state)
3311
5.12k
        return 0;
3312
3313
0
    SAM_state *fd = fp->state;
3314
0
    if (fd->p) {
3315
0
        if (fd->h) {
3316
            // Notify sam_dispatcher we're closing
3317
0
            pthread_mutex_lock(&fd->command_m);
3318
0
            if (fd->command != SAM_CLOSE_DONE)
3319
0
                fd->command = SAM_CLOSE;
3320
0
            pthread_cond_signal(&fd->command_c);
3321
0
            ret = -fd->errcode;
3322
0
            if (fd->q)
3323
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3324
3325
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3326
0
                for (;;) {
3327
                    // Avoid deadlocks with dispatcher
3328
0
                    if (fd->command == SAM_CLOSE_DONE)
3329
0
                        break;
3330
0
                    hts_tpool_wake_dispatch(fd->q);
3331
0
                    pthread_mutex_unlock(&fd->command_m);
3332
0
                    hts_usleep(10000);
3333
0
                    pthread_mutex_lock(&fd->command_m);
3334
0
                }
3335
0
            }
3336
0
            pthread_mutex_unlock(&fd->command_m);
3337
3338
0
            if (fp->is_write) {
3339
                // Dispatch the last partial block.
3340
0
                sp_bams *gb = fd->curr_bam;
3341
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3342
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3343
3344
                // Flush and drain output
3345
0
                if (fd->q)
3346
0
                    hts_tpool_process_flush(fd->q);
3347
0
                pthread_mutex_lock(&fd->command_m);
3348
0
                if (!ret) ret = -fd->errcode;
3349
0
                pthread_mutex_unlock(&fd->command_m);
3350
3351
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3352
0
                    hts_usleep(10000);
3353
0
                    pthread_mutex_lock(&fd->command_m);
3354
0
                    ret = -fd->errcode;
3355
                    // not empty but shutdown implies error
3356
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3357
0
                        ret = EIO;
3358
0
                    pthread_mutex_unlock(&fd->command_m);
3359
0
                }
3360
0
                if (fd->q)
3361
0
                    hts_tpool_process_shutdown(fd->q);
3362
0
            }
3363
3364
            // Wait for it to acknowledge
3365
0
            if (fd->dispatcher_set)
3366
0
                pthread_join(fd->dispatcher, NULL);
3367
0
            if (!ret) ret = -fd->errcode;
3368
0
        }
3369
3370
        // Tidy up memory
3371
0
        if (fd->q)
3372
0
            hts_tpool_process_destroy(fd->q);
3373
3374
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3375
0
            hts_tpool_destroy(fd->p);
3376
0
            fd->p = NULL;
3377
0
        }
3378
0
        pthread_mutex_destroy(&fd->lines_m);
3379
0
        pthread_mutex_destroy(&fd->command_m);
3380
0
        pthread_cond_destroy(&fd->command_c);
3381
3382
0
        sp_lines *l = fd->lines;
3383
0
        while (l) {
3384
0
            sp_lines *n = l->next;
3385
0
            free(l->data);
3386
0
            free(l);
3387
0
            l = n;
3388
0
        }
3389
3390
0
        sp_bams *b = fd->bams;
3391
0
        while (b) {
3392
0
            if (fd->curr_bam == b)
3393
0
                fd->curr_bam = NULL;
3394
0
            sp_bams *n = b->next;
3395
0
            sam_free_sp_bams(b);
3396
0
            b = n;
3397
0
        }
3398
3399
0
        if (fd->curr_bam)
3400
0
            sam_free_sp_bams(fd->curr_bam);
3401
3402
        // Decrement counter by one, maybe destroying too.
3403
        // This is to permit the caller using bam_hdr_destroy
3404
        // before sam_close without triggering decode errors
3405
        // in the background threads.
3406
0
        bam_hdr_destroy(fd->h);
3407
0
    }
3408
3409
0
    free(fp->state);
3410
0
    fp->state = NULL;
3411
0
    return ret;
3412
5.12k
}
3413
3414
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3415
0
static void cleanup_sp_lines(void *arg) {
3416
0
    sp_lines *gl = (sp_lines *)arg;
3417
0
    if (!gl) return;
3418
3419
    // Should always be true for lines passed to / from thread workers.
3420
0
    assert(gl->next == NULL);
3421
3422
0
    free(gl->data);
3423
0
    sam_free_sp_bams(gl->bams);
3424
0
    free(gl);
3425
0
}
3426
3427
// Run from one of the worker threads.
3428
// Convert a passed in array of lines to array of BAMs, returning
3429
// the result back to the thread queue.
3430
0
static void *sam_parse_worker(void *arg) {
3431
0
    sp_lines *gl = (sp_lines *)arg;
3432
0
    sp_bams *gb = NULL;
3433
0
    char *lines = gl->data;
3434
0
    int i;
3435
0
    bam1_t *b;
3436
0
    SAM_state *fd = gl->fd;
3437
3438
    // Use a block of BAM structs we had earlier if available.
3439
0
    pthread_mutex_lock(&fd->lines_m);
3440
0
    if (fd->bams) {
3441
0
        gb = fd->bams;
3442
0
        fd->bams = gb->next;
3443
0
    }
3444
0
    pthread_mutex_unlock(&fd->lines_m);
3445
3446
0
    if (gb == NULL) {
3447
0
        gb = calloc(1, sizeof(*gb));
3448
0
        if (!gb) {
3449
0
            return NULL;
3450
0
        }
3451
0
        gb->abams = 100;
3452
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3453
0
        if (!gb->bams) {
3454
0
            sam_state_err(fd, ENOMEM);
3455
0
            goto err;
3456
0
        }
3457
0
        gb->nbams = 0;
3458
0
        gb->bam_mem = 0;
3459
0
    }
3460
0
    gb->serial = gl->serial;
3461
0
    gb->next = NULL;
3462
3463
0
    b = (bam1_t *)gb->bams;
3464
0
    if (!b) {
3465
0
        sam_state_err(fd, ENOMEM);
3466
0
        goto err;
3467
0
    }
3468
3469
0
    i = 0;
3470
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3471
0
    while (cp < cp_end) {
3472
0
        if (i >= gb->abams) {
3473
0
            int old_abams = gb->abams;
3474
0
            gb->abams *= 2;
3475
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3476
0
            if (!b) {
3477
0
                gb->abams /= 2;
3478
0
                sam_state_err(fd, ENOMEM);
3479
0
                goto err;
3480
0
            }
3481
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3482
0
            gb->bams = b;
3483
0
        }
3484
3485
        // Ideally we'd get sam_parse1 to return the number of
3486
        // bytes decoded and to be able to stop on newline as
3487
        // well as \0.
3488
        //
3489
        // We can then avoid the additional strchr loop.
3490
        // It's around 6% of our CPU cost, albeit threadable.
3491
        //
3492
        // However this is an API change so for now we copy.
3493
3494
0
        char *nl = strchr(cp, '\n');
3495
0
        char *line_end;
3496
0
        if (nl) {
3497
0
            line_end = nl;
3498
0
            if (line_end > cp && *(line_end - 1) == '\r')
3499
0
                line_end--;
3500
0
            nl++;
3501
0
        } else {
3502
0
            nl = line_end = cp_end;
3503
0
        }
3504
0
        *line_end = '\0';
3505
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3506
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3507
0
            sam_state_err(fd, errno ? errno : EIO);
3508
0
            cleanup_sp_lines(gl);
3509
0
            goto err;
3510
0
        }
3511
3512
0
        cp = nl;
3513
0
        i++;
3514
0
    }
3515
0
    gb->nbams = i;
3516
3517
0
    pthread_mutex_lock(&fd->lines_m);
3518
0
    gl->next = fd->lines;
3519
0
    fd->lines = gl;
3520
0
    pthread_mutex_unlock(&fd->lines_m);
3521
0
    return gb;
3522
3523
0
 err:
3524
0
    sam_free_sp_bams(gb);
3525
0
    return NULL;
3526
0
}
3527
3528
0
// Trivial job whose NULL result acts as the end-of-file marker in the
// results queue.  The argument is unused.
static void *sam_parse_eof(void *arg) {
    (void) arg;
    return NULL;
}
3531
3532
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3533
0
static void cleanup_sp_bams(void *arg) {
3534
0
    sam_free_sp_bams((sp_bams *) arg);
3535
0
}
3536
3537
// Runs in its own thread.
3538
// Reads a block of text (SAM) and sends a new job to the thread queue to
3539
// translate this to BAM.
3540
0
static void *sam_dispatcher_read(void *vp) {
3541
0
    htsFile *fp = vp;
3542
0
    kstring_t line = {0};
3543
0
    int line_frag = 0;
3544
0
    SAM_state *fd = fp->state;
3545
0
    sp_lines *l = NULL;
3546
3547
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3548
    // matter as it will grow if necessary).
3549
0
    if (ks_resize(&line, 1000) < 0)
3550
0
        goto err;
3551
3552
0
    for (;;) {
3553
        // Check for command
3554
0
        pthread_mutex_lock(&fd->command_m);
3555
0
        switch (fd->command) {
3556
3557
0
        case SAM_CLOSE:
3558
0
            pthread_cond_signal(&fd->command_c);
3559
0
            pthread_mutex_unlock(&fd->command_m);
3560
0
            hts_tpool_process_shutdown(fd->q);
3561
0
            goto tidyup;
3562
3563
0
        default:
3564
0
            break;
3565
0
        }
3566
0
        pthread_mutex_unlock(&fd->command_m);
3567
3568
0
        pthread_mutex_lock(&fd->lines_m);
3569
0
        if (fd->lines) {
3570
            // reuse existing line buffer
3571
0
            l = fd->lines;
3572
0
            fd->lines = l->next;
3573
0
        }
3574
0
        pthread_mutex_unlock(&fd->lines_m);
3575
3576
0
        if (l == NULL) {
3577
            // none to reuse, to create a new one
3578
0
            l = calloc(1, sizeof(*l));
3579
0
            if (!l)
3580
0
                goto err;
3581
0
            l->alloc = SAM_NBYTES;
3582
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3583
0
            if (!l->data) {
3584
0
                free(l);
3585
0
                l = NULL;
3586
0
                goto err;
3587
0
            }
3588
0
            l->fd = fd;
3589
0
        }
3590
0
        l->next = NULL;
3591
3592
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3593
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3594
0
            if (!rp)
3595
0
                goto err;
3596
0
            l->alloc = line_frag+SAM_NBYTES/2;
3597
0
            l->data = rp;
3598
0
        }
3599
0
        memcpy(l->data, line.s, line_frag);
3600
3601
0
        l->data_size = line_frag;
3602
0
        ssize_t nbytes;
3603
0
    longer_line:
3604
0
        if (fp->is_bgzf)
3605
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3606
0
        else
3607
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3608
0
        if (nbytes < 0) {
3609
0
            sam_state_err(fd, errno ? errno : EIO);
3610
0
            goto err;
3611
0
        } else if (nbytes == 0)
3612
0
            break; // EOF
3613
0
        l->data_size += nbytes;
3614
3615
        // trim to last \n. Maybe \r\n, but that's still fine
3616
0
        if (nbytes == l->alloc - line_frag) {
3617
0
            char *cp_end = l->data + l->data_size;
3618
0
            char *cp = cp_end-1;
3619
3620
0
            while (cp > (char *)l->data && *cp != '\n')
3621
0
                cp--;
3622
3623
            // entire buffer is part of a single line
3624
0
            if (cp == l->data) {
3625
0
                line_frag = l->data_size;
3626
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3627
0
                if (!rp)
3628
0
                    goto err;
3629
0
                l->alloc *= 2;
3630
0
                l->data = rp;
3631
0
                assert(l->alloc >= l->data_size);
3632
0
                assert(l->alloc >= line_frag);
3633
0
                assert(l->alloc >= l->alloc - line_frag);
3634
0
                goto longer_line;
3635
0
            }
3636
0
            cp++;
3637
3638
            // line holds the remainder of our line.
3639
0
            if (ks_resize(&line, cp_end - cp) < 0)
3640
0
                goto err;
3641
0
            memcpy(line.s, cp, cp_end - cp);
3642
0
            line_frag = cp_end - cp;
3643
0
            l->data_size = l->alloc - line_frag;
3644
0
        } else {
3645
            // out of buffer
3646
0
            line_frag = 0;
3647
0
        }
3648
3649
0
        l->serial = fd->serial++;
3650
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3651
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3652
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3653
0
            goto err;
3654
0
        pthread_mutex_lock(&fd->command_m);
3655
0
        if (fd->command == SAM_CLOSE) {
3656
0
            pthread_mutex_unlock(&fd->command_m);
3657
0
            l = NULL;
3658
0
            goto tidyup;
3659
0
        }
3660
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3661
0
        pthread_mutex_unlock(&fd->command_m);
3662
0
    }
3663
3664
    // Submit a NULL sp_bams entry to act as an EOF marker
3665
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3666
0
        goto err;
3667
3668
    // At EOF, wait for close request.
3669
    // (In future if we add support for seek, this is where we need to catch it.)
3670
0
    for (;;) {
3671
0
        pthread_mutex_lock(&fd->command_m);
3672
0
        if (fd->command == SAM_NONE)
3673
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3674
0
        switch (fd->command) {
3675
0
        case SAM_CLOSE:
3676
0
            pthread_cond_signal(&fd->command_c);
3677
0
            pthread_mutex_unlock(&fd->command_m);
3678
0
            hts_tpool_process_shutdown(fd->q);
3679
0
            goto tidyup;
3680
3681
0
        default:
3682
0
            pthread_mutex_unlock(&fd->command_m);
3683
0
            break;
3684
0
        }
3685
0
    }
3686
3687
0
 tidyup:
3688
0
    pthread_mutex_lock(&fd->command_m);
3689
0
    fd->command = SAM_CLOSE_DONE;
3690
0
    pthread_cond_signal(&fd->command_c);
3691
0
    pthread_mutex_unlock(&fd->command_m);
3692
3693
0
    if (l) {
3694
0
        pthread_mutex_lock(&fd->lines_m);
3695
0
        l->next = fd->lines;
3696
0
        fd->lines = l;
3697
0
        pthread_mutex_unlock(&fd->lines_m);
3698
0
    }
3699
0
    free(line.s);
3700
3701
0
    return NULL;
3702
3703
0
 err:
3704
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3705
0
    hts_tpool_process_shutdown(fd->q);
3706
0
    goto tidyup;
3707
0
}
3708
3709
// Runs in its own thread.
3710
// Takes encoded blocks of SAM off the thread results queue and writes them
3711
// to our output stream.
3712
0
static void *sam_dispatcher_write(void *vp) {
3713
0
    htsFile *fp = vp;
3714
0
    SAM_state *fd = fp->state;
3715
0
    hts_tpool_result *r;
3716
3717
    // Iterates until result queue is shutdown, where it returns NULL.
3718
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3719
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3720
0
        if (!gl) {
3721
0
            sam_state_err(fd, ENOMEM);
3722
0
            goto err;
3723
0
        }
3724
3725
0
        if (fp->idx) {
3726
0
            sp_bams *gb = gl->bams;
3727
0
            int i = 0, count = 0;
3728
0
            while (i < gl->data_size) {
3729
0
                int j = i;
3730
0
                while (i < gl->data_size && gl->data[i] != '\n')
3731
0
                    i++;
3732
0
                if (i < gl->data_size)
3733
0
                    i++;
3734
3735
0
                if (fp->is_bgzf) {
3736
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3737
0
                        goto err;
3738
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3739
0
                        goto err;
3740
0
                } else {
3741
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3742
0
                        goto err;
3743
0
                }
3744
3745
0
                bam1_t *b = &gb->bams[count++];
3746
0
                if (fp->format.compression == bgzf) {
3747
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3748
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3749
0
                                      bgzf_tell(fp->fp.bgzf),
3750
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3751
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3752
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3753
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3754
0
                        goto err;
3755
0
                    }
3756
0
                } else {
3757
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3758
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3759
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3760
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3761
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3762
0
                        goto err;
3763
0
                    }
3764
0
                }
3765
0
            }
3766
3767
0
            assert(count == gb->nbams);
3768
3769
            // Add bam array to free-list
3770
0
            pthread_mutex_lock(&fd->lines_m);
3771
0
            gb->next = fd->bams;
3772
0
            fd->bams = gl->bams;
3773
0
            gl->bams = NULL;
3774
0
            pthread_mutex_unlock(&fd->lines_m);
3775
0
        } else {
3776
0
            if (fp->is_bgzf) {
3777
                // We keep track of how much in the current block we have
3778
                // remaining => R.  We look for the last newline in input
3779
                // [i] to [i+R], backwards => position N.
3780
                //
3781
                // If we find a newline, we write out bytes i to N.
3782
                // We know we cannot fit the next record in this bgzf block,
3783
                // so we flush what we have and copy input N to i+R into
3784
                // the start of a new block, and recompute a new R for that.
3785
                //
3786
                // If we don't find a newline (i==N) then we cannot extend
3787
                // the current block at all, so flush whatever is in it now
3788
                // if it ends on a newline.
3789
                // We still copy i(==N) to i+R to the next block and
3790
                // continue as before with a new R.
3791
                //
3792
                // The only exception on the flush is when we run out of
3793
                // data in the input.  In that case we skip it as we don't
3794
                // yet know if the next record will fit.
3795
                //
3796
                // Both conditions share the same code here:
3797
                // - Look for newline (pos N)
3798
                // - Write i to N (which maybe 0)
3799
                // - Flush if block ends on newline and not end of input
3800
                // - write N to i+R
3801
3802
0
                int i = 0;
3803
0
                BGZF *fb = fp->fp.bgzf;
3804
0
                while (i < gl->data_size) {
3805
                    // remaining space in block
3806
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3807
0
                    int eod = 0;
3808
0
                    if (R > gl->data_size-i)
3809
0
                        R = gl->data_size-i, eod = 1;
3810
3811
                    // Find last newline in input data
3812
0
                    int N = i + R;
3813
0
                    while (--N > i) {
3814
0
                        if (gl->data[N] == '\n')
3815
0
                            break;
3816
0
                    }
3817
3818
0
                    if (N != i) {
3819
                        // Found a newline
3820
0
                        N++;
3821
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3822
0
                            goto err;
3823
0
                    }
3824
3825
                    // Flush bgzf block
3826
0
                    int b_off = fb->block_offset;
3827
0
                    if (!eod && b_off &&
3828
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3829
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3830
0
                            goto err;
3831
3832
                    // Copy from N onwards into next block
3833
0
                    if (i+R > N)
3834
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3835
0
                            != i+R - N)
3836
0
                            goto err;
3837
3838
0
                    i = i+R;
3839
0
                }
3840
0
            } else {
3841
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3842
0
                    goto err;
3843
0
            }
3844
0
        }
3845
3846
0
        hts_tpool_delete_result(r, 0);
3847
3848
        // Also updated by main thread
3849
0
        pthread_mutex_lock(&fd->lines_m);
3850
0
        gl->next = fd->lines;
3851
0
        fd->lines = gl;
3852
0
        pthread_mutex_unlock(&fd->lines_m);
3853
0
    }
3854
3855
0
    sam_state_err(fd, 0); // success
3856
0
    hts_tpool_process_shutdown(fd->q);
3857
0
    return NULL;
3858
3859
0
 err:
3860
0
    sam_state_err(fd, errno ? errno : EIO);
3861
0
    return (void *)-1;
3862
0
}
3863
3864
// Run from one of the worker threads.
3865
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3866
// of text SAM records (sp_lines).
3867
0
static void *sam_format_worker(void *arg) {
3868
0
    sp_bams *gb = (sp_bams *)arg;
3869
0
    sp_lines *gl = NULL;
3870
0
    int i;
3871
0
    SAM_state *fd = gb->fd;
3872
0
    htsFile *fp = fd->fp;
3873
3874
    // Use a block of SAM strings we had earlier if available.
3875
0
    pthread_mutex_lock(&fd->lines_m);
3876
0
    if (fd->lines) {
3877
0
        gl = fd->lines;
3878
0
        fd->lines = gl->next;
3879
0
    }
3880
0
    pthread_mutex_unlock(&fd->lines_m);
3881
3882
0
    if (gl == NULL) {
3883
0
        gl = calloc(1, sizeof(*gl));
3884
0
        if (!gl) {
3885
0
            sam_state_err(fd, ENOMEM);
3886
0
            return NULL;
3887
0
        }
3888
0
        gl->alloc = gl->data_size = 0;
3889
0
        gl->data = NULL;
3890
0
    }
3891
0
    gl->serial = gb->serial;
3892
0
    gl->next = NULL;
3893
3894
0
    kstring_t ks = {0, gl->alloc, gl->data};
3895
3896
0
    for (i = 0; i < gb->nbams; i++) {
3897
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3898
0
            sam_state_err(fd, errno ? errno : EIO);
3899
0
            goto err;
3900
0
        }
3901
0
        kputc('\n', &ks);
3902
0
    }
3903
3904
0
    pthread_mutex_lock(&fd->lines_m);
3905
0
    gl->data_size = ks.l;
3906
0
    gl->alloc = ks.m;
3907
0
    gl->data = ks.s;
3908
3909
0
    if (fp->idx) {
3910
        // Keep hold of the bam array a little longer as
3911
        // sam_dispatcher_write needs to use them for building the index.
3912
0
        gl->bams = gb;
3913
0
    } else {
3914
        // Add bam array to free-list
3915
0
        gb->next = fd->bams;
3916
0
        fd->bams = gb;
3917
0
    }
3918
0
    pthread_mutex_unlock(&fd->lines_m);
3919
3920
0
    return gl;
3921
3922
0
 err:
3923
    // Possible race between this and fd->curr_bam.
3924
    // Easier to not free and leave it on the input list so it
3925
    // gets freed there instead?
3926
    // sam_free_sp_bams(gb);
3927
0
    if (gl) {
3928
0
        free(gl->data);
3929
0
        free(gl);
3930
0
    }
3931
0
    return NULL;
3932
0
}
3933
3934
0
// Attaches the thread pool described by 'p' to a SAM file handle.
//
// Does nothing (returning 0) if fp already has state attached.  Otherwise
// creates the SAM_state, initialises its mutexes / condition variable and
// creates a process queue on the pool.  A queue size of 0 in 'p' selects
// twice the pool size.  For bgzf compressed files the pool is also passed on
// to the bgzf layer, whose return value is then returned.
//
// Returns 0 on success, -1 (or the bgzf_thread_pool result) on failure.
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
    if (fp->state != NULL)
        return 0;

    SAM_state *state = sam_state_create(fp);
    fp->state = state;
    if (state == NULL)
        return -1;

    pthread_mutex_init(&state->lines_m, NULL);
    pthread_mutex_init(&state->command_m, NULL);
    pthread_cond_init(&state->command_c, NULL);
    state->p = p->pool;

    int qsize = p->qsize ? p->qsize : 2*hts_tpool_size(state->p);
    state->q = hts_tpool_process_init(state->p, qsize, 0);
    if (state->q == NULL) {
        sam_state_destroy(fp);
        return -1;
    }

    if (fp->format.compression != bgzf)
        return 0;

    return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
}
3960
3961
0
// Creates a private thread pool of 'nthreads' workers for a SAM file handle
// and attaches it with sam_set_thread_pool(), marking the pool as owned by
// the file's state.
//
// A non-positive thread count, or a handle that already has state attached,
// is a no-op.  (sam_set_thread_pool() ignores the request in the latter
// case, so creating a pool would only leak it, and the existing state must
// not be marked as owning a pool it did not create.)
//
// Returns 0 on success or no-op, negative on failure.
int sam_set_threads(htsFile *fp, int nthreads) {
    if (nthreads <= 0)
        return 0;

    if (fp->state)
        return 0;

    htsThreadPool p;
    p.pool = hts_tpool_init(nthreads);
    if (!p.pool)
        return -1;
    p.qsize = nthreads*2;

    int ret = sam_set_thread_pool(fp, &p);
    if (ret < 0) {
        // If no state was left attached then nothing refers to the new pool,
        // so it can safely be released here rather than leaked.
        if (!fp->state)
            hts_tpool_destroy(p.pool);
        return ret;
    }

    SAM_state *fd = (SAM_state *)fp->state;
    fd->own_pool = 1;

    return 0;
}
3978
3979
typedef struct {
3980
    kstring_t name;
3981
    kstring_t comment; // NB: pointer into name, do not free
3982
    kstring_t seq;
3983
    kstring_t qual;
3984
    int casava;
3985
    int aux;
3986
    int rnum;
3987
    char BC[3];         // aux tag ID for barcode
3988
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3989
    char nprefix;
3990
    int sra_names;
3991
} fastq_state;
3992
3993
// Initialise fastq state.
3994
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3995
1.02k
static fastq_state *fastq_state_init(int name_char) {
3996
1.02k
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3997
1.02k
    if (!x)
3998
0
        return NULL;
3999
1.02k
    strcpy(x->BC, "BC");
4000
1.02k
    x->nprefix = name_char;
4001
4002
1.02k
    return x;
4003
1.02k
}
4004
4005
1.36k
// Frees the fastq_state attached to fp, if any: the tag hash, the name,
// sequence and quality buffers, and the state itself.  (The comment buffer
// points into the name buffer, so it is not freed separately.)
// fp->state is reset to NULL afterwards so that a repeated call, or any
// later check of fp->state, cannot touch freed memory.
void fastq_state_destroy(htsFile *fp) {
    if (fp->state) {
        fastq_state *x = (fastq_state *)fp->state;
        if (x->tags)
            kh_destroy(tag, x->tags);
        ks_free(&x->name);
        ks_free(&x->seq);
        ks_free(&x->qual);
        free(fp->state);
        fp->state = NULL;
    }
}
4016
4017
0
// Sets one FASTA / FASTQ option on a file handle, creating the fastq_state
// on first use.
//
// Options handled:
//   FASTQ_OPT_CASAVA   no argument
//   FASTQ_OPT_NAME2    no argument
//   FASTQ_OPT_RNUM     no argument
//   FASTQ_OPT_AUX      char *: NULL or "1" selects all aux tags, otherwise a
//                      comma separated list of two-character tags to use.
//                      A malformed entry logs a warning and stops parsing
//                      the list, keeping any tags already accepted.
//   FASTQ_OPT_BARCODE  char *: the two-character aux tag used for barcodes.
// Any other option is silently ignored.
//
// Returns 0 on success; -1 if fp is NULL, memory could not be allocated, or
// FASTQ_OPT_BARCODE was given a NULL tag.
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
    va_list args;

    if (!fp)
        return -1;
    if (!fp->state)
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
                                           ? '@' : '>')))
            return -1;

    fastq_state *x = (fastq_state *)fp->state;

    switch (opt) {
    case FASTQ_OPT_CASAVA:
        x->casava = 1;
        break;

    case FASTQ_OPT_NAME2:
        x->sra_names = 1;
        break;

    case FASTQ_OPT_AUX: {
        va_start(args, opt);
        x->aux = 1;
        char *tag = va_arg(args, char *);
        va_end(args);
        if (tag && strcmp(tag, "1") != 0) {
            if (!x->tags)
                if (!(x->tags = kh_init(tag)))
                    return -1;

            // Entries are exactly two characters followed by ',' or the
            // terminating NUL, so step through the string three at a time.
            size_t i, tlen = strlen(tag);
            for (i = 0; i+3 <= tlen+1; i += 3) {
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
                    break;
                }
                int ret, tcode = tag[i+0]*256 + tag[i+1];
                kh_put(tag, x->tags, tcode, &ret);
                if (ret < 0)
                    return -1;
            }
        }
        break;
    }

    case FASTQ_OPT_BARCODE: {
        va_start(args, opt);
        char *bc = va_arg(args, char *);
        va_end(args);
        // Reject a missing tag rather than handing NULL to strncpy.
        if (!bc)
            return -1;
        strncpy(x->BC, bc, 2);
        x->BC[2] = 0;
        break;
    }

    case FASTQ_OPT_RNUM:
        x->rnum = 1;
        break;

    default:
        break;
    }
    return 0;
}
4082
4083
6.88M
// Reads one FASTA or FASTQ entry from fp and stores it in b as an unmapped
// BAM record.
//
// The name line is split at the first whitespace into the read name and a
// comment.  A trailing "/<digit>" on the name is removed and turned into
// READ1/READ2 flags.  When the CASAVA option is enabled, a comment of the
// form <read>:<is_filtered>:<control_bits>:<barcode> sets the read number
// and QC-fail flags and appends the barcode as an aux tag named x->BC.
// When the aux option is enabled the comment is also passed to aux_parse().
//
// Returns the non-negative bam_set1() result on success,
//         -1 on EOF,
//        <-1 on error (a failing hts_getline() code for the name line,
//            otherwise -2).
static int fastq_parse1(htsFile *fp, bam1_t *b) {
    fastq_state *x = (fastq_state *)fp->state;
    size_t i, l;
    int ret = 0;

    if (fp->format.format == fasta_format && fp->line.s) {
        // For FASTA we've already read the >name line; steal it
        // Not the most efficient, but we don't optimise for fasta reading.
        if (fp->line.l == 0)
            return -1; // EOF

        free(x->name.s);
        x->name = fp->line;
        fp->line.l = fp->line.m = 0;
        fp->line.s = NULL;
    } else {
        // Read a FASTQ format entry.
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
        if (ret == -1)
            return -1;  // EOF
        else if (ret < -1)
            return ret; // ERR
    }

    // Name
    if (*x->name.s != x->nprefix)
        return -2;

    // Reverse the SRA strangeness of putting the run_name.number before
    // the read name.
    i = 0;
    char *name = x->name.s+1;
    if (x->sra_names) {
        char *cp = strpbrk(x->name.s, " \t");
        if (cp) {
            while (*cp == ' ' || *cp == '\t')
                cp++;
            // Overwrite the last separator so the second word looks like
            // a normal "@name" and scanning restarts from there.
            *--cp = '@';
            i = cp - x->name.s;
            name = cp+1;
        }
    }

    // Terminate the name at the first whitespace character.
    l = x->name.l;
    char *s = x->name.s;
    while (i < l && !isspace_c(s[i]))
        i++;
    if (i < l) {
        s[i] = 0;
        x->name.l = i++;
    }

    // Comment; a kstring struct, but pointer into name line.  (Do not free)
    while (i < l && isspace_c(s[i]))
        i++;
    x->comment.s = s+i;
    x->comment.l = l - i;

    // Seq: concatenate lines until the '+' (FASTQ) or next '>' (FASTA).
    x->seq.l = 0;
    for (;;) {
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
            // EOF ends the final FASTA record, but is an error in FASTQ
            // because the quality lines are still missing.
            if (fp->format.format == fastq_format || ret < -1)
                return -2;
        if (ret == -1 ||
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
            break;
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
            return -2;
    }

    // Qual
    if (fp->format.format == fastq_format) {
        // Keep reading lines until as many quality values as bases have
        // been seen; a line overshooting that count is an error.
        size_t remainder = x->seq.l;
        x->qual.l = 0;
        do {
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
                return -2;
            if (fp->line.l > remainder)
                return -2;
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
                return -2;
            remainder -= fp->line.l;
        } while (remainder > 0);

        // Decr qual
        for (i = 0; i < x->qual.l; i++)
            x->qual.s[i] -= '!';
    }

    // A "/<digit>" name suffix gives the read number and is stripped.
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
    if (x->name.l > 2 &&
        x->name.s[x->name.l-2] == '/' &&
        isdigit_c(x->name.s[x->name.l-1])) {
        switch(x->name.s[x->name.l-1]) {
        case '1': flag |= BAM_FREAD1 | pflag; break;
        case '2': flag |= BAM_FREAD2 | pflag; break;
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }
        x->name.s[x->name.l-=2] = 0;
    }

    // Convert to BAM
    ret = bam_set1(b,
                   x->name.s + x->name.l - name, name,
                   flag,
                   -1, -1, 0, // ref '*', pos, mapq,
                   0, NULL,     // no cigar,
                   -1, -1, 0,    // mate
                   x->seq.l, x->seq.s, x->qual.s,
                   0);
    if (ret < 0) return -2;

    // Identify Illumina CASAVA strings.
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
    char *barcode = NULL;
    int barcode_len = 0;
    kstring_t *kc = &x->comment;
    char *endptr;
    if (x->casava &&
        // \d:[YN]:\d+:[ACGTN]+
        // Both separators are compared explicitly: OR-ing the two bytes
        // together would also accept characters whose bits are a subset
        // of ':' (for example '0' or '8').
        kc->l > 6 && kc->s[1] == ':' && kc->s[3] == ':' &&
        isdigit_c(kc->s[0]) &&
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
        && *endptr == ':') {

        // read num
        switch(kc->s[0]) {
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }

        if (kc->s[2] == 'Y')
            b->core.flag |= BAM_FQCFAIL;

        // Barcode, maybe numeric in which case we skip it
        if (!isdigit_c(endptr[1])) {
            barcode = endptr+1;
            for (i = barcode - kc->s; i < kc->l; i++)
                if (isspace_c(kc->s[i]))
                    break;

            kc->s[i] = 0;
            // Length includes the terminating NUL written just above.
            barcode_len = i+1-(barcode - kc->s);
        }
    }

    if (ret >= 0 && barcode_len)
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
            ret = -2;

    if (!x->aux)
        return ret;

    // Identify any SAM style aux tags in comments too.
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
        ret = -2;

    return ret;
}
4243
4244
// Internal component of sam_read1 below
4245
321
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4246
321
    int ret = bam_read1(fp->fp.bgzf, b);
4247
321
    if (h && ret >= 0) {
4248
299
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
4249
299
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
4250
1
            errno = ERANGE;
4251
1
            return -3;
4252
1
        }
4253
299
    }
4254
320
    return ret;
4255
321
}
4256
4257
// Internal component of sam_read1 below
4258
678
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4259
678
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4260
678
    if (ret < 0)
4261
678
        return cram_eof(fp->fp.cram) ? -1 : -2;
4262
4263
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4264
0
        return -2;
4265
4266
0
    return ret;
4267
0
}
4268
4269
// Internal component of sam_read1 below
4270
162k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4271
162k
    int ret;
4272
4273
    // Consume 1st line after header parsing as it wasn't using peek
4274
162k
    if (fp->line.l != 0) {
4275
3
        ret = sam_parse1(&fp->line, h, b);
4276
3
        fp->line.l = 0;
4277
3
        return ret;
4278
3
    }
4279
4280
162k
    if (fp->state) {
4281
0
        SAM_state *fd = (SAM_state *)fp->state;
4282
4283
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4284
            // We don't support multi-threaded SAM parsing with seeks yet.
4285
0
            int ret;
4286
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4287
0
                errno = -ret;
4288
0
                return -2;
4289
0
            }
4290
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4291
0
                return -2;
4292
0
            fp->fp.bgzf->seeked = 0;
4293
0
            goto err_recover;
4294
0
        }
4295
4296
0
        if (!fd->h) {
4297
0
            fd->h = h;
4298
0
            fd->h->ref_count++;
4299
            // Ensure hrecs is initialised now as we don't want multiple
4300
            // threads trying to do this simultaneously.
4301
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4302
0
                return -2;
4303
4304
            // We can only do this once we've got a header
4305
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4306
0
                               fp) != 0)
4307
0
                return -2;
4308
0
            fd->dispatcher_set = 1;
4309
0
        }
4310
4311
0
        if (fd->h != h) {
4312
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4313
0
            return -2;
4314
0
        }
4315
4316
0
        sp_bams *gb = fd->curr_bam;
4317
0
        if (!gb) {
4318
0
            if (fd->errcode) {
4319
                // In case reader failed
4320
0
                errno = fd->errcode;
4321
0
                return -2;
4322
0
            }
4323
4324
0
            pthread_mutex_lock(&fd->command_m);
4325
0
            int cmd = fd->command;
4326
0
            pthread_mutex_unlock(&fd->command_m);
4327
0
            if (cmd == SAM_AT_EOF)
4328
0
                return -1;
4329
4330
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4331
0
            if (!r)
4332
0
                return -2;
4333
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4334
0
            hts_tpool_delete_result(r, 0);
4335
0
        }
4336
0
        if (!gb) {
4337
0
            pthread_mutex_lock(&fd->command_m);
4338
0
            fd->command = SAM_AT_EOF;
4339
0
            pthread_mutex_unlock(&fd->command_m);
4340
0
            return fd->errcode ? -2 : -1;
4341
0
        }
4342
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4343
0
        if (fd->curr_idx < gb->nbams)
4344
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4345
0
                return -2;
4346
0
        if (fd->curr_idx == gb->nbams) {
4347
0
            pthread_mutex_lock(&fd->lines_m);
4348
0
            gb->next = fd->bams;
4349
0
            fd->bams = gb;
4350
0
            pthread_mutex_unlock(&fd->lines_m);
4351
4352
0
            fd->curr_bam = NULL;
4353
0
            fd->curr_idx = 0;
4354
        // Consider prefetching next record?  I.e.
4355
        // } else {
4356
        //     __builtin_prefetch(&b_array[fd->curr_idx], 0, 3);
4357
0
        }
4358
4359
0
        ret = 0;
4360
4361
162k
    } else  {
4362
162k
    err_recover:
4363
162k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4364
162k
        if (ret < 0) return ret;
4365
4366
161k
        ret = sam_parse1(&fp->line, h, b);
4367
161k
        fp->line.l = 0;
4368
161k
        if (ret < 0) {
4369
1.42k
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4370
1.42k
            if (h && h->ignore_sam_err) goto err_recover;
4371
1.42k
        }
4372
161k
    }
4373
4374
161k
    return ret;
4375
162k
}
4376
4377
// Returns 0 on success,
4378
//        -1 on EOF,
4379
//       <-1 on error
4380
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4381
7.04M
{
4382
7.04M
    int ret, pass_filter;
4383
4384
7.04M
    do {
4385
7.04M
        switch (fp->format.format) {
4386
321
        case bam:
4387
321
            ret = sam_read1_bam(fp, h, b);
4388
321
            break;
4389
4390
678
        case cram:
4391
678
            ret = sam_read1_cram(fp, h, &b);
4392
678
            break;
4393
4394
162k
        case sam:
4395
162k
            ret = sam_read1_sam(fp, h, b);
4396
162k
            break;
4397
4398
6.88M
        case fasta_format:
4399
6.88M
        case fastq_format: {
4400
6.88M
            fastq_state *x = (fastq_state *)fp->state;
4401
6.88M
            if (!x) {
4402
1.02k
                if (!(fp->state = fastq_state_init(fp->format.format
4403
1.02k
                                                   == fastq_format ? '@' : '>')))
4404
0
                    return -2;
4405
1.02k
            }
4406
4407
6.88M
            return fastq_parse1(fp, b);
4408
6.88M
        }
4409
4410
0
        case empty_format:
4411
0
            errno = EPIPE;
4412
0
            return -3;
4413
4414
0
        default:
4415
0
            errno = EFTYPE;
4416
0
            return -3;
4417
7.04M
        }
4418
4419
163k
        pass_filter = (ret >= 0 && fp->filter)
4420
163k
            ? sam_passes_filter(h, b, fp->filter)
4421
163k
            : 1;
4422
163k
    } while (pass_filter == 0);
4423
4424
163k
    return pass_filter < 0 ? -2 : ret;
4425
7.04M
}
4426
4427
// With gcc, -O3 or -ftree-loop-vectorize is really key here as otherwise
4428
// this code isn't vectorised and runs far slower than is necessary (even
4429
// with the restrict keyword being used).
4430
static inline void HTS_OPT3
4431
170
add33(uint8_t *a, const uint8_t * b, int32_t len) {
4432
170
    uint32_t i;
4433
70.6k
    for (i = 0; i < len; i++)
4434
70.5k
        a[i] = b[i]+33;
4435
170
}
4436
4437
// Appends the SAM text form of record b to str, without a trailing newline.
// Reference names are looked up in h->target_name.
//
// Returns the resulting str->l on success.  Returns -1 on failure: with
// errno EINVAL for corrupted aux data, errno ENOMEM when growing str fails,
// and without touching errno when the record has a zero-length name field.
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    const bam1_core_t *core = &b->core;
    uint8_t *aux, *aux_end;
    int k, err = 0;

    if (core->l_qname == 0)
        return -1;

    // Query name, minus its NUL padding
    err |= kputsn_(bam_get_qname(b), core->l_qname-1-core->l_extranul, str);
    err |= kputc_('\t', str);

    // Flag
    err |= kputw(core->flag, str);
    err |= kputc_('\t', str);

    // Reference name
    if (core->tid < 0) {
        err |= kputsn_("*\t", 2, str);
    } else {
        err |= kputs(h->target_name[core->tid] , str);
        err |= kputc_('\t', str);
    }

    // Position (stored 0-based, printed 1-based)
    err |= kputll(core->pos + 1, str);
    err |= kputc_('\t', str);

    // Mapping quality
    err |= kputw(core->qual, str);
    err |= kputc_('\t', str);

    // Cigar
    if (core->n_cigar == 0) {
        err |= kputc_('*', str);
    } else {
        uint32_t *ops = bam_get_cigar(b);
        for (k = 0; k < core->n_cigar; ++k) {
            err |= kputw(bam_cigar_oplen(ops[k]), str);
            err |= kputc_(bam_cigar_opchr(ops[k]), str);
        }
    }
    err |= kputc_('\t', str);

    // Mate reference name; "=" when it matches the read's own reference
    if (core->mtid < 0) {
        err |= kputsn_("*\t", 2, str);
    } else if (core->mtid == core->tid) {
        err |= kputsn_("=\t", 2, str);
    } else {
        err |= kputs(h->target_name[core->mtid], str);
        err |= kputc_('\t', str);
    }

    // Mate position
    err |= kputll(core->mpos + 1, str);
    err |= kputc_('\t', str);

    // Template length
    err |= kputll(core->isize, str);
    err |= kputc_('\t', str);

    // Sequence and quality
    if (core->l_qseq == 0) {
        err |= kputsn_("*\t*", 3, str);
    } else {
        uint8_t *packed = bam_get_seq(b);
        // Reserve room for both fields up front, then write in place.
        if (ks_resize(str, str->l+2+2*core->l_qseq) < 0) goto mem_err;
        char *out = str->s + str->l;

        // Sequence, 2 bases at a time
        nibble2base(packed, out, core->l_qseq);
        out[core->l_qseq] = '\t';
        out += core->l_qseq+1;

        // Quality; a leading 0xff byte marks it as absent
        uint8_t *q = bam_get_qual(b);
        if (q[0] == 0xff) {
            out[0] = '*';
            k = 1;
        } else {
            add33((uint8_t *)out, q, core->l_qseq); // out[k] = q[k]+33;
            k = core->l_qseq;
        }
        out[k] = 0;
        str->l = (out + k) - str->s;
    }

    // Aux fields
    aux = bam_get_aux(b);
    aux_end = b->data + b->l_data;

    while (aux_end - aux >= 4) {
        err |= kputc_('\t', str);
        aux = (uint8_t *)sam_format_aux1(aux, aux[2], aux+3, aux_end, str);
        if (aux == NULL)
            goto bad_aux;
    }

    err |= kputsn("", 0, str); // nul terminate
    if (err < 0) goto mem_err;

    return str->l;

 bad_aux:
    hts_log_error("Corrupted aux data for read %.*s flag %d",
                  b->core.l_qname, bam_get_qname(b), b->core.flag);
    errno = EINVAL;
    return -1;

 mem_err:
    hts_log_error("Out of memory");
    errno = ENOMEM;
    return -1;
}
4518
4519
// Formats record b as a SAM text line into str, replacing whatever str held
// before.  The return value is that of sam_format1_append().
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    int formatted;

    // Restart from an empty string; the existing allocation is reused.
    str->l = 0;
    formatted = sam_format1_append(h, b, str);
    return formatted;
}
4524
4525
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4526
// Formats record b as one FASTA or FASTQ entry into str, replacing its
// previous contents.
//
// x selects the output options and must not be NULL: nprefix ('@' writes
// the "+" and quality lines), rnum (append /1 or /2 to the names of paired
// reads), casava (append an Illumina CASAVA comment), and aux/tags (append
// aux fields to the name line).  Reads flagged BAM_FREVERSE are written
// reverse-complemented with their qualities reversed.
//
// Returns the length of str on success, or -1 on failure (errno is set to
// EINVAL when x is NULL).
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
{
    unsigned flag = b->core.flag;
    int i, e = 0, len = b->core.l_qseq;
    uint8_t *seq, *qual;

    // x is dereferenced throughout, so reject NULL once here instead of
    // testing it only in some of the sections below.
    if (!x) {
        errno = EINVAL;
        return -1;
    }

    str->l = 0;

    // Name
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
        return -1;

    // /1 or /2 suffix
    if (x->rnum && (flag & BAM_FPAIRED)) {
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
        if (r12 == BAM_FREAD1) {
            if (kputs("/1", str) == EOF)
                return -1;
        } else if (r12 == BAM_FREAD2) {
            if (kputs("/2", str) == EOF)
                return -1;
        }
    }

    // Illumina CASAVA tag.
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
    if (x->casava) {
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
        uint8_t *bc = bam_aux_get(b, x->BC);

        // Validate the tag before formatting it: only a 'Z' value that
        // starts with a letter may be printed with %s.  Other types hold
        // binary data, and an empty string would make the old
        // "print then truncate" arithmetic wrap around.
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
            bc = NULL;
        }

        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
                     bc ? (char *)bc+1 : "0") < 0)
            return -1;

        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
        if (bc) {
            int l = strlen((char *)bc+1);
            char *c = (char *)str->s + str->l - l;
            for (i = 0; i < l; i++) {
                if (!isalpha_c(c[i]))
                    c[i] = '+';
                else if (islower_c(c[i]))
                    c[i] = toupper_c(c[i]);
            }
        }
    }

    // Aux tags
    if (x->aux) {
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
        while (s && end - s >= 4) {
            int tt = s[0]*256 + s[1];
            // A NULL tag table means "emit every tag"; otherwise only
            // the tags listed in it are written.
            if (x->tags == NULL ||
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
                e |= kputc_('\t', str) < 0;
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
                    return -1;
            } else {
                s = skip_aux(s+2, end);
            }
        }
        e |= kputsn("", 0, str) < 0; // nul terminate
    }

    // Reserve room for the rest of the entry in one go.
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
    e |= kputc_('\n', str) < 0;

    // Seq line
    seq = bam_get_seq(b);
    if (flag & BAM_FREVERSE)
        // The table is indexed by base code and yields the complement.
        for (i = len-1; i >= 0; i--)
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
    else
        for (i = 0; i < len; i++)
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;


    // Qual line
    if (x->nprefix == '@') {
        e |= kputsn("\n+\n", 3, str) < 0;
        qual = bam_get_qual(b);
        if (qual[0] == 0xff)
            // Qualities absent: write a placeholder 'B' for every base.
            for (i = 0; i < len; i++)
                e |= kputc_('B', str) < 0;
        else if (flag & BAM_FREVERSE)
            for (i = len-1; i >= 0; i--)
                e |= kputc_(33 + qual[i], str) < 0;
        else
            for (i = 0; i < len; i++)
                e |= kputc_(33 + qual[i], str) < 0;

    }
    e |= kputc('\n', str) < 0;

    return e ? -1 : str->l;
}
4630
4631
// Sadly we need to be able to modify the bam_hdr here so we can
4632
// reference count the structure.
4633
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4634
7.03M
{
4635
7.03M
    switch (fp->format.format) {
4636
0
    case binary_format:
4637
0
        fp->format.category = sequence_data;
4638
0
        fp->format.format = bam;
4639
        /* fall-through */
4640
2.34M
    case bam:
4641
2.34M
        return bam_write_idx1(fp, h, b);
4642
4643
2.34M
    case cram:
4644
2.34M
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4645
4646
0
    case text_format:
4647
0
        fp->format.category = sequence_data;
4648
0
        fp->format.format = sam;
4649
        /* fall-through */
4650
2.34M
    case sam:
4651
2.34M
        if (fp->state) {
4652
0
            SAM_state *fd = (SAM_state *)fp->state;
4653
4654
            // Threaded output
4655
0
            if (!fd->h) {
4656
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4657
                // just data pointed to by it (which is a bit weasely still),
4658
                // but out cached pointer must be non-const as we want to
4659
                // destroy it later on and sam_hdr_destroy takes non-const.
4660
                //
4661
                // We do this because some tools do sam_hdr_destroy; sam_close
4662
                // while others do sam_close; sam_hdr_destroy.  The former is
4663
                // an issue as we need the header still when flushing.
4664
0
                fd->h = (sam_hdr_t *)h;
4665
0
                fd->h->ref_count++;
4666
4667
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4668
0
                                   fp) != 0)
4669
0
                    return -2;
4670
0
                fd->dispatcher_set = 1;
4671
0
            }
4672
4673
0
            if (fd->h != h) {
4674
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4675
0
                return -2;
4676
0
            }
4677
4678
            // Find a suitable BAM array to copy to
4679
0
            sp_bams *gb = fd->curr_bam;
4680
0
            if (!gb) {
4681
0
                pthread_mutex_lock(&fd->lines_m);
4682
0
                if (fd->bams) {
4683
0
                    fd->curr_bam = gb = fd->bams;
4684
0
                    fd->bams = gb->next;
4685
0
                    gb->next = NULL;
4686
0
                    gb->nbams = 0;
4687
0
                    gb->bam_mem = 0;
4688
0
                    pthread_mutex_unlock(&fd->lines_m);
4689
0
                } else {
4690
0
                    pthread_mutex_unlock(&fd->lines_m);
4691
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4692
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4693
0
                        free(gb);
4694
0
                        return -1;
4695
0
                    }
4696
0
                    gb->nbams = 0;
4697
0
                    gb->abams = SAM_NBAM;
4698
0
                    gb->bam_mem = 0;
4699
0
                    gb->fd = fd;
4700
0
                    fd->curr_idx = 0;
4701
0
                    fd->curr_bam = gb;
4702
0
                }
4703
0
            }
4704
4705
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4706
0
                return -2;
4707
0
            gb->bam_mem += b->l_data + sizeof(*b);
4708
4709
            // Dispatch if full
4710
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4711
0
                gb->serial = fd->serial++;
4712
0
                pthread_mutex_lock(&fd->command_m);
4713
0
                if (fd->errcode != 0) {
4714
0
                    pthread_mutex_unlock(&fd->command_m);
4715
0
                    return -fd->errcode;
4716
0
                }
4717
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4718
0
                                        cleanup_sp_bams,
4719
0
                                        cleanup_sp_lines, 0) < 0) {
4720
0
                    pthread_mutex_unlock(&fd->command_m);
4721
0
                    return -1;
4722
0
                }
4723
0
                pthread_mutex_unlock(&fd->command_m);
4724
0
                fd->curr_bam = NULL;
4725
0
            }
4726
4727
            // Dummy value as we don't know how long it really is.
4728
            // We could track file sizes via a SAM_state field, but I don't think
4729
            // it is necessary.
4730
0
            return 1;
4731
2.34M
        } else {
4732
2.34M
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4733
2.34M
            kputc('\n', &fp->line);
4734
2.34M
            if (fp->is_bgzf) {
4735
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4736
0
                    return -1;
4737
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4738
2.34M
            } else {
4739
2.34M
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4740
2.34M
            }
4741
4742
2.34M
            if (fp->idx) {
4743
0
                if (fp->format.compression == bgzf) {
4744
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4745
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4746
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4747
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4748
0
                        return -1;
4749
0
                    }
4750
0
                } else {
4751
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4752
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4753
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4754
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4755
0
                        return -1;
4756
0
                    }
4757
0
                }
4758
0
            }
4759
4760
2.34M
            return fp->line.l;
4761
2.34M
        }
4762
4763
4764
0
    case fasta_format:
4765
0
    case fastq_format: {
4766
0
        fastq_state *x = (fastq_state *)fp->state;
4767
0
        if (!x) {
4768
0
            if (!(fp->state = fastq_state_init(fp->format.format
4769
0
                                               == fastq_format ? '@' : '>')))
4770
0
                return -2;
4771
0
        }
4772
4773
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4774
0
            return -1;
4775
0
        if (fp->is_bgzf) {
4776
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4777
0
                return -1;
4778
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4779
0
                return -1;
4780
0
        } else {
4781
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4782
0
                return -1;
4783
0
        }
4784
0
        return fp->line.l;
4785
0
    }
4786
4787
0
    default:
4788
0
        errno = EBADF;
4789
0
        return -1;
4790
7.03M
    }
4791
7.03M
}
4792
4793
/************************
4794
 *** Auxiliary fields ***
4795
 ************************/
4796
#ifndef HTS_LITTLE_ENDIAN
4797
// Copies len bytes of aux value data of the given type from in to out,
// passing each multi-byte element through the matching uNN_to_le() helper
// (presumably host byte order to little-endian; only built when
// HTS_LITTLE_ENDIAN is not defined).
// 'B' arrays are handled by converting the 32-bit element count and then
// recursing on the payload with the array's element type.
// Returns 0 on success, or -1 for an unknown type code, a length that is
// not a multiple of the element size, or a 'B' value shorter than 5 bytes.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    int elem_size = aux_type2size(type);
    size_t off;

    if (elem_size >= 2 && elem_size <= 8 && (len & (elem_size - 1)) != 0)
        return -1;

    switch (elem_size) {
        case 'H': case 'Z': case 1:  // Trivial
            memcpy(out, in, len);
            return 0;

        case 2:
            for (off = 0; off < len; off += sizeof(uint16_t)) {
                uint16_t v;
                memcpy(&v, in + off, sizeof(uint16_t));
                u16_to_le(v, out + off);
            }
            return 0;

        case 4:
            for (off = 0; off < len; off += sizeof(uint32_t)) {
                uint32_t v;
                memcpy(&v, in + off, sizeof(uint32_t));
                u32_to_le(v, out + off);
            }
            return 0;

        case 8:
            for (off = 0; off < len; off += sizeof(uint64_t)) {
                uint64_t v;
                memcpy(&v, in + off, sizeof(uint64_t));
                u64_to_le(v, out + off);
            }
            return 0;

        case 'B': { // Recurse!
            uint32_t count;
            if (len < 5)
                return -1;
            // Layout: element type byte, 32-bit count, then the elements.
            memcpy(&count, in + 1, 4);
            out[0] = in[0];
            u32_to_le(count, out + 1);
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
        }

        default: // Unknown type code
            return -1;
    }
}
4839
#endif
4840
4841
// Appends an aux field to the end of record b.
//
//   tag   two-character tag name
//   type  one-character type code stored after the tag
//   len   number of bytes of value data; must not be negative
//   data  the value bytes to copy in
//
// Returns 0 on success.  Returns -1 on failure: errno is EINVAL when len is
// negative (or, on big-endian builds, when the value cannot be converted),
// and ENOMEM when the new record size would exceed INT32_MAX.  A failure in
// realloc_bam_data() also returns -1.
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
{
    uint32_t new_len;

    assert(b->l_data >= 0);

    // A small negative len (-1 or -2) would slip through the size check
    // below and then reach memcpy() as an enormous unsigned size.
    if (len < 0) {
        errno = EINVAL;
        return -1;
    }

    // Sum in unsigned arithmetic so the addition itself cannot overflow a
    // signed int; both operands are known to be non-negative here.
    new_len = (uint32_t)b->l_data + 3 + (uint32_t)len;
    if (new_len > INT32_MAX || new_len < (uint32_t)b->l_data) goto nomem;

    if (realloc_bam_data(b, new_len) < 0) return -1;

    b->data[b->l_data] = tag[0];
    b->data[b->l_data + 1] = tag[1];
    b->data[b->l_data + 2] = type;

#ifdef HTS_LITTLE_ENDIAN
    memcpy(b->data + b->l_data + 3, data, len);
#else
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
        errno = EINVAL;
        return -1;
    }
#endif

    b->l_data = new_len;

    return 0;

 nomem:
    errno = ENOMEM;
    return -1;
}
4872
4873
/*
 * Find the end of a single aux field value.
 *
 * s    Points at the type byte of the field
 * end  One past the last byte of the record's data
 *
 * Returns a pointer just past the value.  Returns 'end' if s is already at
 * or beyond it, or if a 'Z'/'H' string has no terminating NUL before 'end'.
 * Returns NULL if the type code is not recognised or the value would extend
 * past 'end'.
 */
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    uint64_t array_bytes;

    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        s = memchr(s, 0, end-s);
        return s ? s+1 : end;
    case 'B':
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        if (size == 0) return NULL;
        // Multiply in 64 bits: a 32-bit product could wrap for a huge
        // element count and let a truncated array pass the bounds check.
        array_bytes = (uint64_t) size * n;
        if ((uint64_t) (end - s) < array_bytes) return NULL;
        return s + array_bytes;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4898
4899
uint8_t *bam_aux_first(const bam1_t *b)
4900
2.44M
{
4901
2.44M
    uint8_t *s = bam_get_aux(b);
4902
2.44M
    uint8_t *end = b->data + b->l_data;
4903
2.44M
    if (end - s <= 2) { errno = ENOENT; return NULL; }
4904
128k
    return s+2;
4905
2.44M
}
4906
4907
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4908
1.41M
{
4909
1.41M
    uint8_t *end = b->data + b->l_data;
4910
1.41M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4911
1.41M
    if (next == NULL) goto bad_aux;
4912
1.41M
    if (end - next <= 2) { errno = ENOENT; return NULL; }
4913
1.33M
    return next+2;
4914
4915
6
 bad_aux:
4916
6
    hts_log_error("Corrupted aux data for read %s flag %d",
4917
6
                  bam_get_qname(b), b->core.flag);
4918
6
    errno = EINVAL;
4919
6
    return NULL;
4920
1.41M
}
4921
4922
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4923
2.44M
{
4924
2.44M
    uint8_t *s;
4925
3.85M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4926
1.46M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4927
            // Check the tag value is valid and complete
4928
48.3k
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4929
48.3k
            if (e == NULL) goto bad_aux;
4930
48.3k
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4931
4932
48.3k
            return s;
4933
48.3k
        }
4934
4935
    // errno now as set by bam_aux_first()/bam_aux_next()
4936
2.39M
    return NULL;
4937
4938
0
 bad_aux:
4939
0
    hts_log_error("Corrupted aux data for read %s flag %d",
4940
0
                  bam_get_qname(b), b->core.flag);
4941
0
    errno = EINVAL;
4942
0
    return NULL;
4943
2.44M
}
4944
4945
/*
 * Delete the aux field whose type byte is at 's'.
 *
 * Returns 0 if the field was removed, -1 if bam_aux_remove() failed for any
 * reason other than ENOENT.
 */
int bam_aux_del(bam1_t *b, uint8_t *s)
{
    if (bam_aux_remove(b, s) != NULL)
        return 0;

    // bam_aux_remove() reports ENOENT when nothing followed the removed field
    return errno == ENOENT ? 0 : -1;
}
4950
4951
/*
 * Remove the aux field whose type byte is at 's', closing the gap.
 *
 * Returns 's', which then addresses the type byte of the field that used to
 * follow.  Returns NULL with errno ENOENT if the removed field was the last
 * one, or NULL with errno EINVAL (error logged) if the field could not be
 * skipped; in the latter case the record is left untouched.
 */
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
{
    uint8_t *data_end = b->data + b->l_data;
    uint8_t *following = skip_aux(s, data_end);
    uint8_t *tag_start;

    if (following == NULL) {
        hts_log_error("Corrupted aux data for read %s flag %d",
                      bam_get_qname(b), b->core.flag);
        errno = EINVAL;
        return NULL;
    }

    // The field starts at its two-character tag name
    tag_start = s - 2;
    b->l_data -= following - tag_start;

    if (following >= data_end) {
        errno = ENOENT;
        return NULL;
    }

    memmove(tag_start, following, data_end - following);
    return s;
}
4969
4970
/*
 * Set string tag 'tag' to the first 'len' bytes of 'data', replacing an
 * existing 'Z' value in place or appending a new field.
 *
 * A negative 'len' means strlen(data) + 1, i.e. the terminator is included.
 * A NUL is appended when the copied bytes do not already end with one.
 *
 * Returns 0 on success, -1 on failure.  errno is EINVAL if the tag exists
 * with a type other than 'Z'; other failures leave errno as set by
 * bam_aux_get() or possibly_expand_bam_data().
 */
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
{
    size_t new_len = len >= 0 ? len : strlen(data) + 1;
    size_t old_len = 0;
    size_t needed;
    int add_nul = new_len == 0 || data[new_len - 1] != '\0';
    int saved_errno = errno;
    int header = 0;    // tag name + type bytes to add when the tag is new
    uint8_t *field = bam_aux_get(b, tag);

    if (field != NULL) {
        // Replacing an existing tag
        char type = *field;
        uint8_t *data_end, *nul;

        if (type != 'Z') {
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
            errno = EINVAL;
            return -1;
        }

        field++;                        // step on to the string itself
        data_end = b->data + b->l_data;
        nul = memchr(field, '\0', data_end - field);
        old_len = (nul ? nul - field : data_end - field) + 1;
        field -= 3;                     // back to the start of the tag name
    } else if (errno == ENOENT) {
        // Tag doesn't exist - put it on the end
        errno = saved_errno;
        field = b->data + b->l_data;
        header = 3;
    } else {
        // Invalid aux data, give up
        return -1;
    }

    needed = new_len + add_nul + header;
    if (old_len < needed) {
        // b->data may move when it grows, so carry the position as an offset
        ptrdiff_t field_offset = field - b->data;
        if (possibly_expand_bam_data(b, needed - old_len) < 0)
            return -1;
        field = b->data + field_offset;
    }

    if (header == 0) {
        // Slide everything after the old value to where the new one ends
        memmove(field + 3 + new_len + add_nul,
                field + 3 + old_len,
                b->l_data - (field + 3 - b->data) - old_len);
    }
    b->l_data += needed - old_len;

    field[0] = tag[0];
    field[1] = tag[1];
    field[2] = 'Z';
    memmove(field + 3, data, new_len);
    if (add_nul)
        field[3 + new_len] = '\0';

    return 0;
}
5021
5022
/*
 * Set integer tag 'tag' to 'val', replacing an existing integer field or
 * appending a new one.
 *
 * The narrowest type is chosen from the comparisons below; values equal to
 * UINT8_MAX or UINT16_MAX take the next wider type.  An existing field that
 * is already wide enough keeps its size and only has its signedness changed
 * to suit the sign of 'val'.
 *
 * Returns 0 on success, -1 on failure.  errno is EOVERFLOW if 'val' is
 * outside INT32_MIN..UINT32_MAX and EINVAL if the tag exists with a
 * non-integer type; other failures leave errno as set by bam_aux_get() or
 * possibly_expand_bam_data().
 */
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
{
    uint32_t sz, old_sz = 0, is_new = 0;
    uint8_t *s, type;

    if (val < INT32_MIN || val > UINT32_MAX) {
        errno = EOVERFLOW;
        return -1;
    }

    if (val < 0) {
        if (val < INT16_MIN)       { type = 'i'; sz = 4; }
        else if (val < INT8_MIN)   { type = 's'; sz = 2; }
        else                       { type = 'c'; sz = 1; }
    } else {
        if (val < UINT8_MAX)       { type = 'C'; sz = 1; }
        else if (val < UINT16_MAX) { type = 'S'; sz = 2; }
        else                       { type = 'I'; sz = 4; }
    }

    s = bam_aux_get(b, tag);
    if (s == NULL) {
        if (errno != ENOENT)   // Invalid aux data, give up.
            return -1;
        // Tag doesn't exist - add a new one
        s = b->data + b->l_data;
        is_new = 1;
    } else if (*s == 'c' || *s == 'C') {
        old_sz = 1;
    } else if (*s == 's' || *s == 'S') {
        old_sz = 2;
    } else if (*s == 'i' || *s == 'I') {
        old_sz = 4;
    } else {
        errno = EINVAL;        // Not an integer
        return -1;
    }

    if (!is_new && old_sz >= sz) {
        // Reuse old space.  Data value may be bigger than necessary but
        // we avoid having to move everything else
        sz = old_sz;
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
        assert(type > 0);
    } else {
        // Make room for new tag
        ptrdiff_t s_offset = s - b->data;
        if (possibly_expand_bam_data(b, (is_new ? 3 : 0) + sz - old_sz) < 0)
            return -1;
        s = b->data + s_offset;
        if (is_new) {
            // Add tag id
            *s++ = tag[0];
            *s++ = tag[1];
        } else {
            // Shift following data so we have space
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
        }
    }

    *s++ = type;
#ifdef HTS_LITTLE_ENDIAN
    memcpy(s, &val, sz);
#else
    switch (sz) {
        case 4:  u32_to_le(val, s); break;
        case 2:  u16_to_le(val, s); break;
        default: *s = val; break;
    }
#endif
    b->l_data += (is_new ? 3 : 0) + sz - old_sz;
    return 0;
}
5087
5088
/*
 * Set floating-point tag 'tag' to 'val', replacing an existing 'f' or 'd'
 * field or appending a new 'f' field.  An existing 'd' field is shrunk to
 * a four-byte 'f'.
 *
 * Returns 0 on success, -1 on failure.  errno is EINVAL if the tag exists
 * with a type other than 'f' or 'd'; other failures leave errno as set by
 * bam_aux_get() or possibly_expand_bam_data().
 */
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
{
    uint8_t *field = bam_aux_get(b, tag);

    if (field == NULL) {
        if (errno != ENOENT)   // Invalid aux data, give up.
            return -1;

        // Tag doesn't exist - add a new one: 2 name bytes, type, 4 value bytes
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
            return -1;
        field = b->data + b->l_data;
        *field++ = tag[0];
        *field++ = tag[1];
        *field++ = 'f';
        float_to_le(val, field);
        b->l_data += 7;
        return 0;
    }

    // Tag present - what was it?
    switch (*field) {
        case 'f':
            break;
        case 'd':
            // Convert non-standard double tag to float: pull the data after
            // the 8-byte value back so that it follows a 4-byte value
            memmove(field + 5, field + 9, b->l_data - ((field + 9) - b->data));
            b->l_data -= 4;
            break;
        default:
            errno = EINVAL;    // Not a float
            return -1;
    }

    *field++ = 'f';
    float_to_le(val, field);
    return 0;
}
5123
5124
/*
 * Set array tag 'tag' to 'items' elements of type 'type' taken from 'data',
 * replacing an existing 'B' field or appending a new one.
 *
 * Element types are accepted when aux_type2size() reports a size of 1 to 4
 * bytes.  'data' holds the elements in host byte order.
 *
 * Returns 0 on success, -1 on failure.  errno is EINVAL if the tag exists
 * but is not a 'B' array or an element size is out of range, and ENOMEM if
 * the array would exceed INT32_MAX bytes; other failures leave errno as set
 * by bam_aux_get() or possibly_expand_bam_data().
 */
int bam_aux_update_array(bam1_t *b, const char tag[2],
                         uint8_t type, uint32_t items, void *data)
{
    uint8_t *field = bam_aux_get(b, tag);
    size_t old_bytes = 0, new_bytes;
    int is_new = 0;

    if (field == NULL) {
        if (errno != ENOENT)   // Invalid aux data, give up.
            return -1;
        // Tag doesn't exist - add a new one
        field = b->data + b->l_data;
        is_new = 1;
    } else {
        // Tag present
        if (*field != 'B') { errno = EINVAL; return -1; }
        old_bytes = aux_type2size(field[1]);
        if (old_bytes < 1 || old_bytes > 4) { errno = EINVAL; return -1; }
        old_bytes *= le_to_u32(field + 2);
    }

    new_bytes = aux_type2size(type);
    if (new_bytes < 1 || new_bytes > 4) { errno = EINVAL; return -1; }
    if (items > INT32_MAX / new_bytes) { errno = ENOMEM; return -1; }
    new_bytes *= items;

    if (is_new || old_bytes < new_bytes) {
        // Make room for new tag; b->data may move, so keep an offset
        ptrdiff_t field_offset = field - b->data;
        if (possibly_expand_bam_data(b, (is_new ? 8 : 0) + new_bytes - old_bytes) < 0)
            return -1;
        field = b->data + field_offset;
    }

    if (is_new) {
        // Add tag id and type; 8 = name (2) + 'B' + subtype + count (4)
        *field++ = tag[0];
        *field++ = tag[1];
        *field = 'B';
        b->l_data += 8 + new_bytes;
    } else if (old_bytes != new_bytes) {
        // shift following data if necessary
        memmove(field + 6 + new_bytes, field + 6 + old_bytes,
                b->l_data - ((field + 6 + old_bytes) - b->data));
        b->l_data -= old_bytes;
        b->l_data += new_bytes;
    }

    field[1] = type;
    u32_to_le(items, field + 2);
    if (new_bytes == 0)
        return 0;

#ifdef HTS_LITTLE_ENDIAN
    memcpy(field + 6, data, new_bytes);
    return 0;
#else
    return aux_to_le(type, field + 6, data, new_bytes);
#endif
}
5180
5181
/*
 * Read element 'idx' of a run of little-endian integers of the given aux
 * type starting at 's'.
 *
 * Returns the value widened to int64_t, or 0 with errno set to EINVAL if
 * 'type' is not one of c, C, s, S, i, I.
 */
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    errno = EINVAL;
    return 0;
}
5196
5197
/*
 * Return the integer value of the aux field whose type byte is at 's'.
 * Non-integer types give 0 with errno set to EINVAL (see get_int_aux_val()).
 */
int64_t bam_aux2i(const uint8_t *s)
{
    // The value starts on the byte after the type code
    return get_int_aux_val(s[0], s + 1, 0);
}
5203
5204
/*
 * Return the value of the aux field whose type byte is at 's' as a double.
 * 'd' and 'f' values are decoded directly; anything else is handed to
 * get_int_aux_val(), which sets errno to EINVAL for non-integer types.
 */
double bam_aux2f(const uint8_t *s)
{
    const uint8_t *value = s + 1;

    switch (s[0]) {
    case 'd':
        return le_to_double(value);
    case 'f':
        return le_to_float(value);
    default:
        return get_int_aux_val(s[0], value, 0);
    }
}
5212
5213
/*
 * Return the character held by the 'A' aux field whose type byte is at 's'.
 * Any other type gives 0 with errno set to EINVAL.
 */
char bam_aux2A(const uint8_t *s)
{
    const uint8_t *value = s + 1;

    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }
    return *(const char *) value;
}
5221
5222
/*
 * Return a pointer to the string held by the 'Z' or 'H' aux field whose
 * type byte is at 's'.  The pointer addresses the field data itself; no
 * copy is made.  Any other type gives NULL with errno set to EINVAL.
 */
char *bam_aux2Z(const uint8_t *s)
{
    if (s[0] != 'Z' && s[0] != 'H') {
        errno = EINVAL;
        return 0;
    }
    // The string starts on the byte after the type code
    return (char *) (s + 1);
}
5230
5231
/*
 * Return the element count stored in the 'B' array field whose type byte is
 * at 's'.  Any other type gives 0 with errno set to EINVAL.
 */
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);   // count follows the 'B' and subtype bytes

    errno = EINVAL;
    return 0;
}
5239
5240
/*
 * Return element 'idx' of the integer 'B' array field whose type byte is at
 * 's'.  Gives 0 with errno set to ERANGE when 'idx' is not below the length
 * reported by bam_auxB_len(); a non-integer subtype gives 0 with errno set
 * to EINVAL by get_int_aux_val().
 */
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    if (idx >= bam_auxB_len(s)) {
        errno = ERANGE;
        return 0;
    }
    // s[1] is the element type; elements start after the 4-byte count
    return get_int_aux_val(s[1], s + 6, idx);
}
5249
5250
/*
 * Return element 'idx' of the 'B' array field whose type byte is at 's' as
 * a double.  Subtype 'f' is decoded as a float; other subtypes are handed
 * to get_int_aux_val().  Gives 0.0 with errno set to ERANGE when 'idx' is
 * not below the length reported by bam_auxB_len().
 */
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    if (idx >= bam_auxB_len(s)) {
        errno = ERANGE;
        return 0.0;
    }
    if (s[1] != 'f')
        return get_int_aux_val(s[1], s + 6, idx);

    return le_to_float(s + 6 + 4 * idx);
}
5260
5261
int sam_open_mode(char *mode, const char *fn, const char *format)
5262
0
{
5263
    // TODO Parse "bam5" etc for compression level
5264
0
    if (format == NULL) {
5265
        // Try to pick a format based on the filename extension
5266
0
        char extension[HTS_MAX_EXT_LEN];
5267
0
        if (find_file_extension(fn, extension) < 0) return -1;
5268
0
        return sam_open_mode(mode, fn, extension);
5269
0
    }
5270
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
5271
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
5272
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
5273
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
5274
0
    else if (strcasecmp(format, "fastq") == 0 ||
5275
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
5276
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5277
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5278
0
    else if (strcasecmp(format, "fasta") == 0 ||
5279
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5280
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5281
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5282
0
    else return -1;
5283
5284
0
    return 0;
5285
0
}
5286
5287
// A version of sam_open_mode that can handle ,key=value options.
5288
// The format string is allocated and returned, to be freed by the caller.
5289
// Prefix should be "r" or "w",
5290
char *sam_open_mode_opts(const char *fn,
5291
                         const char *mode,
5292
                         const char *format)
5293
0
{
5294
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5295
0
                             (mode   ? strlen(mode)   : 1) + 12);
5296
0
    char *opts, *cp;
5297
0
    int format_len;
5298
5299
0
    if (!mode_opts)
5300
0
        return NULL;
5301
5302
0
    strcpy(mode_opts, mode ? mode : "r");
5303
0
    cp = mode_opts + strlen(mode_opts);
5304
5305
0
    if (format == NULL) {
5306
        // Try to pick a format based on the filename extension
5307
0
        char extension[HTS_MAX_EXT_LEN];
5308
0
        if (find_file_extension(fn, extension) < 0) {
5309
0
            free(mode_opts);
5310
0
            return NULL;
5311
0
        }
5312
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5313
0
            return mode_opts;
5314
0
        } else {
5315
0
            free(mode_opts);
5316
0
            return NULL;
5317
0
        }
5318
0
    }
5319
5320
0
    if ((opts = strchr(format, ','))) {
5321
0
        format_len = opts-format;
5322
0
    } else {
5323
0
        opts="";
5324
0
        format_len = strlen(format);
5325
0
    }
5326
5327
0
    if (strncmp(format, "bam", format_len) == 0) {
5328
0
        *cp++ = 'b';
5329
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5330
0
        *cp++ = 'c';
5331
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5332
0
        *cp++ = 'c';
5333
0
        strcpy(cp, ",VERSION=2.1");
5334
0
        cp += 12;
5335
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5336
0
        *cp++ = 'c';
5337
0
        strcpy(cp, ",VERSION=3.0");
5338
0
        cp += 12;
5339
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5340
0
        ; // format mode=""
5341
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5342
0
        *cp++ = 'z';
5343
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5344
0
               strncmp(format, "fq", format_len) == 0) {
5345
0
        *cp++ = 'f';
5346
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5347
0
               strncmp(format, "fq.gz", format_len) == 0) {
5348
0
        *cp++ = 'f';
5349
0
        *cp++ = 'z';
5350
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5351
0
               strncmp(format, "fa", format_len) == 0) {
5352
0
        *cp++ = 'F';
5353
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5354
0
               strncmp(format, "fa", format_len) == 0) {
5355
0
        *cp++ = 'F';
5356
0
        *cp++ = 'z';
5357
0
    } else {
5358
0
        free(mode_opts);
5359
0
        return NULL;
5360
0
    }
5361
5362
0
    strcpy(cp, opts);
5363
5364
0
    return mode_opts;
5365
0
}
5366
5367
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5368
int bam_str2flag(const char *str)
5369
0
{
5370
0
    char *end, *beg = (char*) str;
5371
0
    long int flag = strtol(str, &end, 0);
5372
0
    if ( end!=str ) return flag;    // the conversion was successful
5373
0
    flag = 0;
5374
0
    while ( *str )
5375
0
    {
5376
0
        end = beg;
5377
0
        while ( *end && *end!=',' ) end++;
5378
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5379
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5380
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5381
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5382
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5383
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5384
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5385
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5386
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5387
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5388
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5389
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5390
0
        else return -1;
5391
0
        if ( !*end ) break;
5392
0
        beg = end + 1;
5393
0
    }
5394
0
    return flag;
5395
0
}
5396
5397
char *bam_flag2str(int flag)
5398
0
{
5399
0
    kstring_t str = {0,0,0};
5400
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5401
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5402
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5403
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5404
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5405
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5406
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5407
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5408
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5409
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5410
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5411
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5412
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5413
0
    return str.s;
5414
0
}
5415
5416
5417
/**************************
5418
 *** Pileup and Mpileup ***
5419
 **************************/
5420
5421
#if !defined(BAM_NO_PILEUP)
5422
5423
#include <assert.h>
5424
5425
/*******************
5426
 *** Memory pool ***
5427
 *******************/
5428
5429
typedef struct {
5430
    int k, y;
5431
    hts_pos_t x, end;
5432
} cstate_t;
5433
5434
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5435
5436
typedef struct __linkbuf_t {
5437
    bam1_t b;
5438
    hts_pos_t beg, end;
5439
    cstate_t s;
5440
    struct __linkbuf_t *next;
5441
    bam_pileup_cd cd;
5442
} lbnode_t;
5443
5444
typedef struct {
5445
    int cnt, n, max;
5446
    lbnode_t **buf;
5447
} mempool_t;
5448
5449
static mempool_t *mp_init(void)
5450
0
{
5451
0
    mempool_t *mp;
5452
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5453
0
    return mp;
5454
0
}
5455
// Free every spare node held by the pool, along with its record data, and
// then the pool itself.  Nodes still handed out are not touched.
static void mp_destroy(mempool_t *mp)
{
    int i = 0;

    while (i < mp->n) {
        lbnode_t *node = mp->buf[i++];
        free(node->b.data);
        free(node);
    }
    free(mp->buf);
    free(mp);
}
5465
// Hand out a node: the most recently stored spare one if the pool has any,
// otherwise a freshly zeroed allocation (NULL if calloc() fails).  The
// outstanding count is incremented in either case.
static inline lbnode_t *mp_alloc(mempool_t *mp)
{
    mp->cnt++;

    if (mp->n != 0) {
        mp->n--;
        return mp->buf[mp->n];
    }
    return (lbnode_t *) calloc(1, sizeof(lbnode_t));
}
5471
// Give a node back to the pool for later reuse.
//
// The node's 'next' link is cleared and the outstanding count decremented.
// The spare list doubles in size (starting at 256 slots) when full.  If it
// cannot be enlarged the node and its record data are released instead of
// being stored, so nothing is leaked and the pool stays consistent.  The
// caller must not use 'p' after this call.
static inline void mp_free(mempool_t *mp, lbnode_t *p)
{
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
    if (mp->n == mp->max) {
        int new_max = mp->max? mp->max<<1 : 256;
        // Keep the old buffer until the new one is known to be valid
        lbnode_t **new_buf = realloc(mp->buf, sizeof(lbnode_t*) * new_max);
        if (new_buf == NULL) {
            free(p->b.data);
            free(p);
            return;
        }
        mp->buf = new_buf;
        mp->max = new_max;
    }
    mp->buf[mp->n++] = p;
}
5480
5481
/**********************
5482
 *** CIGAR resolver ***
5483
 **********************/
5484
5485
/* s->k: the index of the CIGAR operator that has just been processed.
5486
   s->x: the reference coordinate of the start of s->k
5487
   s->y: the query coordinate of the start of s->k
5488
 */
5489
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5490
0
{
5491
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5492
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5493
5494
0
    bam1_t *b = p->b;
5495
0
    bam1_core_t *c = &b->core;
5496
0
    uint32_t *cigar = bam_get_cigar(b);
5497
0
    int k;
5498
    // determine the current CIGAR operation
5499
    //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5500
0
    if (s->k == -1) { // never processed
5501
0
        p->qpos = 0;
5502
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5503
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5504
0
        } else { // find the first match or deletion
5505
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5506
0
                int op = _cop(cigar[k]);
5507
0
                int l = _cln(cigar[k]);
5508
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5509
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5510
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5511
0
            }
5512
0
            assert(k < c->n_cigar);
5513
0
            s->k = k;
5514
0
        }
5515
0
    } else { // the read has been processed before
5516
0
        int op, l = _cln(cigar[s->k]);
5517
0
        if (pos - s->x >= l) { // jump to the next operation
5518
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5519
0
            op = _cop(cigar[s->k+1]);
5520
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5521
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5522
0
                s->x += l;
5523
0
                ++s->k;
5524
0
            } else { // find the next M/D/N/=/X
5525
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5526
0
                s->x += l;
5527
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5528
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5529
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5530
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5531
0
                }
5532
0
                s->k = k;
5533
0
            }
5534
0
            assert(s->k < c->n_cigar); // otherwise a bug
5535
0
        } // else, do nothing
5536
0
    }
5537
0
    { // collect pileup information
5538
0
        int op, l;
5539
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5540
0
        p->is_del = p->indel = p->is_refskip = 0;
5541
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5542
0
            int op2 = _cop(cigar[s->k+1]);
5543
0
            int l2 = _cln(cigar[s->k+1]);
5544
0
            if (op2 == BAM_CDEL && op != BAM_CDEL) {
5545
                // At start of a new deletion, merge e.g. 1D2D to 3D.
5546
                // Within a deletion (the 2D in 1D2D) we keep p->indel=0
5547
                // and rely on is_del=1 as we would for 3D.
5548
0
                p->indel = -(int)l2;
5549
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5550
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5551
0
                    if (op2 == BAM_CDEL) p->indel -= l2;
5552
0
                    else break;
5553
0
                }
5554
0
            } else if (op2 == BAM_CINS) {
5555
0
                p->indel = l2;
5556
0
                for (k = s->k+2; k < c->n_cigar; ++k) {
5557
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5558
0
                    if (op2 == BAM_CINS) p->indel += l2;
5559
0
                    else if (op2 != BAM_CPAD) break;
5560
0
                }
5561
0
            } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) {
5562
0
                int l3 = 0;
5563
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5564
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5565
0
                    if (op2 == BAM_CINS) l3 += l2;
5566
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5567
0
                }
5568
0
                if (l3 > 0) p->indel = l3;
5569
0
            }
5570
0
        }
5571
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5572
0
            p->qpos = s->y + (pos - s->x);
5573
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5574
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5575
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5576
0
        } // cannot be other operations; otherwise a bug
5577
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5578
0
    }
5579
0
    p->cigar_ind = s->k;
5580
0
    return 1;
5581
0
}
5582
5583
/*******************************
5584
 *** Expansion of insertions ***
5585
 *******************************/
5586
5587
/*
5588
 * Fills out the kstring with the padded insertion sequence for the current
5589
 * location in 'p'.  If this is not an insertion site, the string is blank.
5590
 *
5591
 * This variant handles base modifications, but only when "m" is non-NULL.
5592
 *
5593
 * Returns the number of inserted base on success, with string length being
5594
 *        accessable via ins->l;
5595
 *        -1 on failure.
5596
 */
5597
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5598
                          hts_base_mod_state *m,
5599
0
                          kstring_t *ins, int *del_len) {
5600
0
    int j, k, indel, nb = 0;
5601
0
    uint32_t *cigar;
5602
5603
0
    if (p->indel <= 0) {
5604
0
        if (ks_resize(ins, 1) < 0)
5605
0
            return -1;
5606
0
        ins->l = 0;
5607
0
        ins->s[0] = '\0';
5608
0
        return 0;
5609
0
    }
5610
5611
0
    if (del_len)
5612
0
        *del_len = 0;
5613
5614
    // Measure indel length including pads
5615
0
    indel = 0;
5616
0
    k = p->cigar_ind+1;
5617
0
    cigar = bam_get_cigar(p->b);
5618
0
    while (k < p->b->core.n_cigar) {
5619
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5620
0
        case BAM_CPAD:
5621
0
        case BAM_CINS:
5622
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5623
0
            break;
5624
0
        default:
5625
0
            k = p->b->core.n_cigar;
5626
0
            break;
5627
0
        }
5628
0
        k++;
5629
0
    }
5630
0
    nb = ins->l = indel;
5631
5632
    // Produce sequence
5633
0
    if (ks_resize(ins, indel+1) < 0)
5634
0
        return -1;
5635
0
    indel = 0;
5636
0
    k = p->cigar_ind+1;
5637
0
    j = 1;
5638
0
    while (k < p->b->core.n_cigar) {
5639
0
        int l, c;
5640
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5641
0
        case BAM_CPAD:
5642
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5643
0
                ins->s[indel++] = '*';
5644
0
            break;
5645
0
        case BAM_CINS:
5646
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5647
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5648
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5649
0
                                            p->qpos + j - p->is_del)]
5650
0
                    : 'N';
5651
0
                ins->s[indel++] = c;
5652
0
                int nm;
5653
0
                hts_base_mod mod[256];
5654
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5655
0
                                                m, mod, 256)) > 0) {
5656
0
                    int o_indel = indel;
5657
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5658
0
                        return -1;
5659
0
                    ins->s[indel++] = '[';
5660
0
                    int j;
5661
0
                    for (j = 0; j < nm; j++) {
5662
0
                        char qual[20];
5663
0
                        if (mod[j].qual >= 0)
5664
0
                            snprintf(qual, sizeof(qual), "%d", mod[j].qual);
5665
0
                        else
5666
0
                            *qual=0;
5667
0
                        if (mod[j].modified_base < 0)
5668
                            // ChEBI
5669
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5670
0
                                              "%c(%d)%s",
5671
0
                                              "+-"[mod[j].strand],
5672
0
                                              -mod[j].modified_base,
5673
0
                                              qual);
5674
0
                        else
5675
0
                            indel += snprintf(&ins->s[indel], ins->m - indel,
5676
0
                                              "%c%c%s",
5677
0
                                              "+-"[mod[j].strand],
5678
0
                                              mod[j].modified_base,
5679
0
                                              qual);
5680
0
                    }
5681
0
                    ins->s[indel++] = ']';
5682
0
                    ins->l += indel - o_indel; // grow by amount we used
5683
0
                }
5684
0
            }
5685
0
            break;
5686
0
        case BAM_CDEL:
5687
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5688
0
            if (del_len)
5689
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5690
            // fall through
5691
0
        default:
5692
0
            k = p->b->core.n_cigar;
5693
0
            break;
5694
0
        }
5695
0
        k++;
5696
0
    }
5697
0
    ins->s[indel] = '\0';
5698
0
    ins->l = indel; // string length
5699
5700
0
    return nb;      // base length
5701
0
}
5702
5703
/*
5704
 * Fills out the kstring with the padded insertion sequence for the current
5705
 * location in 'p'.  If this is not an insertion site, the string is blank.
5706
 *
5707
 * This is the original interface with no capability for reporting base
5708
 * modifications.
5709
 *
5710
 * Returns the length of insertion string on success;
5711
 *        -1 on failure.
5712
 */
5713
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5714
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5715
0
}
5716
5717
/***********************
5718
 *** Pileup iterator ***
5719
 ***********************/
5720
5721
// Dictionary of overlapping reads
5722
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5723
typedef khash_t(olap_hash) olap_hash_t;
5724
5725
struct bam_plp_s {
5726
    mempool_t *mp;
5727
    lbnode_t *head, *tail;
5728
    int32_t tid, max_tid;
5729
    hts_pos_t pos, max_pos;
5730
    int is_eof, max_plp, error, maxcnt;
5731
    uint64_t id;
5732
    bam_pileup1_t *plp;
5733
    // for the "auto" interface only
5734
    bam1_t *b;
5735
    bam_plp_auto_f func;
5736
    void *data;
5737
    olap_hash_t *overlaps;
5738
5739
    // For notification of creation and destruction events
5740
    // and associated client-owned pointer.
5741
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5742
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5743
};
5744
5745
/*
 * Creates a pileup iterator.
 *
 * func  Optional callback used by the "auto" interface to fetch records;
 *       when given, a scratch record is allocated for it.
 * data  Client pointer stored for later use by the callbacks.
 *
 * Returns the new iterator, or NULL if any allocation fails (in which case
 * everything allocated so far is released).
 */
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
{
    bam_plp_t iter = calloc(1, sizeof(*iter));
    if (!iter) return NULL;

    iter->mp = mp_init();
    if (!iter->mp) goto fail;

    // The list always ends in one spare node that receives the next read
    iter->head = iter->tail = mp_alloc(iter->mp);
    if (!iter->head) goto fail;

    iter->max_tid = iter->max_pos = -1;
    iter->maxcnt = 8000;
    if (func) {
        iter->func = func;
        iter->data = data;
        iter->b = bam_init1();
        if (!iter->b) goto fail;
    }
    return iter;

 fail:
    // Hand the spare node back to the pool so mp_destroy() releases it
    if (iter->head) mp_free(iter->mp, iter->head);
    if (iter->mp) mp_destroy(iter->mp);
    free(iter);
    return NULL;
}
5760
5761
/*
 * Enables overlap detection on 'iter' by creating the read-name hash used
 * for tweaking quality of bases in overlapping reads.
 *
 * Returns 0 on success, -1 if the hash could not be created.
 */
int bam_plp_init_overlaps(bam_plp_t iter)
{
    iter->overlaps = kh_init(olap_hash);
    if (!iter->overlaps)
        return -1;
    return 0;
}
5766
5767
/*
 * Releases a pileup iterator and everything it owns.  Passing NULL is a
 * no-op.
 *
 * The registered destructor callback, if any, is invoked for every buffered
 * read except the spare tail node, before the nodes go back to the pool.
 */
void bam_plp_destroy(bam_plp_t iter)
{
    lbnode_t *p, *pnext;
    if (!iter) return;
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
    for (p = iter->head; p != NULL; p = pnext) {
        if (iter->plp_destruct && p != iter->tail)
            iter->plp_destruct(iter->data, &p->b, &p->cd);
        // Fetch the link first: the node is handed back to the pool below
        pnext = p->next;
        mp_free(iter->mp, p);
    }
    mp_destroy(iter->mp);
    if (iter->b) bam_destroy1(iter->b);
    free(iter->plp);
    free(iter);
}
5782
5783
void bam_plp_constructor(bam_plp_t plp,
5784
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5785
0
    plp->plp_construct = func;
5786
0
}
5787
5788
void bam_plp_destructor(bam_plp_t plp,
5789
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5790
0
    plp->plp_destruct = func;
5791
0
}
5792
5793
//---------------------------------
5794
//---  Tweak overlapping reads
5795
//---------------------------------
5796
5797
/**
5798
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5799
 *  cigar_iref2iseq_next() - get the next CMATCH base
5800
 *  @cigar:       pointer to current cigar block (rw)
5801
 *  @cigar_max:   pointer just beyond the last cigar block
5802
 *  @icig:        position within the current cigar block (rw)
5803
 *  @iseq:        position in the sequence (rw)
5804
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5805
 *
5806
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5807
 *  or -2 on error.
5808
 */
5809
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5810
                                      const uint32_t *cigar_max,
5811
                                      hts_pos_t *icig,
5812
                                      hts_pos_t *iseq,
5813
                                      hts_pos_t *iref)
5814
0
{
5815
0
    hts_pos_t pos = *iref;
5816
0
    if ( pos < 0 ) return -1;
5817
0
    *icig = 0;
5818
0
    *iseq = 0;
5819
0
    *iref = 0;
5820
0
    while ( *cigar<cigar_max )
5821
0
    {
5822
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5823
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5824
5825
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5826
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5827
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5828
0
        {
5829
0
            pos -= ncig;
5830
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5831
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5832
0
            continue;
5833
0
        }
5834
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5835
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5836
0
        {
5837
0
            pos -= ncig;
5838
0
            if ( pos<0 ) pos = 0;
5839
0
            (*cigar)++; *icig = 0; *iref += ncig;
5840
0
            continue;
5841
0
        }
5842
0
        hts_log_error("Unexpected cigar %d", cig);
5843
0
        return -2;
5844
0
    }
5845
0
    *iseq = -1;
5846
0
    return -1;
5847
0
}
5848
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5849
                                       const uint32_t *cigar_max,
5850
                                       hts_pos_t *icig,
5851
                                       hts_pos_t *iseq,
5852
                                       hts_pos_t *iref)
5853
0
{
5854
0
    while ( *cigar < cigar_max )
5855
0
    {
5856
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5857
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5858
5859
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5860
0
        {
5861
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5862
0
            (*iseq)++; (*icig)++; (*iref)++;
5863
0
            return BAM_CMATCH;
5864
0
        }
5865
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5866
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5867
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5868
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5869
0
        hts_log_error("Unexpected cigar %d", cig);
5870
0
        return -2;
5871
0
    }
5872
0
    *iseq = -1;
5873
0
    *iref = -1;
5874
0
    return -1;
5875
0
}
5876
5877
// Given overlapping read 'a' (left) and 'b' (right) on the same
5878
// template, adjust quality values to zero for either a or b.
5879
// Note versions 1.12 and earlier always removed quality from 'b' for
5880
// matching bases.  Now we select a or b semi-randomly based on name hash.
5881
// Returns 0 on success,
5882
//        -1 on failure
5883
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5884
0
{
5885
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5886
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5887
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5888
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5889
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5890
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5891
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5892
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5893
5894
0
    hts_pos_t iref   = b->core.pos;
5895
0
    hts_pos_t a_iref = iref - a->core.pos;
5896
0
    hts_pos_t b_iref = iref - b->core.pos;
5897
5898
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5899
0
                                    &a_icig, &a_iseq, &a_iref);
5900
0
    if ( a_ret<0 )
5901
        // no overlap or error
5902
0
        return a_ret<-1 ? -1:0;
5903
5904
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5905
0
                                    &b_icig, &b_iseq, &b_iref);
5906
0
    if ( b_ret<0 )
5907
        // no overlap or error
5908
0
        return b_ret<-1 ? -1:0;
5909
5910
    // Determine which seq is the one getting modified qualities.
5911
0
    uint8_t amul, bmul;
5912
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5913
0
        amul = 1;
5914
0
        bmul = 0;
5915
0
    } else {
5916
0
        amul = 0;
5917
0
        bmul = 1;
5918
0
    }
5919
5920
    // Loop over the overlapping region nulling qualities in either
5921
    // seq a or b.
5922
0
    int err = 0;
5923
0
    while ( 1 ) {
5924
        // Step to next matching reference position in a and b
5925
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5926
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5927
0
                                         &a_icig, &a_iseq, &a_iref);
5928
0
        if ( a_ret<0 ) { // done
5929
0
            err = a_ret<-1?-1:0;
5930
0
            break;
5931
0
        }
5932
5933
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5934
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5935
0
                                         &b_iseq, &b_iref);
5936
0
        if ( b_ret<0 ) { // done
5937
0
            err = b_ret<-1?-1:0;
5938
0
            break;
5939
0
        }
5940
5941
0
        if ( iref < a_iref + a->core.pos )
5942
0
            iref = a_iref + a->core.pos;
5943
5944
0
        if ( iref < b_iref + b->core.pos )
5945
0
            iref = b_iref + b->core.pos;
5946
5947
0
        iref++;
5948
5949
        // If A or B has a deletion then we catch up the other to this point.
5950
        // We also amend quality values using the same rules for mismatch.
5951
0
        if (a_iref+a->core.pos != b_iref+b->core.pos) {
5952
0
            if (a_iref+a->core.pos < b_iref+b->core.pos
5953
0
                && b_cigar > bam_get_cigar(b)
5954
0
                && bam_cigar_op(b_cigar[-1]) == BAM_CDEL) {
5955
                // Del in B means it's moved on further than A
5956
0
                do {
5957
0
                    a_qual[a_iseq] = amul
5958
0
                        ? a_qual[a_iseq]*0.8
5959
0
                        : 0;
5960
0
                    a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5961
0
                                                 &a_icig, &a_iseq, &a_iref);
5962
0
                    if (a_ret < 0)
5963
0
                        return -(a_ret<-1); // 0 or -1
5964
0
                } while (a_iref + a->core.pos < b_iref+b->core.pos);
5965
0
            } else if (a_cigar > bam_get_cigar(a)
5966
0
                       && bam_cigar_op(a_cigar[-1]) == BAM_CDEL) {
5967
                // Del in A means it's moved on further than B
5968
0
                do {
5969
0
                    b_qual[b_iseq] = bmul
5970
0
                        ? b_qual[b_iseq]*0.8
5971
0
                        : 0;
5972
0
                    b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max,
5973
0
                                                 &b_icig, &b_iseq, &b_iref);
5974
0
                    if (b_ret < 0)
5975
0
                        return -(b_ret<-1); // 0 or -1
5976
0
                } while (b_iref + b->core.pos < a_iref+a->core.pos);
5977
0
            } else {
5978
                // Anything else, eg ref-skip, we don't support here
5979
0
                continue;
5980
0
            }
5981
0
        }
5982
5983
        // fprintf(stderr, "a_cig=%ld,%ld b_cig=%ld,%ld iref=%ld "
5984
        //         "a_iref=%ld b_iref=%ld a_iseq=%ld b_iseq=%ld\n",
5985
        //         a_cigar-bam_get_cigar(a), a_icig,
5986
        //         b_cigar-bam_get_cigar(b), b_icig,
5987
        //         iref, a_iref+a->core.pos+1, b_iref+b->core.pos+1,
5988
        //         a_iseq, b_iseq);
5989
5990
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5991
            // Fell off end of sequence, bad CIGAR?
5992
0
            return -1;
5993
5994
        // We're finally at the same ref base in both a and b.
5995
        // Check if the bases match (confident) or mismatch
5996
        // (not so confident).
5997
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5998
            // We are very confident about this base.  Use sum of quals
5999
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
6000
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
6001
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
6002
0
        } else {
6003
            // Not so confident about anymore given the mismatch.
6004
            // Reduce qual for lowest quality base.
6005
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
6006
                // A highest qual base; keep
6007
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
6008
0
                b_qual[b_iseq] = 0;
6009
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
6010
                // B highest qual base; keep
6011
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
6012
0
                a_qual[a_iseq] = 0;
6013
0
            } else {
6014
                // Both equal, so pick randomly
6015
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
6016
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
6017
0
            }
6018
0
        }
6019
0
    }
6020
6021
0
    return err;
6022
0
}
6023
6024
// Fix overlapping reads. Simple soft-clipping did not give good results.
6025
// Lowering qualities of unwanted bases is more selective and works better.
6026
//
6027
// Returns 0 on success, -1 on failure
6028
static int overlap_push(bam_plp_t iter, lbnode_t *node)
6029
0
{
6030
0
    if ( !iter->overlaps ) return 0;
6031
6032
    // mapped mates and paired reads only
6033
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
6034
6035
    // no overlap possible, unless some wild cigar
6036
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
6037
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
6038
0
         && node->b.core.mpos >= node->end) // for those wild cigars
6039
0
       ) return 0;
6040
6041
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
6042
0
    if ( kitr==kh_end(iter->overlaps) )
6043
0
    {
6044
        // Only add reads where the mate is still to arrive
6045
0
        if (node->b.core.mpos >= node->b.core.pos ||
6046
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
6047
0
            int ret;
6048
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
6049
0
            if (ret < 0) return -1;
6050
0
            kh_value(iter->overlaps, kitr) = node;
6051
0
        }
6052
0
    }
6053
0
    else
6054
0
    {
6055
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
6056
0
        int err = tweak_overlap_quality(&a->b, &node->b);
6057
0
        kh_del(olap_hash, iter->overlaps, kitr);
6058
0
        assert(a->end-1 == a->s.end);
6059
0
        return err;
6060
0
    }
6061
0
    return 0;
6062
0
}
6063
6064
// Drops entries from the overlap hash: the one named after 'b', or every
// entry when 'b' is NULL.  Does nothing if overlap detection is off.
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
{
    if (iter->overlaps == NULL)
        return;

    if (b == NULL) {
        // remove all
        khiter_t k = kh_begin(iter->overlaps);
        while (k < kh_end(iter->overlaps)) {
            if (kh_exist(iter->overlaps, k))
                kh_del(olap_hash, iter->overlaps, k);
            k++;
        }
        return;
    }

    khiter_t hit = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
    if (hit != kh_end(iter->overlaps))
        kh_del(olap_hash, iter->overlaps, hit);
}
6082
6083
6084
6085
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
6086
// pointer to the piled records if next position is ready or NULL if there is not enough records in the
6087
// buffer yet (the current position is still the maximum position across all buffered reads).
6088
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
6089
0
{
6090
0
    if (iter->error) { *_n_plp = -1; return NULL; }
6091
0
    *_n_plp = 0;
6092
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
6093
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
6094
0
        int n_plp = 0;
6095
        // write iter->plp at iter->pos
6096
0
        lbnode_t **pptr = &iter->head;
6097
0
        while (*pptr != iter->tail) {
6098
0
            lbnode_t *p = *pptr;
6099
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
6100
0
                overlap_remove(iter, &p->b);
6101
0
                if (iter->plp_destruct)
6102
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
6103
0
                *pptr = p->next; mp_free(iter->mp, p);
6104
0
            }
6105
0
            else {
6106
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
6107
0
                    if (n_plp == iter->max_plp) { // then double the capacity
6108
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
6109
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
6110
0
                    }
6111
0
                    iter->plp[n_plp].b = &p->b;
6112
0
                    iter->plp[n_plp].cd = p->cd;
6113
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
6114
0
                }
6115
0
                pptr = &(*pptr)->next;
6116
0
            }
6117
0
        }
6118
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
6119
        // update iter->tid and iter->pos
6120
0
        if (iter->head != iter->tail) {
6121
0
            if (iter->tid > iter->head->b.core.tid) {
6122
0
                hts_log_error("Unsorted input. Pileup aborts");
6123
0
                iter->error = 1;
6124
0
                *_n_plp = -1;
6125
0
                return NULL;
6126
0
            }
6127
0
        }
6128
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
6129
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
6130
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
6131
0
            iter->pos = iter->head->beg; // jump to the next position
6132
0
        } else ++iter->pos; // scan contiguously
6133
        // return
6134
0
        if (n_plp) return iter->plp;
6135
0
        if (iter->is_eof && iter->head == iter->tail) break;
6136
0
    }
6137
0
    return NULL;
6138
0
}
6139
6140
// Variant of bam_plp64_next() reporting the position through an int.
// A position that does not fit is treated as an error: *_pos is set to
// INT_MAX, *_n_plp to -1, the iterator is marked as failed and NULL is
// returned.
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
    hts_pos_t pos64 = 0;
    const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp);

    if (pos64 >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", pos64);
        *_pos = INT_MAX;
        iter->error = 1;
        *_n_plp = -1;
        return NULL;
    }

    *_pos = pos64;
    return p;
}
6155
6156
/*
 * Adds record 'b' to the pileup buffer, or marks end of input when 'b' is
 * NULL.
 *
 * Records with no reference, unmapped records, and records beyond the
 * maxcnt depth cap at the current position are ignored (returning 0).
 * The record is copied into the spare tail node; if it can still contribute
 * to a future pileup column a new spare is appended, otherwise the tail is
 * simply reused by the next call.
 *
 * Returns 0 on success, -1 on failure.  Failures other than the initial
 * record copy also set iter->error, making later calls fail.
 */
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
    if (iter->error) return -1;
    if (!b) {
        iter->is_eof = 1;
        return 0;
    }

    if (b->core.tid < 0) { overlap_remove(iter, b); return 0; }
    // Skip only unmapped reads here, any additional filtering must be done in iter->func
    if (b->core.flag & BAM_FUNMAP) { overlap_remove(iter, b); return 0; }
    if (iter->tid == b->core.tid && iter->pos == b->core.pos && iter->mp->cnt > iter->maxcnt)
    {
        overlap_remove(iter, b);
        return 0;
    }
    if (bam_copy1(&iter->tail->b, b) == NULL)
        return -1;
    iter->tail->b.id = iter->id++;
    iter->tail->beg = b->core.pos;
    // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
    iter->tail->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
    iter->tail->s = g_cstate_null; iter->tail->s.end = iter->tail->end - 1; // initialize cstate_t
    if (b->core.tid < iter->max_tid) {
        hts_log_error("The input is not sorted (chromosomes out of order)");
        iter->error = 1;
        return -1;
    }
    if ((b->core.tid == iter->max_tid) && (iter->tail->beg < iter->max_pos)) {
        hts_log_error("The input is not sorted (reads out of order)");
        iter->error = 1;
        return -1;
    }
    iter->max_tid = b->core.tid; iter->max_pos = iter->tail->beg;
    if (iter->tail->end > iter->pos || iter->tail->b.core.tid > iter->tid) {
        lbnode_t *next = mp_alloc(iter->mp);
        if (!next) {
            iter->error = 1;
            return -1;
        }
        if (iter->plp_construct) {
            if (iter->plp_construct(iter->data, &iter->tail->b,
                                    &iter->tail->cd) < 0) {
                mp_free(iter->mp, next);
                iter->error = 1;
                return -1;
            }
        }
        if (overlap_push(iter, iter->tail) < 0) {
            // The node stays behind as the spare tail, which is never
            // passed to the destructor later on, so undo the constructor
            // here to avoid leaking whatever it attached.
            if (iter->plp_construct && iter->plp_destruct)
                iter->plp_destruct(iter->data, &iter->tail->b,
                                   &iter->tail->cd);
            mp_free(iter->mp, next);
            iter->error = 1;
            return -1;
        }
        iter->tail->next = next;
        iter->tail = iter->tail->next;
    }
    return 0;
}
6211
6212
// Returns the next pileup column, pulling records through iter->func and
// pushing them into the buffer until one is ready.  Returns NULL with
// *_n_plp == 0 at the end of the data, or NULL with *_n_plp == -1 on
// error (including when no callback was given to bam_plp_init()).
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
{
    const bam_pileup1_t *plp;
    int ret;

    if (iter->func == 0 || iter->error) {
        *_n_plp = -1;
        return 0;
    }

    // A column may already be available from previously buffered reads
    plp = bam_plp64_next(iter, _tid, _pos, _n_plp);
    if (plp != 0)
        return plp;

    // no pileup line can be obtained; read alignments
    *_n_plp = 0;
    if (iter->is_eof)
        return 0;

    for (;;) {
        ret = iter->func(iter->data, iter->b);
        if (ret < 0)
            break;

        if (bam_plp_push(iter, iter->b) < 0) {
            *_n_plp = -1;
            return 0;
        }

        plp = bam_plp64_next(iter, _tid, _pos, _n_plp);
        if (plp != 0)
            return plp;
        // otherwise no pileup line can be returned; read the next alignment.
    }

    // -1 means the callback ran out of data; anything lower is an error
    if (ret < -1) {
        iter->error = ret;
        *_n_plp = -1;
        return 0;
    }

    // Signal end of input, then flush whatever is still buffered
    if (bam_plp_push(iter, 0) < 0) {
        *_n_plp = -1;
        return 0;
    }
    return bam_plp64_next(iter, _tid, _pos, _n_plp);
}
6238
6239
// Variant of bam_plp64_auto() reporting the position through an int.
// A position that does not fit is treated as an error: *_pos is set to
// INT_MAX, *_n_plp to -1, the iterator is marked as failed and NULL is
// returned.
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
    hts_pos_t pos64 = 0;
    const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp);

    if (pos64 >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", pos64);
        *_pos = INT_MAX;
        iter->error = 1;
        *_n_plp = -1;
        return NULL;
    }

    *_pos = pos64;
    return p;
}
6254
6255
/*
 * Returns the iterator to its initial position so it can be reused:
 * forgets all remembered overlapping reads, rewinds the position tracking,
 * clears the end-of-input flag and discards every buffered read.
 *
 * As on the other paths that drop buffered reads, the registered
 * destructor callback (if any) is run on each discarded read so that
 * client data attached to it is not leaked.
 */
void bam_plp_reset(bam_plp_t iter)
{
    overlap_remove(iter, NULL);
    iter->max_tid = iter->max_pos = -1;
    iter->tid = iter->pos = 0;
    iter->is_eof = 0;
    while (iter->head != iter->tail) {
        lbnode_t *p = iter->head;
        if (iter->plp_destruct)
            iter->plp_destruct(iter->data, &p->b, &p->cd);
        iter->head = p->next;
        mp_free(iter->mp, p);
    }
}
6267
6268
// Sets the limit that bam_plp_push() compares the number of buffered reads
// against before accepting another read starting at the current position.
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
{
    const int limit = maxcnt;
    iter->maxcnt = limit;
}
6272
6273
/************************
6274
 *** Mpileup iterator ***
6275
 ************************/
6276
6277
struct bam_mplp_s {
6278
    int n;
6279
    int32_t min_tid, *tid;
6280
    hts_pos_t min_pos, *pos;
6281
    bam_plp_t *iter;
6282
    int *n_plp;
6283
    const bam_pileup1_t **plp;
6284
};
6285
6286
/*
 * Creates a multi-file pileup iterator over n inputs.
 *
 * n     Number of inputs.
 * func  Callback used to fetch records; shared by all inputs.
 * data  Array of n client pointers, one handed to each input's iterator.
 *
 * Returns the new iterator, or NULL if memory could not be allocated or a
 * per-input iterator could not be created; nothing is leaked in that case.
 */
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
{
    int i;
    bam_mplp_t iter = calloc(1, sizeof(*iter));
    if (!iter) return NULL;

    iter->pos = calloc(n, sizeof(*iter->pos));
    iter->tid = calloc(n, sizeof(*iter->tid));
    iter->n_plp = calloc(n, sizeof(*iter->n_plp));
    iter->plp = calloc(n, sizeof(*iter->plp));
    iter->iter = calloc(n, sizeof(*iter->iter));
    // calloc() may legitimately return NULL for a zero-sized request, so
    // only treat NULL as a failure when there is something to store.
    if (n > 0 && (!iter->pos || !iter->tid || !iter->n_plp
                  || !iter->plp || !iter->iter))
        goto fail;

    iter->n = n;
    iter->min_pos = HTS_POS_MAX;
    iter->min_tid = (uint32_t)-1;
    for (i = 0; i < n; ++i) {
        iter->iter[i] = bam_plp_init(func, data[i]);
        if (!iter->iter[i]) goto fail;
        iter->pos[i] = iter->min_pos;
        iter->tid[i] = iter->min_tid;
    }
    return iter;

 fail:
    // Slots not reached yet are still NULL from calloc()
    if (iter->iter) {
        for (i = 0; i < n; ++i)
            if (iter->iter[i]) bam_plp_destroy(iter->iter[i]);
    }
    free(iter->iter); free(iter->pos); free(iter->tid);
    free(iter->n_plp); free(iter->plp);
    free(iter);
    return NULL;
}
6306
6307
// Enables overlap detection on every underlying iterator.  All of them are
// attempted even if an earlier one fails.
// Returns 0 if every call succeeded, -1 otherwise.
int bam_mplp_init_overlaps(bam_mplp_t iter)
{
    int failed = 0;
    int k;

    for (k = 0; k < iter->n; ++k) {
        if (bam_plp_init_overlaps(iter->iter[k]) != 0)
            failed = 1;
    }

    return failed ? -1 : 0;
}
6314
6315
// Applies the same maxcnt limit to every underlying iterator.
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
{
    int k = 0;
    while (k < iter->n) {
        bam_plp_set_maxcnt(iter->iter[k], maxcnt);
        k++;
    }
}
6321
6322
/*
 * Releases a multi-file pileup iterator together with its per-input
 * iterators and bookkeeping arrays.  Passing NULL is a no-op.
 */
void bam_mplp_destroy(bam_mplp_t iter)
{
    int i;
    if (!iter) return;
    if (iter->iter) {
        for (i = 0; i < iter->n; ++i) {
            // Skip slots that never received an iterator
            if (iter->iter[i]) bam_plp_destroy(iter->iter[i]);
        }
    }
    free(iter->iter); free(iter->pos); free(iter->tid);
    free(iter->n_plp); free(iter->plp);
    free(iter);
}
6330
6331
// Moves on to the next pileup column across all inputs.
//
// Iterators whose stored position equals the column reported last time
// are advanced, then the smallest (tid, pos) among iterators holding a
// column is selected.  For that column, n_plp[i] and plp[i] are filled in
// for iterators sitting on it and zeroed for the rest.
//
// Returns the number of iterators at the reported column, 0 when none of
// them has a column left, or -1 if an underlying iterator is in error.
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
{
    hts_pos_t best_pos = HTS_POS_MAX;
    uint32_t best_tid = (uint32_t)-1;
    int n_here = 0;
    int k;

    for (k = 0; k < iter->n; ++k) {
        if (iter->pos[k] == iter->min_pos && iter->tid[k] == iter->min_tid) {
            int tid;
            hts_pos_t pos;
            iter->plp[k] = bam_plp64_auto(iter->iter[k], &tid, &pos, &iter->n_plp[k]);
            if (iter->iter[k]->error)
                return -1;
            if (iter->plp[k]) {
                iter->tid[k] = tid;
                iter->pos[k] = pos;
            } else {
                iter->tid[k] = 0;
                iter->pos[k] = 0;
            }
        }

        // Iterators without a column take no part in the minimum
        if (!iter->plp[k])
            continue;

        if (iter->tid[k] < best_tid) {
            best_tid = iter->tid[k];
            best_pos = iter->pos[k];
        } else if (iter->tid[k] == best_tid && iter->pos[k] < best_pos) {
            best_pos = iter->pos[k];
        }
    }

    iter->min_pos = best_pos;
    iter->min_tid = best_tid;
    if (best_pos == HTS_POS_MAX)
        return 0;

    *_tid = best_tid;
    *_pos = best_pos;

    for (k = 0; k < iter->n; ++k) {
        if (iter->pos[k] == iter->min_pos && iter->tid[k] == iter->min_tid) {
            n_plp[k] = iter->n_plp[k];
            plp[k] = iter->plp[k];
            ++n_here;
        } else {
            n_plp[k] = 0;
            plp[k] = 0;
        }
    }

    return n_here;
}
6371
6372
// Variant of bam_mplp64_auto() reporting the position through an int.
// Returns what bam_mplp64_auto() returned, or -1 (with *_pos set to
// INT_MAX) if the position does not fit.
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
{
    hts_pos_t pos64 = 0;
    int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp);

    // On failure the position output is left untouched
    if (ret < 0)
        return ret;

    if (pos64 >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", pos64);
        *_pos = INT_MAX;
        return -1;
    }

    *_pos = pos64;
    return ret;
}
6387
6388
// Resets every underlying iterator and restores the per-input bookkeeping
// to the values bam_mplp_init() starts from.
void bam_mplp_reset(bam_mplp_t iter)
{
    int k = 0;

    iter->min_tid = (uint32_t)-1;
    iter->min_pos = HTS_POS_MAX;

    while (k < iter->n) {
        bam_plp_reset(iter->iter[k]);
        iter->plp[k] = NULL;
        iter->n_plp[k] = 0;
        iter->tid[k] = (uint32_t)-1;
        iter->pos[k] = HTS_POS_MAX;
        k++;
    }
}
6401
6402
void bam_mplp_constructor(bam_mplp_t iter,
6403
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6404
0
    int i;
6405
0
    for (i = 0; i < iter->n; ++i)
6406
0
        bam_plp_constructor(iter->iter[i], func);
6407
0
}
6408
6409
void bam_mplp_destructor(bam_mplp_t iter,
6410
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6411
0
    int i;
6412
0
    for (i = 0; i < iter->n; ++i)
6413
0
        bam_plp_destructor(iter->iter[i], func);
6414
0
}
6415
6416
#endif // ~!defined(BAM_NO_PILEUP)