Coverage Report

Created: 2023-01-17 06:24

/src/htslib/sam.c
Line
Count
Source (jump to first uncovered line)
1
/*  sam.c -- SAM and BAM file I/O and manipulation.
2
3
    Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd.
4
    Copyright (C) 2010, 2012, 2013 Broad Institute.
5
6
    Author: Heng Li <lh3@sanger.ac.uk>
7
8
Permission is hereby granted, free of charge, to any person obtaining a copy
9
of this software and associated documentation files (the "Software"), to deal
10
in the Software without restriction, including without limitation the rights
11
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
copies of the Software, and to permit persons to whom the Software is
13
furnished to do so, subject to the following conditions:
14
15
The above copyright notice and this permission notice shall be included in
16
all copies or substantial portions of the Software.
17
18
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24
DEALINGS IN THE SOFTWARE.  */
25
26
#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h
27
#include <config.h>
28
29
#include <strings.h>
30
#include <stdio.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <errno.h>
34
#include <zlib.h>
35
#include <assert.h>
36
#include <signal.h>
37
#include <inttypes.h>
38
#include <unistd.h>
39
40
// Suppress deprecation message for cigar_tab, which we initialise
41
#include "htslib/hts_defs.h"
42
#undef HTS_DEPRECATED
43
#define HTS_DEPRECATED(message)
44
45
#include "htslib/sam.h"
46
#include "htslib/bgzf.h"
47
#include "cram/cram.h"
48
#include "hts_internal.h"
49
#include "sam_internal.h"
50
#include "htslib/hfile.h"
51
#include "htslib/hts_endian.h"
52
#include "htslib/hts_expr.h"
53
#include "header.h"
54
55
#include "htslib/khash.h"
56
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
57
KHASH_SET_INIT_INT(tag)
58
59
#ifndef EFTYPE
60
0
#define EFTYPE ENOEXEC
61
#endif
62
#ifndef EOVERFLOW
63
#define EOVERFLOW ERANGE
64
#endif
65
66
/**********************
67
 *** BAM header I/O ***
68
 **********************/
69
70
HTSLIB_EXPORT
71
const int8_t bam_cigar_table[256] = {
72
    // 0 .. 47
73
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
74
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
75
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
76
77
    // 48 .. 63  (including =)
78
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, BAM_CEQUAL, -1, -1,
79
80
    // 64 .. 79  (including MIDNHB)
81
    -1, -1, BAM_CBACK, -1,  BAM_CDEL, -1, -1, -1,
82
        BAM_CHARD_CLIP, BAM_CINS, -1, -1,  -1, BAM_CMATCH, BAM_CREF_SKIP, -1,
83
84
    // 80 .. 95  (including SPX)
85
    BAM_CPAD, -1, -1, BAM_CSOFT_CLIP,  -1, -1, -1, -1,
86
        BAM_CDIFF, -1, -1, -1,  -1, -1, -1, -1,
87
88
    // 96 .. 127
89
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
90
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
91
92
    // 128 .. 255
93
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
94
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
95
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
96
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
97
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
98
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
99
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,
100
    -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1,  -1, -1, -1, -1
101
};
102
103
sam_hdr_t *sam_hdr_init()
104
1.16k
{
105
1.16k
    sam_hdr_t *bh = (sam_hdr_t*)calloc(1, sizeof(sam_hdr_t));
106
1.16k
    if (bh == NULL) return NULL;
107
108
1.16k
    bh->cigar_tab = bam_cigar_table;
109
1.16k
    return bh;
110
1.16k
}
111
112
/*
 * Release a header.
 *
 * A NULL pointer is ignored.  If the header's ref_count is above zero the
 * count is decremented and nothing is freed; otherwise the target arrays,
 * the text, the parsed records (hrecs), the sdict hash and the structure
 * itself are all released.
 */
void sam_hdr_destroy(sam_hdr_t *bh)
{
    int32_t i;

    if (bh == NULL) return;

    if (bh->ref_count > 0) {
        --bh->ref_count;
        return;
    }

    if (bh->target_name) {
        for (i = 0; i < bh->n_targets; ++i)
            free(bh->target_name[i]);
        free(bh->target_name);
    }
    // Freed independently of target_name: a partially constructed header can
    // have target_len allocated while target_name is still NULL, and
    // free(NULL) is harmless.
    free(bh->target_len);
    free(bh->text);
    if (bh->hrecs)
        sam_hrecs_free(bh->hrecs);
    if (bh->sdict)
        kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
    free(bh);
}
136
137
// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long
138
// references before sam_hdr_t::hrecs is populated
139
int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h)
140
0
{
141
0
    const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict;
142
0
    khash_t(s2i) *dest_long_refs = kh_init(s2i);
143
0
    int i;
144
0
    if (!dest_long_refs) return -1;
145
146
0
    for (i = 0; i < h->n_targets; i++) {
147
0
        int ret;
148
0
        khiter_t ksrc, kdest;
149
0
        if (h->target_len[i] < UINT32_MAX) continue;
150
0
        ksrc = kh_get(s2i, src_long_refs, h->target_name[i]);
151
0
        if (ksrc == kh_end(src_long_refs)) continue;
152
0
        kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret);
153
0
        if (ret < 0) {
154
0
            kh_destroy(s2i, dest_long_refs);
155
0
            return -1;
156
0
        }
157
0
        kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc);
158
0
    }
159
160
0
    h->sdict = dest_long_refs;
161
0
    return 0;
162
0
}
163
164
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
165
245
{
166
245
    if (h0 == NULL) return NULL;
167
245
    sam_hdr_t *h;
168
245
    if ((h = sam_hdr_init()) == NULL) return NULL;
169
    // copy the simple data
170
245
    h->n_targets = 0;
171
245
    h->ignore_sam_err = h0->ignore_sam_err;
172
245
    h->l_text = 0;
173
174
    // Then the pointery stuff
175
176
245
    if (!h0->hrecs) {
177
0
        h->target_len = (uint32_t*)calloc(h0->n_targets, sizeof(uint32_t));
178
0
        if (!h->target_len) goto fail;
179
0
        h->target_name = (char**)calloc(h0->n_targets, sizeof(char*));
180
0
        if (!h->target_name) goto fail;
181
182
0
        int i;
183
0
        for (i = 0; i < h0->n_targets; ++i) {
184
0
            h->target_len[i] = h0->target_len[i];
185
0
            h->target_name[i] = strdup(h0->target_name[i]);
186
0
            if (!h->target_name[i]) break;
187
0
        }
188
0
        h->n_targets = i;
189
0
        if (i < h0->n_targets) goto fail;
190
191
0
        if (h0->sdict) {
192
0
            if (sam_hdr_dup_sdict(h0, h) < 0) goto fail;
193
0
        }
194
0
    }
195
196
245
    if (h0->hrecs) {
197
245
        kstring_t tmp = { 0, 0, NULL };
198
245
        if (sam_hrecs_rebuild_text(h0->hrecs, &tmp) != 0) {
199
0
            free(ks_release(&tmp));
200
0
            goto fail;
201
0
        }
202
203
245
        h->l_text = tmp.l;
204
245
        h->text   = ks_release(&tmp);
205
206
245
        if (sam_hdr_update_target_arrays(h, h0->hrecs, 0) != 0)
207
0
            goto fail;
208
245
    } else {
209
0
        h->l_text = h0->l_text;
210
0
        h->text = malloc(h->l_text + 1);
211
0
        if (!h->text) goto fail;
212
0
        memcpy(h->text, h0->text, h->l_text);
213
0
        h->text[h->l_text] = '\0';
214
0
    }
215
216
245
    return h;
217
218
0
 fail:
219
0
    sam_hdr_destroy(h);
220
0
    return NULL;
221
245
}
222
223
/*
 * Read a BAM header from fp.
 *
 * The stream must start with the magic "BAM\1", followed by the length and
 * bytes of the header text, the number of targets and, for each target, a
 * name length, the name and a 32-bit target length.  Names that arrive
 * without a terminating NUL get one appended.
 *
 * Returns a newly allocated header, or NULL (after logging the reason) on
 * a bad magic number, read error, truncation, invalid count or allocation
 * failure.
 */
sam_hdr_t *bam_hdr_read(BGZF *fp)
{
    sam_hdr_t *hdr = NULL;
    uint8_t word[4];
    int magic_got, eof_state;
    int32_t idx, len, allocated = 0;
    size_t text_alloc;
    ssize_t got;

    // check EOF
    eof_state = bgzf_check_EOF(fp);
    if (eof_state == 0) {
        hts_log_warning("EOF marker is absent. The input is probably truncated");
    } else if (eof_state < 0) {
        perror("[W::bam_hdr_read] bgzf_check_EOF");
    }

    // read "BAM1"
    magic_got = bgzf_read(fp, word, 4);
    if (magic_got != 4 || memcmp(word, "BAM\1", 4) != 0) {
        hts_log_error("Invalid BAM binary header");
        return NULL;
    }

    hdr = sam_hdr_init();
    if (hdr == NULL) goto nomem;

    // read plain text and the number of reference sequences
    got = bgzf_read(fp, word, 4);
    if (got != 4) goto read_err;
    hdr->l_text = le_to_u32(word);

    text_alloc = hdr->l_text + 1;
    if (text_alloc < hdr->l_text) goto nomem; // adding 1 wrapped around
    hdr->text = (char *) malloc(text_alloc);
    if (hdr->text == NULL) goto nomem;
    hdr->text[hdr->l_text] = 0; // terminate up front, then fill in the text
    got = bgzf_read(fp, hdr->text, hdr->l_text);
    if (got != hdr->l_text) goto read_err;

    got = bgzf_read(fp, &hdr->n_targets, 4);
    if (got != 4) goto read_err;
    if (fp->is_be) ed_swap_4p(&hdr->n_targets);

    if (hdr->n_targets < 0) goto invalid;

    // read reference sequence names and lengths
    if (hdr->n_targets == 0) {
        hdr->target_name = NULL;
        hdr->target_len = NULL;
    } else {
        hdr->target_name = (char **) calloc(hdr->n_targets, sizeof(char *));
        if (hdr->target_name == NULL) goto nomem;
        hdr->target_len = (uint32_t *) calloc(hdr->n_targets, sizeof(uint32_t));
        if (hdr->target_len == NULL) goto nomem;
    }

    for (idx = 0; idx != hdr->n_targets; ++idx) {
        char *name;

        got = bgzf_read(fp, &len, 4);
        if (got != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&len);
        if (len <= 0) goto invalid;

        name = (char *) malloc(len);
        hdr->target_name[idx] = name;
        if (name == NULL) goto nomem;
        allocated++;

        got = bgzf_read(fp, name, len);
        if (got != len) goto read_err;

        if (name[len - 1] != '\0') {
            // The name arrived unterminated: grow it by one byte and add
            // the NUL rather than rejecting the file.
            char *grown;
            if (len == INT32_MAX) goto invalid;
            grown = realloc(name, len + 1);
            if (grown == NULL) goto nomem;
            grown[len] = '\0';
            hdr->target_name[idx] = grown;
        }

        got = bgzf_read(fp, &hdr->target_len[idx], 4);
        if (got != 4) goto read_err;
        if (fp->is_be) ed_swap_4p(&hdr->target_len[idx]);
    }
    return hdr;

 nomem:
    hts_log_error("Out of memory");
    goto clean;

 read_err:
    if (got < 0) {
        hts_log_error("Error reading BGZF stream");
    } else {
        hts_log_error("Truncated BAM header");
    }
    goto clean;

 invalid:
    hts_log_error("Invalid BAM binary header");

 clean:
    if (hdr != NULL) {
        // Only the names allocated so far may be freed by the destructor
        hdr->n_targets = allocated;
        sam_hdr_destroy(hdr);
    }
    return NULL;
}
330
331
/*
 * Write header h to fp in BAM format and flush the stream.
 *
 * The text is rebuilt from h->hrecs when present, otherwise h->text is
 * written as is.  Text longer than UINT32_MAX is rejected; text longer
 * than INT32_MAX is written with a warning.  All 32-bit fields are
 * byte-swapped when fp->is_be is set.
 *
 * Returns 0 on success, -1 if h is NULL, the text cannot be rebuilt or is
 * too long, or a write or the final flush fails.
 */
int bam_hdr_write(BGZF *fp, const sam_hdr_t *h)
{
    int32_t i, name_len, x;
    kstring_t hdr_ks = { 0, 0, NULL };
    char *text;
    uint32_t l_text;
    int ret = -1;

    if (!h) return -1;

    if (h->hrecs) {
        // On failure the kstring may already own a buffer, so every exit
        // from here on goes through the common cleanup.
        if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) goto done;
        if (hdr_ks.l > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            goto done;
        } else if (hdr_ks.l > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = hdr_ks.s;
        l_text = hdr_ks.l;
    } else {
        if (h->l_text > UINT32_MAX) {
            hts_log_error("Header too long for BAM format");
            goto done;
        } else if (h->l_text > INT32_MAX) {
            hts_log_warning("Header too long for BAM specification (>2GB)");
            hts_log_warning("Output file may not be portable");
        }
        text = h->text;
        l_text = h->l_text;
    }

    // write "BAM1"
    if (bgzf_write(fp, "BAM\1", 4) < 0) goto done;

    // write plain text and the number of reference sequences
    if (fp->is_be) {
        x = ed_swap_4(l_text);
        if (bgzf_write(fp, &x, 4) < 0) goto done;
        if (l_text) {
            if (bgzf_write(fp, text, l_text) < 0) goto done;
        }
        x = ed_swap_4(h->n_targets);
        if (bgzf_write(fp, &x, 4) < 0) goto done;
    } else {
        if (bgzf_write(fp, &l_text, 4) < 0) goto done;
        if (l_text) {
            if (bgzf_write(fp, text, l_text) < 0) goto done;
        }
        if (bgzf_write(fp, &h->n_targets, 4) < 0) goto done;
    }

    // write sequence names and lengths
    for (i = 0; i != h->n_targets; ++i) {
        char *p = h->target_name[i];
        name_len = strlen(p) + 1; // the stored length includes the NUL
        if (fp->is_be) {
            x = ed_swap_4(name_len);
            if (bgzf_write(fp, &x, 4) < 0) goto done;
        } else {
            if (bgzf_write(fp, &name_len, 4) < 0) goto done;
        }
        if (bgzf_write(fp, p, name_len) < 0) goto done;
        if (fp->is_be) {
            x = ed_swap_4(h->target_len[i]);
            if (bgzf_write(fp, &x, 4) < 0) goto done;
        } else {
            if (bgzf_write(fp, &h->target_len[i], 4) < 0) goto done;
        }
    }
    if (bgzf_flush(fp) < 0) goto done;
    ret = 0;

 done:
    free(hdr_ks.s); // NULL unless the text was rebuilt from hrecs
    return ret;
}
403
404
/*
 * Parse region string s by handing it to hts_parse_region(), using
 * bam_name2id() as the name-to-id callback and h as its data argument.
 * The result of hts_parse_region() is returned unchanged.
 */
const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid,
                             hts_pos_t *beg, hts_pos_t *end, int flags) {
    hts_name2id_f name_lookup = (hts_name2id_f)bam_name2id;

    return hts_parse_region(s, tid, beg, end, name_lookup, h, flags);
}
408
409
/*************************
410
 *** BAM alignment I/O ***
411
 *************************/
412
413
bam1_t *bam_init1()
414
885
{
415
885
    return (bam1_t*)calloc(1, sizeof(bam1_t));
416
885
}
417
418
/*
 * Grow the data buffer of b so that it can hold at least `desired` bytes.
 *
 * The new capacity is `desired` rounded up by kroundup32().  If that value
 * cannot be stored in 32 bits, errno is set to ENOMEM and -1 is returned.
 * When the BAM_USER_OWNS_DATA policy bit is set the existing buffer is not
 * reallocated: a fresh buffer is allocated, the current contents are copied
 * into it and the policy bit is cleared.
 *
 * Returns 0 on success, -1 on failure (b is left unchanged).
 */
int sam_realloc_bam_data(bam1_t *b, size_t desired)
{
    uint32_t capacity = desired;
    uint8_t *buffer;

    kroundup32(capacity);
    if (capacity < desired) {
        errno = ENOMEM; // Not strictly true but we can't store the size
        return -1;
    }

    if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0) {
        buffer = malloc(capacity);
        if (buffer != NULL) {
            if (b->l_data > 0) {
                memcpy(buffer, b->data,
                       b->l_data < b->m_data ? b->l_data : b->m_data);
            }
            bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA));
        }
    } else {
        buffer = realloc(b->data, capacity);
    }

    if (buffer == NULL)
        return -1;

    b->m_data = capacity;
    b->data = buffer;
    return 0;
}
443
444
/*
 * Release an alignment record, honouring its memory policy bits.
 *
 * The data buffer is freed unless BAM_USER_OWNS_DATA is set, and the
 * structure is freed unless BAM_USER_OWNS_STRUCT is set.  When the data is
 * freed but the structure is kept, the data fields are reset so the
 * structure can be reused.  A NULL pointer is ignored.
 */
void bam_destroy1(bam1_t *b)
{
    int keep_struct, keep_data;

    if (b == NULL)
        return;

    keep_data   = (bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) != 0;
    keep_struct = (bam_get_mempolicy(b) & BAM_USER_OWNS_STRUCT) != 0;

    if (!keep_data) {
        free(b->data);
        if (keep_struct) {
            // In case of reuse
            b->data = NULL;
            b->m_data = 0;
            b->l_data = 0;
        }
    }

    if (!keep_struct)
        free(b);
}
460
461
/*
 * Copy alignment record bsrc into bdst, growing bdst's data buffer when
 * needed.  The variable-length data, the core structure, l_data and id are
 * copied.
 *
 * Returns bdst on success, or NULL if the data buffer could not be grown.
 */
bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc)
{
    // Copying a record onto itself is a no-op; returning early also avoids
    // calling memcpy() with identical (overlapping) source and destination.
    if (bdst == bsrc) return bdst;

    if (realloc_bam_data(bdst, bsrc->l_data) < 0) return NULL;
    // An empty record may have NULL data pointers on either side, which must
    // not be passed to memcpy() even with a zero length.
    if (bsrc->l_data > 0)
        memcpy(bdst->data, bsrc->data, bsrc->l_data); // copy var-len data
    memcpy(&bdst->core, &bsrc->core, sizeof(bsrc->core)); // copy the rest
    bdst->l_data = bsrc->l_data;
    bdst->id = bsrc->id;
    return bdst;
}
470
471
/*
 * Return a newly allocated copy of bsrc, or NULL if bsrc is NULL or either
 * the allocation or the copy fails.
 */
bam1_t *bam_dup1(const bam1_t *bsrc)
{
    bam1_t *copy;

    if (!bsrc)
        return NULL;

    copy = bam_init1();
    if (copy != NULL && bam_copy1(copy, bsrc) == NULL) {
        bam_destroy1(copy);
        copy = NULL;
    }
    return copy;
}
482
483
/*
 * Sum the lengths of the CIGAR operations in one pass: operations whose
 * bam_cigar_type() has bit 1 set are added to *qlen, those with bit 2 set
 * are added to *rlen.  Both outputs are reset to zero first.
 */
static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar,
                             hts_pos_t *rlen, hts_pos_t *qlen)
{
    int idx = 0;

    *qlen = 0;
    *rlen = 0;
    while (idx < n_cigar) {
        const uint32_t elem = cigar[idx++];
        int consumes = bam_cigar_type(bam_cigar_op(elem));
        int span = bam_cigar_oplen(elem);

        if (consumes & 1) *qlen += span;
        if (consumes & 2) *rlen += span;
    }
}
495
496
/*
 * Deduct `length` from the remaining budget in *limit.
 * Returns 0 after subtracting, or -1 (leaving *limit untouched) when
 * `length` is larger than what is left.
 */
static int subtract_check_underflow(size_t length, size_t *limit)
{
    if (length > *limit)
        return -1;

    *limit -= length;
    return 0;
}
505
506
/*
 * Fill in alignment record `bam` from its individual fields.
 *
 * The data buffer is laid out as: query name (NUL terminated and padded),
 * CIGAR, sequence packed two bases per byte, then qualities.  Room for
 * l_aux bytes of aux data is reserved but not counted in l_data.  When
 * l_qname is 0 the name "*" is stored; when qual is NULL the qualities are
 * filled with 0xff.
 *
 * Returns the number of data bytes written on success.  Returns -1 with
 * errno set to EINVAL when a parameter is rejected, or -1 when the data
 * buffer cannot be grown.
 */
int bam_set1(bam1_t *bam,
             size_t l_qname, const char *qname,
             uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
             size_t n_cigar, const uint32_t *cigar,
             int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
             size_t l_seq, const char *seq, const char *qual,
             size_t l_aux)
{
    // use a default qname "*" if none is provided
    if (l_qname == 0) {
        qname = "*";
        l_qname = 1;
    }

    // note: the qname is stored nul terminated and padded as described in the
    // documentation for the bam1_t struct.  This yields 1 to 4 NUL bytes.
    const size_t qname_nuls = 4 - l_qname % 4;
    const size_t qname_bytes = l_qname + qname_nuls;
    const size_t cigar_bytes = n_cigar * 4;

    // the alignment length, needed for bam_reg2bin(), is calculated as in
    // bam_endpos().  bam_endpos() cannot be used directly because some of the
    // fields it reads are not set up yet.
    hts_pos_t rlen = 0, qlen = 0;
    if (!(flag & BAM_FUNMAP))
        bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen);
    if (rlen == 0)
        rlen = 1;

    // validate parameters
    if (l_qname > 254) {
        hts_log_error("Query name too long");
        errno = EINVAL;
        return -1;
    }
    if (HTS_POS_MAX - rlen <= pos) {
        hts_log_error("Read ends beyond highest supported position");
        errno = EINVAL;
        return -1;
    }
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) {
        hts_log_error("Mapped query must have a CIGAR");
        errno = EINVAL;
        return -1;
    }
    if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) {
        hts_log_error("CIGAR and query sequence are of different length");
        errno = EINVAL;
        return -1;
    }

    // every component must fit, one after another, within INT32_MAX bytes
    size_t budget = INT32_MAX;
    int overflow = subtract_check_underflow(qname_bytes, &budget);
    overflow    += subtract_check_underflow(cigar_bytes, &budget);
    overflow    += subtract_check_underflow((l_seq + 1) / 2, &budget);
    overflow    += subtract_check_underflow(l_seq, &budget);
    overflow    += subtract_check_underflow(l_aux, &budget);
    if (overflow != 0) {
        hts_log_error("Size overflow");
        errno = EINVAL;
        return -1;
    }

    // re-allocate the data buffer as needed.
    const size_t data_len = qname_bytes + cigar_bytes + (l_seq + 1) / 2 + l_seq;
    if (realloc_bam_data(bam, data_len + l_aux) < 0)
        return -1;

    bam->l_data = (int)data_len;
    bam->core.pos = pos;
    bam->core.tid = tid;
    bam->core.bin = bam_reg2bin(pos, pos + rlen);
    bam->core.qual = mapq;
    bam->core.l_extranul = (uint8_t)(qname_nuls - 1);
    bam->core.flag = flag;
    bam->core.l_qname = (uint16_t)qname_bytes;
    bam->core.n_cigar = (uint32_t)n_cigar;
    bam->core.l_qseq = (int32_t)l_seq;
    bam->core.mtid = mtid;
    bam->core.mpos = mpos;
    bam->core.isize = isize;

    // query name followed by its NUL padding
    uint8_t *out = bam->data;
    strncpy((char *)out, qname, l_qname);
    memset(out + l_qname, '\0', qname_nuls);
    out += qname_bytes;

    if (n_cigar > 0)
        memcpy(out, cigar, cigar_bytes);
    out += cigar_bytes;

    // sequence: two bases per byte, first base in the high nibble
    size_t base;
    for (base = 0; base + 1 < l_seq; base += 2) {
        *out++ = (seq_nt16_table[(unsigned char)seq[base]] << 4)
               | seq_nt16_table[(unsigned char)seq[base + 1]];
    }
    if (base < l_seq) {
        // odd length: the final base occupies the high nibble on its own
        *out++ = seq_nt16_table[(unsigned char)seq[base]] << 4;
    }

    if (qual != NULL)
        memcpy(out, qual, l_seq);
    else
        memset(out, '\xff', l_seq);

    return (int)data_len;
}
617
618
/*
 * Total length of the CIGAR operations whose bam_cigar_type() has bit 1
 * set (the operations counted towards the query length).
 */
hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; idx++) {
        if ((bam_cigar_type(bam_cigar_op(cigar[idx])) & 1) == 0)
            continue;
        total += bam_cigar_oplen(cigar[idx]);
    }
    return total;
}
627
628
/*
 * Total length of the CIGAR operations whose bam_cigar_type() has bit 2
 * set (the operations counted towards the reference length).
 */
hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar)
{
    hts_pos_t total = 0;
    int idx;

    for (idx = 0; idx < n_cigar; idx++) {
        if ((bam_cigar_type(bam_cigar_op(cigar[idx])) & 2) == 0)
            continue;
        total += bam_cigar_oplen(cigar[idx]);
    }
    return total;
}
637
638
hts_pos_t bam_endpos(const bam1_t *b)
639
350
{
640
350
    hts_pos_t rlen = (b->core.flag & BAM_FUNMAP)? 0 : bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
641
350
    if (rlen == 0) rlen = 1;
642
350
    return b->core.pos + rlen;
643
350
}
644
645
// Replace a placeholder CIGAR with the real CIGAR stored in the record's
// CG aux tag (type B with subtype I or i), removing the tag afterwards.
//   b            - record to modify in place
//   recal_bin    - when non-zero, core.bin is recomputed from the new CIGAR
//   give_warning - when non-zero, a message naming the record is logged
// Returns 1 if the CIGAR was replaced, 0 if the record was left untouched,
// and -1 if the aux lookup failed for a reason other than ENOENT or the data
// buffer could not be expanded.
static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 if CIGAR is untouched; 1 if CIGAR is updated with CG
646
5.98k
{
647
5.98k
    bam1_core_t *c = &b->core;
648
5.98k
    uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len, fake_bytes;
649
5.98k
    uint8_t *CG;
650
651
    // test where there is a real CIGAR in the CG tag to move
652
5.98k
    if (c->n_cigar == 0 || c->tid < 0 || c->pos < 0) return 0;
653
1.13k
    cigar0 = bam_get_cigar(b);
654
1.13k
    // Only a CIGAR that starts with a soft clip covering the whole query
    // sequence is treated as a placeholder.
    if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0;
655
970
    // Size in bytes of the placeholder CIGAR currently in the record
    fake_bytes = c->n_cigar * 4;
656
970
    int saved_errno = errno;
657
970
    CG = bam_aux_get(b, "CG");
658
970
    if (!CG) {
659
352
        if (errno != ENOENT) return -1;  // Bad aux data
660
352
        errno = saved_errno; // restore errno on expected no-CG-tag case
661
352
        return 0;
662
352
    }
663
618
    if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i'))
664
200
        return 0; // not of type B,I
665
418
    // Element count of the array, read from the four bytes after the type
    // and subtype bytes
    CG_len = le_to_u32(CG + 2);
666
418
    // The upper bound of 1<<29 elements keeps CG_len * 4 below 2^31
    if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length
667
668
    // move from the CG tag to the right position
669
350
    // All positions are kept as offsets from b->data because the buffer may
    // be reallocated below.
    cigar_st = (uint8_t*)cigar0 - b->data;
670
350
    c->n_cigar = CG_len;
671
350
    n_cigar4 = c->n_cigar * 4;
672
350
    // CG points at the type byte; the two-character tag name precedes it
    CG_st = CG - b->data - 2;
673
350
    // End of the tag: 8 bytes of tag header plus the array payload
    CG_en = CG_st + 8 + n_cigar4;
674
350
    if (possibly_expand_bam_data(b, n_cigar4 - fake_bytes) < 0) return -1;
675
350
    b->l_data = b->l_data - fake_bytes + n_cigar4; // we need c->n_cigar-fake_bytes bytes to swap CIGAR to the right place
676
350
    memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + fake_bytes, ori_len - (cigar_st + fake_bytes)); // insert c->n_cigar-fake_bytes empty space to make room
677
350
    memcpy(b->data + cigar_st, b->data + (n_cigar4 - fake_bytes) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -fake_bytes for the fake CIGAR
678
350
    if (ori_len > CG_en) // move data after the CG tag
679
26
        memmove(b->data + CG_st + n_cigar4 - fake_bytes, b->data + CG_en + n_cigar4 - fake_bytes, ori_len - CG_en);
680
350
    b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4)
681
350
    if (recal_bin)
682
350
        b->core.bin = hts_reg2bin(b->core.pos, bam_endpos(b), 14, 5);
683
350
    if (give_warning)
684
350
        hts_log_error("%s encodes a CIGAR with %d operators at the CG tag", bam_get_qname(b), c->n_cigar);
685
350
    return 1;
686
350
}
687
688
/*
 * Map an aux type code to the size in bytes of its value:
 * 1 for A/c/C, 2 for s/S, 4 for i/I/f and 8 for d.  The codes Z, H and B
 * are returned unchanged, and any other code gives 0.
 */
static inline int aux_type2size(uint8_t type)
{
    if (type == 'A' || type == 'c' || type == 'C')
        return 1;
    if (type == 's' || type == 'S')
        return 2;
    if (type == 'i' || type == 'I' || type == 'f')
        return 4;
    if (type == 'd')
        return 8;
    if (type == 'Z' || type == 'H' || type == 'B')
        return type;
    return 0;
}
705
706
static void swap_data(const bam1_core_t *c, int l_data, uint8_t *data, int is_host)
707
0
{
708
0
    uint32_t *cigar = (uint32_t*)(data + c->l_qname);
709
0
    uint32_t i;
710
0
    for (i = 0; i < c->n_cigar; ++i) ed_swap_4p(&cigar[i]);
711
0
}
712
713
// Fix bad records where qname is not terminated correctly.
714
229
static int fixup_missing_qname_nul(bam1_t *b) {
715
229
    bam1_core_t *c = &b->core;
716
717
    // Note this is called before c->l_extranul is added to c->l_qname
718
229
    if (c->l_extranul > 0) {
719
184
        b->data[c->l_qname++] = '\0';
720
184
        c->l_extranul--;
721
184
    } else {
722
45
        if (b->l_data > INT_MAX - 4) return -1;
723
45
        if (realloc_bam_data(b, b->l_data + 4) < 0) return -1;
724
45
        b->l_data += 4;
725
45
        b->data[c->l_qname++] = '\0';
726
45
        c->l_extranul = 3;
727
45
    }
728
229
    return 0;
729
229
}
730
731
/*
732
 * Note a second interface that returns a bam pointer instead would avoid bam_copy1
733
 * in multi-threaded handling.  This may be worth considering for htslib2.
734
 */
735
// Read one BAM alignment record from fp into b.
// Returns 4 + block_len on success.  Error returns:
//   -1  no bytes could be read for the record length (normal end of file)
//   -2  any other short or failed read of the record length
//   -3  the 32-byte fixed-size core could not be read in full
//   -4  the record is invalid or truncated, memory could not be allocated,
//       or the CIGAR and sequence lengths disagree
int bam_read1(BGZF *fp, bam1_t *b)
736
357
{
737
357
    bam1_core_t *c = &b->core;
738
357
    int32_t block_len, ret, i;
739
357
    uint32_t x[8], new_l_data;
740
741
357
    b->l_data = 0;
742
743
357
    if ((ret = bgzf_read(fp, &block_len, 4)) != 4) {
744
1
        if (ret == 0) return -1; // normal end-of-file
745
1
        else return -2; // truncated
746
1
    }
747
356
    if (fp->is_be)
748
0
        ed_swap_4p(&block_len);
749
356
    if (block_len < 32) return -4;  // block_len includes core data
750
356
    if (bgzf_read(fp, x, 32) != 32) return -3;
751
353
    if (fp->is_be) {
752
0
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
753
0
    }
754
353
    // Unpack the eight 32-bit words of the fixed-size core
    c->tid = x[0]; c->pos = (int32_t)x[1];
755
353
    c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff;
756
353
    // Number of NULs needed to pad the query name to a multiple of 4 bytes
    c->l_extranul = (c->l_qname%4 != 0)? (4 - c->l_qname%4) : 0;
757
353
    c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff;
758
353
    c->l_qseq = x[4];
759
353
    c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7];
760
761
353
    // Bytes following the core, plus the qname padding added in memory
    new_l_data = block_len - 32 + c->l_extranul;
762
353
    if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4;
763
353
    // The CIGAR, padded qname, packed sequence and qualities must all fit
    // within the record; the sum is done in 64 bits.
    if (((uint64_t) c->n_cigar << 2) + c->l_qname + c->l_extranul
764
353
        + (((uint64_t) c->l_qseq + 1) >> 1) + c->l_qseq > (uint64_t) new_l_data)
765
1
        return -4;
766
352
    if (realloc_bam_data(b, new_l_data) < 0) return -4;
767
352
    b->l_data = new_l_data;
768
769
352
    // The qname is read on its own so that padding can be inserted after it
    if (bgzf_read(fp, b->data, c->l_qname) != c->l_qname) return -4;
770
352
    if (b->data[c->l_qname - 1] != '\0') { // Try to fix missing NUL termination
771
229
        if (fixup_missing_qname_nul(b) < 0) return -4;
772
229
    }
773
677
    for (i = 0; i < c->l_extranul; ++i) b->data[c->l_qname+i] = '\0';
774
352
    c->l_qname += c->l_extranul;
775
352
    // Read everything after the padded qname in a single call
    if (b->l_data < c->l_qname ||
776
352
        bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname)
777
9
        return -4;
778
343
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
779
343
    // Move a real CIGAR out of the CG tag, if the record carries one
    if (bam_tag2cigar(b, 0, 0) < 0)
780
0
        return -4;
781
782
343
    if (c->n_cigar > 0) { // recompute "bin" and check CIGAR-qlen consistency
783
299
        hts_pos_t rlen, qlen;
784
299
        bam_cigar2rqlens(c->n_cigar, bam_get_cigar(b), &rlen, &qlen);
785
299
        if ((b->core.flag & BAM_FUNMAP) || rlen == 0) rlen = 1;
786
299
        b->core.bin = hts_reg2bin(b->core.pos, b->core.pos + rlen, 14, 5);
787
        // Sanity check for broken CIGAR alignments
788
299
        if (c->l_qseq > 0 && !(c->flag & BAM_FUNMAP) && qlen != c->l_qseq) {
789
0
            hts_log_error("CIGAR and query sequence lengths differ for %s",
790
0
                    bam_get_qname(b));
791
0
            return -4;
792
0
        }
793
299
    }
794
795
343
    return 4 + block_len;
796
343
}
797
798
/*
 * Write one alignment record to a BAM stream.
 *
 * Records with more than 65535 CIGAR operations are written with a
 * two-operation placeholder CIGAR (<read_length>S<ref_length>N) and the real
 * CIGAR is moved to a trailing CG:B,I tag.
 *
 * Every check that can reject the record is made before anything is written
 * and before the big-endian byte swap, so a rejected record leaves no partial
 * output in the stream and b is left exactly as it was passed in.
 *
 * Returns 4 + the record's block length on success, or -1 on failure.
 * errno is set to EOVERFLOW when the record cannot be represented in BAM.
 */
int bam_write1(BGZF *fp, const bam1_t *b)
{
    const bam1_core_t *c = &b->core;
    uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y;
    const int long_cigar = c->n_cigar > 0xffff;
    hts_pos_t cigreflen = 0;
    int i, ok;

    if (c->l_qname - c->l_extranul > 255) {
        hts_log_error("QNAME \"%s\" is longer than 254 characters", bam_get_qname(b));
        errno = EOVERFLOW;
        return -1;
    }
    if (c->pos > INT_MAX ||
        c->mpos > INT_MAX ||
        c->isize < INT_MIN || c->isize > INT_MAX) {
        hts_log_error("Positional data is too large for BAM format");
        errno = EOVERFLOW;
        return -1;
    }
    if (long_cigar) {
        // Must be computed while the CIGAR is still in host byte order.
        cigreflen = bam_cigar2rlen(c->n_cigar, bam_get_cigar(b));
        if (cigreflen >= (1<<28)) {
            // Length of reference covered is greater than the biggest
            // CIGAR operation currently allowed.
            hts_log_error("Record %s with %d CIGAR ops and ref length %"PRIhts_pos
                          " cannot be written in BAM.  Try writing SAM or CRAM instead.\n",
                          bam_get_qname(b), c->n_cigar, cigreflen);
            errno = EOVERFLOW;
            return -1;
        }
        block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR
    }

    x[0] = c->tid;
    x[1] = c->pos;
    x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul);
    if (long_cigar) x[3] = (uint32_t)c->flag << 16 | 2;
    else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff);
    x[4] = c->l_qseq;
    x[5] = c->mtid;
    x[6] = c->mpos;
    x[7] = c->isize;

    ok = (bgzf_flush_try(fp, 4 + block_len) >= 0);
    if (fp->is_be) {
        for (i = 0; i < 8; ++i) ed_swap_4p(x + i);
        y = block_len;
        if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0);
        // Converted back by the matching swap_data() call before returning.
        swap_data(c, b->l_data, b->data, 1);
    } else {
        if (ok) ok = (bgzf_write(fp, &block_len, 4) >= 0);
    }
    if (ok) ok = (bgzf_write(fp, x, 32) >= 0);
    // The padding NULs after the query name are not written to the file.
    if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0);
    if (!long_cigar) { // no long CIGAR; write normally
        if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0);
    } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag
        uint8_t buf[8];
        uint32_t cigar_st, cigar_en, cigar[2];
        cigar_st = (uint8_t*)bam_get_cigar(b) - b->data;
        cigar_en = cigar_st + c->n_cigar * 4;
        cigar[0] = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP;
        cigar[1] = (uint32_t)cigreflen << 4 | BAM_CREF_SKIP;
        u32_to_le(cigar[0], buf);
        u32_to_le(cigar[1], buf + 4);
        if (ok) ok = (bgzf_write(fp, buf, 8) >= 0); // write cigar: <read_length>S<ref_length>N
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR
        if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I
        u32_to_le(c->n_cigar, buf);
        if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length
        if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR
    }
    if (fp->is_be) swap_data(c, b->l_data, b->data, 0);
    return ok? 4 + block_len : -1;
}
865
866
/*
867
 * Write a BAM file and append to the in-memory index simultaneously.
868
 */
869
0
static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) {
870
0
    BGZF *bfp = fp->fp.bgzf;
871
872
0
    if (!fp->idx)
873
0
        return bam_write1(bfp, b);
874
875
0
    uint32_t block_len = b->l_data - b->core.l_extranul + 32;
876
0
    if (bgzf_flush_try(bfp, 4 + block_len) < 0)
877
0
        return -1;
878
0
    if (!bfp->mt)
879
0
        hts_idx_amend_last(fp->idx, bgzf_tell(bfp));
880
0
    else
881
0
        bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp));
882
883
0
    int ret = bam_write1(bfp, b);
884
0
    if (ret < 0)
885
0
        return -1;
886
887
0
    if (bgzf_idx_push(bfp, fp->idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(bfp), !(b->core.flag&BAM_FUNMAP)) < 0) {
888
0
        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
889
0
                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
890
0
        ret = -1;
891
0
    }
892
893
0
    return ret;
894
0
}
895
896
/*
897
 * Set the qname in a BAM record
898
 */
899
int bam_set_qname(bam1_t *rec, const char *qname)
900
0
{
901
0
    if (!rec) return -1;
902
0
    if (!qname || !*qname) return -1;
903
904
0
    size_t old_len = rec->core.l_qname;
905
0
    size_t new_len = strlen(qname) + 1;
906
0
    if (new_len < 1 || new_len > 255) return -1;
907
908
0
    int extranul = (new_len%4 != 0) ? (4 - new_len%4) : 0;
909
910
0
    size_t new_data_len = rec->l_data - old_len + new_len + extranul;
911
0
    if (realloc_bam_data(rec, new_data_len) < 0) return -1;
912
913
    // Make room
914
0
    if (new_len + extranul != rec->core.l_qname)
915
0
        memmove(rec->data + new_len + extranul, rec->data + rec->core.l_qname, rec->l_data - rec->core.l_qname);
916
    // Copy in new name and pad if needed
917
0
    memcpy(rec->data, qname, new_len);
918
0
    int n;
919
0
    for (n = 0; n < extranul; n++) rec->data[new_len + n] = '\0';
920
921
0
    rec->l_data = new_data_len;
922
0
    rec->core.l_qname = new_len + extranul;
923
0
    rec->core.l_extranul = extranul;
924
925
0
    return 0;
926
0
}
927
928
/********************
929
 *** BAM indexing ***
930
 ********************/
931
932
/*
 * Build an in-memory index by reading every record from fp.
 *
 * min_shift > 0 selects a CSI index whose depth is sized from the longest
 * reference in the header; otherwise a BAI index (min_shift 14, 5 levels)
 * is built.
 *
 * Returns the finished index, or NULL on any failure (header unreadable,
 * allocation failure, a record that cannot be indexed, a read error, or a
 * failure to finish the index).  The header and record buffer are released
 * on every path.
 */
static hts_idx_t *sam_index(htsFile *fp, int min_shift)
{
    int n_lvls, i, fmt, ret;
    bam1_t *b = NULL;
    hts_idx_t *idx = NULL;
    sam_hdr_t *h;

    h = sam_hdr_read(fp);
    if (h == NULL) return NULL;
    if (min_shift > 0) {
        hts_pos_t max_len = 0, s;
        for (i = 0; i < h->n_targets; ++i) {
            hts_pos_t len = sam_hdr_tid2len(h, i);
            if (max_len < len) max_len = len;
        }
        max_len += 256;
        // Shift in the 64-bit type so large min_shift values do not overflow int.
        for (n_lvls = 0, s = (hts_pos_t)1 << min_shift; max_len > s; ++n_lvls, s <<= 3);
        fmt = HTS_FMT_CSI;
    } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;

    idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
    if (idx == NULL) goto err;
    b = bam_init1();
    if (b == NULL) goto err;

    while ((ret = sam_read1(fp, h, b)) >= 0) {
        ret = hts_idx_push(idx, b->core.tid, b->core.pos, bam_endpos(b), bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP));
        if (ret < 0) { // unsorted or doesn't fit
            // The name lookup can yield NULL; never hand that to %s.
            const char *rname = sam_hdr_tid2name(h, b->core.tid);
            hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed", bam_get_qname(b), rname ? rname : "*", sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
            goto err;
        }
    }
    if (ret < -1) goto err; // corrupted BAM file

    if (hts_idx_finish(idx, bgzf_tell(fp->fp.bgzf)) < 0) goto err;
    sam_hdr_destroy(h);
    bam_destroy1(b);
    return idx;

err:
    // b and idx may still be NULL if we failed before creating them.
    if (b) bam_destroy1(b);
    if (idx) hts_idx_destroy(idx);
    sam_hdr_destroy(h);
    return NULL;
}
971
972
int sam_index_build3(const char *fn, const char *fnidx, int min_shift, int nthreads)
973
0
{
974
0
    hts_idx_t *idx;
975
0
    htsFile *fp;
976
0
    int ret = 0;
977
978
0
    if ((fp = hts_open(fn, "r")) == 0) return -2;
979
0
    if (nthreads)
980
0
        hts_set_threads(fp, nthreads);
981
982
0
    switch (fp->format.format) {
983
0
    case cram:
984
985
0
        ret = cram_index_build(fp->fp.cram, fn, fnidx);
986
0
        break;
987
988
0
    case bam:
989
0
    case sam:
990
0
        if (fp->format.compression != bgzf) {
991
0
            hts_log_error("%s file \"%s\" not BGZF compressed",
992
0
                          fp->format.format == bam ? "BAM" : "SAM", fn);
993
0
            ret = -1;
994
0
            break;
995
0
        }
996
0
        idx = sam_index(fp, min_shift);
997
0
        if (idx) {
998
0
            ret = hts_idx_save_as(idx, fn, fnidx, (min_shift > 0)? HTS_FMT_CSI : HTS_FMT_BAI);
999
0
            if (ret < 0) ret = -4;
1000
0
            hts_idx_destroy(idx);
1001
0
        }
1002
0
        else ret = -1;
1003
0
        break;
1004
1005
0
    default:
1006
0
        ret = -3;
1007
0
        break;
1008
0
    }
1009
0
    hts_close(fp);
1010
1011
0
    return ret;
1012
0
}
1013
1014
// Same as sam_index_build3() with the thread count fixed at 0.
int sam_index_build2(const char *fn, const char *fnidx, int min_shift)
{
    const int no_threads = 0;

    return sam_index_build3(fn, fnidx, min_shift, no_threads);
}
1018
1019
// Same as sam_index_build3() with no explicit index file name (NULL) and
// the thread count fixed at 0.
int sam_index_build(const char *fn, int min_shift)
{
    const char *default_fnidx = NULL;

    return sam_index_build3(fn, default_fnidx, min_shift, 0);
}
1023
1024
// Provide bam_index_build() symbol for binary compatibility with earlier HTSlib
1025
#undef bam_index_build
1026
int bam_index_build(const char *fn, int min_shift)
{
    // Forwards to sam_index_build2() with no explicit index file name.
    int status = sam_index_build2(fn, NULL, min_shift);
    return status;
}
1030
1031
// Initialise fp->idx for the current format type.
1032
// This must be called after the header has been written but no other data.
1033
0
int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx) {
1034
0
    fp->fnidx = fnidx;
1035
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1036
0
        (fp->format.format == sam && fp->format.compression == bgzf)) {
1037
0
        int n_lvls, fmt = HTS_FMT_CSI;
1038
0
        if (min_shift > 0) {
1039
0
            int64_t max_len = 0, s;
1040
0
            int i;
1041
0
            for (i = 0; i < h->n_targets; ++i)
1042
0
                if (max_len < h->target_len[i]) max_len = h->target_len[i];
1043
0
            max_len += 256;
1044
0
            for (n_lvls = 0, s = 1<<min_shift; max_len > s; ++n_lvls, s <<= 3);
1045
1046
0
        } else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_BAI;
1047
1048
0
        fp->idx = hts_idx_init(h->n_targets, fmt, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls);
1049
0
        return fp->idx ? 0 : -1;
1050
0
    }
1051
1052
0
    if (fp->format.format == cram) {
1053
0
        fp->fp.cram->idxfp = bgzf_open(fnidx, "wg");
1054
0
        return fp->fp.cram->idxfp ? 0 : -1;
1055
0
    }
1056
1057
0
    return -1;
1058
0
}
1059
1060
// Finishes an index. Call after the last record has been written.
1061
// Returns 0 on success, <0 on failure.
1062
0
int sam_idx_save(htsFile *fp) {
1063
0
    if (fp->format.format == bam || fp->format.format == bcf ||
1064
0
        fp->format.format == vcf || fp->format.format == sam) {
1065
0
        int ret;
1066
0
        if ((ret = sam_state_destroy(fp)) < 0) {
1067
0
            errno = -ret;
1068
0
            return -1;
1069
0
        }
1070
0
        if (bgzf_flush(fp->fp.bgzf) < 0)
1071
0
            return -1;
1072
0
        hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf));
1073
1074
0
        if (hts_idx_finish(fp->idx, bgzf_tell(fp->fp.bgzf)) < 0)
1075
0
            return -1;
1076
1077
0
        return hts_idx_save_as(fp->idx, NULL, fp->fnidx, hts_idx_fmt(fp->idx));
1078
1079
0
    } else if (fp->format.format == cram) {
1080
        // flushed and closed by cram_close
1081
0
    }
1082
1083
0
    return 0;
1084
0
}
1085
1086
// Iterator read callback: reads the next record from the htsFile passed as
// fpv into the bam1_t passed as bv.  On success also reports the record's
// tid, start and end position.  Returns the sam_read1() result.
static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *file = (htsFile *)fpv;
    bam1_t *rec = bv;

    file->line.l = 0;
    int status = sam_read1(file, file->bam_header, rec);
    if (status < 0)
        return status;

    *tid = rec->core.tid;
    *beg = rec->core.pos;
    *end = bam_endpos(rec);
    return status;
}
1099
1100
// This is used only with read_rest=1 iterators, so need not set tid/beg/end.
1101
static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
1102
0
{
1103
0
    htsFile *fp = (htsFile *)fpv;
1104
0
    bam1_t *b = bv;
1105
0
    fp->line.l = 0;
1106
0
    int ret = sam_read1(fp, fp->bam_header, b);
1107
0
    return ret;
1108
0
}
1109
1110
// Internal (for now) func used by bam_sym_lookup.  This is copied from
1111
// samtools/bam.c.
1112
static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b)
1113
0
{
1114
0
    const char *rg;
1115
0
    kstring_t lib = { 0, 0, NULL };
1116
0
    rg = (char *)bam_aux_get(b, "RG");
1117
1118
0
    if (!rg)
1119
0
        return NULL;
1120
0
    else
1121
0
        rg++;
1122
1123
0
    if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib)  < 0)
1124
0
        return NULL;
1125
1126
0
    static char LB_text[1024];
1127
0
    int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1;
1128
1129
0
    memcpy(LB_text, lib.s, len);
1130
0
    LB_text[len] = 0;
1131
1132
0
    free(lib.s);
1133
1134
0
    return LB_text;
1135
0
}
1136
1137
1138
// Bam record pointer and SAM header combined
1139
typedef struct {
1140
    const sam_hdr_t *h;  // header used for reference-name and library lookups
1141
    const bam1_t *b;     // record whose fields and aux tags are looked up
1142
} hb_pair;
1143
1144
// Looks up variable names in str and replaces them with their value.
1145
// Also supports aux tags.
1146
//
1147
// Note the expression parser deliberately overallocates str size so it
1148
// is safe to use memcmp over strcmp.
1149
static int bam_sym_lookup(void *data, char *str, char **end,
1150
0
                          hts_expr_val_t *res) {
1151
0
    hb_pair *hb = (hb_pair *)data;
1152
0
    const bam1_t *b = hb->b;
1153
1154
0
    res->is_str = 0;
1155
0
    switch(*str) {
1156
0
    case 'c':
1157
0
        if (memcmp(str, "cigar", 5) == 0) {
1158
0
            *end = str+5;
1159
0
            res->is_str = 1;
1160
0
            ks_clear(&res->s);
1161
0
            uint32_t *cigar = bam_get_cigar(b);
1162
0
            int i, n = b->core.n_cigar, r = 0;
1163
0
            if (n) {
1164
0
                for (i = 0; i < n; i++) {
1165
0
                    r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0;
1166
0
                    r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0;
1167
0
                }
1168
0
                r |= kputs("", &res->s) < 0;
1169
0
            } else {
1170
0
                r |= kputs("*", &res->s) < 0;
1171
0
            }
1172
0
            return r ? -1 : 0;
1173
0
        }
1174
0
        break;
1175
1176
0
    case 'e':
1177
0
        if (memcmp(str, "endpos", 6) == 0) {
1178
0
            *end = str+6;
1179
0
            res->d = bam_endpos(b);
1180
0
            return 0;
1181
0
        }
1182
0
        break;
1183
1184
0
    case 'f':
1185
0
        if (memcmp(str, "flag", 4) == 0) {
1186
0
            str = *end = str+4;
1187
0
            if (*str != '.') {
1188
0
                res->d = b->core.flag;
1189
0
                return 0;
1190
0
            } else {
1191
0
                str++;
1192
0
                if (!memcmp(str, "paired", 6)) {
1193
0
                    *end = str+6;
1194
0
                    res->d = b->core.flag & BAM_FPAIRED;
1195
0
                    return 0;
1196
0
                } else if (!memcmp(str, "proper_pair", 11)) {
1197
0
                    *end = str+11;
1198
0
                    res->d = b->core.flag & BAM_FPROPER_PAIR;
1199
0
                    return 0;
1200
0
                } else if (!memcmp(str, "unmap", 5)) {
1201
0
                    *end = str+5;
1202
0
                    res->d = b->core.flag & BAM_FUNMAP;
1203
0
                    return 0;
1204
0
                } else if (!memcmp(str, "munmap", 6)) {
1205
0
                    *end = str+6;
1206
0
                    res->d = b->core.flag & BAM_FMUNMAP;
1207
0
                    return 0;
1208
0
                } else if (!memcmp(str, "reverse", 7)) {
1209
0
                    *end = str+7;
1210
0
                    res->d = b->core.flag & BAM_FREVERSE;
1211
0
                    return 0;
1212
0
                } else if (!memcmp(str, "mreverse", 8)) {
1213
0
                    *end = str+8;
1214
0
                    res->d = b->core.flag & BAM_FMREVERSE;
1215
0
                    return 0;
1216
0
                } else if (!memcmp(str, "read1", 5)) {
1217
0
                    *end = str+5;
1218
0
                    res->d = b->core.flag & BAM_FREAD1;
1219
0
                    return 0;
1220
0
                } else if (!memcmp(str, "read2", 5)) {
1221
0
                    *end = str+5;
1222
0
                    res->d = b->core.flag & BAM_FREAD2;
1223
0
                    return 0;
1224
0
                } else if (!memcmp(str, "secondary", 9)) {
1225
0
                    *end = str+9;
1226
0
                    res->d = b->core.flag & BAM_FSECONDARY;
1227
0
                    return 0;
1228
0
                } else if (!memcmp(str, "qcfail", 6)) {
1229
0
                    *end = str+6;
1230
0
                    res->d = b->core.flag & BAM_FQCFAIL;
1231
0
                    return 0;
1232
0
                } else if (!memcmp(str, "dup", 3)) {
1233
0
                    *end = str+3;
1234
0
                    res->d = b->core.flag & BAM_FDUP;
1235
0
                    return 0;
1236
0
                } else if (!memcmp(str, "supplementary", 13)) {
1237
0
                    *end = str+13;
1238
0
                    res->d = b->core.flag & BAM_FSUPPLEMENTARY;
1239
0
                    return 0;
1240
0
                } else {
1241
0
                    hts_log_error("Unrecognised flag string");
1242
0
                    return -1;
1243
0
                }
1244
0
            }
1245
0
        }
1246
0
        break;
1247
1248
0
    case 'l':
1249
0
        if (memcmp(str, "library", 7) == 0) {
1250
0
            *end = str+7;
1251
0
            res->is_str = 1;
1252
0
            const char *lib = bam_get_library(hb->h, b);
1253
0
            kputs(lib ? lib : "", ks_clear(&res->s));
1254
0
            return 0;
1255
0
        }
1256
0
        break;
1257
1258
0
    case 'm':
1259
0
        if (memcmp(str, "mapq", 4) == 0) {
1260
0
            *end = str+4;
1261
0
            res->d = b->core.qual;
1262
0
            return 0;
1263
0
        } else if (memcmp(str, "mpos", 4) == 0) {
1264
0
            *end = str+4;
1265
0
            res->d = b->core.mpos+1;
1266
0
            return 0;
1267
0
        } else if (memcmp(str, "mrname", 6) == 0) {
1268
0
            *end = str+6;
1269
0
            res->is_str = 1;
1270
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1271
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1272
0
            return 0;
1273
0
        } else if (memcmp(str, "mrefid", 6) == 0) {
1274
0
            *end = str+6;
1275
0
            res->d = b->core.mtid;
1276
0
            return 0;
1277
0
        }
1278
0
        break;
1279
1280
0
    case 'n':
1281
0
        if (memcmp(str, "ncigar", 6) == 0) {
1282
0
            *end = str+6;
1283
0
            res->d = b->core.n_cigar;
1284
0
            return 0;
1285
0
        }
1286
0
        break;
1287
1288
0
    case 'p':
1289
0
        if (memcmp(str, "pos", 3) == 0) {
1290
0
            *end = str+3;
1291
0
            res->d = b->core.pos+1;
1292
0
            return 0;
1293
0
        } else if (memcmp(str, "pnext", 5) == 0) {
1294
0
            *end = str+5;
1295
0
            res->d = b->core.mpos+1;
1296
0
            return 0;
1297
0
        }
1298
0
        break;
1299
1300
0
    case 'q':
1301
0
        if (memcmp(str, "qlen", 4) == 0) {
1302
0
            *end = str+4;
1303
0
            res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b));
1304
0
            return 0;
1305
0
        } else if (memcmp(str, "qname", 5) == 0) {
1306
0
            *end = str+5;
1307
0
            res->is_str = 1;
1308
0
            kputs(bam_get_qname(b), ks_clear(&res->s));
1309
0
            return 0;
1310
0
        } else if (memcmp(str, "qual", 4) == 0) {
1311
0
            *end = str+4;
1312
0
            ks_clear(&res->s);
1313
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1314
0
                return -1;
1315
0
            memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq);
1316
0
            res->s.l = b->core.l_qseq;
1317
0
            res->is_str = 1;
1318
0
            return 0;
1319
0
        }
1320
0
        break;
1321
1322
0
    case 'r':
1323
0
        if (memcmp(str, "rlen", 4) == 0) {
1324
0
            *end = str+4;
1325
0
            res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
1326
0
            return 0;
1327
0
        } else if (memcmp(str, "rname", 5) == 0) {
1328
0
            *end = str+5;
1329
0
            res->is_str = 1;
1330
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.tid);
1331
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1332
0
            return 0;
1333
0
        } else if (memcmp(str, "rnext", 5) == 0) {
1334
0
            *end = str+5;
1335
0
            res->is_str = 1;
1336
0
            const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid);
1337
0
            kputs(rn ? rn : "*", ks_clear(&res->s));
1338
0
            return 0;
1339
0
        } else if (memcmp(str, "refid", 5) == 0) {
1340
0
            *end = str+5;
1341
0
            res->d = b->core.tid;
1342
0
            return 0;
1343
0
        }
1344
0
        break;
1345
1346
0
    case 's':
1347
0
        if (memcmp(str, "seq", 3) == 0) {
1348
0
            *end = str+3;
1349
0
            ks_clear(&res->s);
1350
0
            if (ks_resize(&res->s, b->core.l_qseq+1) < 0)
1351
0
                return -1;
1352
0
            nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq);
1353
0
            res->s.s[b->core.l_qseq] = 0;
1354
0
            res->s.l = b->core.l_qseq;
1355
0
            res->is_str = 1;
1356
0
            return 0;
1357
0
        } else if (memcmp(str, "sclen", 5) == 0) {
1358
0
            int sclen = 0;
1359
0
            uint32_t *cigar = bam_get_cigar(b);
1360
0
            int ncigar = b->core.n_cigar;
1361
0
            int left = 0;
1362
1363
            // left
1364
0
            if (ncigar > 0
1365
0
                && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
1366
0
                left = 0, sclen += bam_cigar_oplen(cigar[0]);
1367
0
            else if (ncigar > 1
1368
0
                     && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP
1369
0
                     && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP)
1370
0
                left = 1, sclen += bam_cigar_oplen(cigar[1]);
1371
1372
            // right
1373
0
            if (ncigar-1 > left
1374
0
                && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP)
1375
0
                sclen += bam_cigar_oplen(cigar[ncigar-1]);
1376
0
            else if (ncigar-2 > left
1377
0
                     && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP
1378
0
                     && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP)
1379
0
                sclen += bam_cigar_oplen(cigar[ncigar-2]);
1380
1381
0
            *end = str+5;
1382
0
            res->d = sclen;
1383
0
            return 0;
1384
0
        }
1385
0
        break;
1386
1387
0
    case 't':
1388
0
        if (memcmp(str, "tlen", 4) == 0) {
1389
0
            *end = str+4;
1390
0
            res->d = b->core.isize;
1391
0
            return 0;
1392
0
        }
1393
0
        break;
1394
1395
0
    case '[':
1396
0
        if (*str == '[' && str[1] && str[2] && str[3] == ']') {
1397
            /* aux tags */
1398
0
            *end = str+4;
1399
1400
0
            uint8_t *aux = bam_aux_get(b, str+1);
1401
0
            if (aux) {
1402
                // we define the truth of a tag to be its presence, even if 0.
1403
0
                res->is_true = 1;
1404
0
                switch (*aux) {
1405
0
                case 'Z':
1406
0
                case 'H':
1407
0
                    res->is_str = 1;
1408
0
                    kputs((char *)aux+1, ks_clear(&res->s));
1409
0
                    break;
1410
1411
0
                case 'A':
1412
0
                    res->is_str = 1;
1413
0
                    kputsn((char *)aux+1, 1, ks_clear(&res->s));
1414
0
                    break;
1415
1416
0
                case 'i': case 'I':
1417
0
                case 's': case 'S':
1418
0
                case 'c': case 'C':
1419
0
                    res->is_str = 0;
1420
0
                    res->d = bam_aux2i(aux);
1421
0
                    break;
1422
1423
0
                case 'f':
1424
0
                case 'd':
1425
0
                    res->is_str = 0;
1426
0
                    res->d = bam_aux2f(aux);
1427
0
                    break;
1428
1429
0
                default:
1430
0
                    hts_log_error("Aux type '%c not yet supported by filters",
1431
0
                                  *aux);
1432
0
                    return -1;
1433
0
                }
1434
0
                return 0;
1435
1436
0
            } else {
1437
                // hence absent tags are always false (and strings)
1438
0
                res->is_str = 1;
1439
0
                res->s.l = 0;
1440
0
                res->d = 0;
1441
0
                res->is_true = 0;
1442
0
                return 0;
1443
0
            }
1444
0
        }
1445
0
        break;
1446
0
    }
1447
1448
    // All successful matches in switch should return 0.
1449
    // So if we didn't match, it's a parse error.
1450
0
    return -1;
1451
0
}
1452
1453
// Returns 1 when accepted by the filter, 0 if not, -1 on error.
1454
int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt)
1455
0
{
1456
0
    hb_pair hb = {h, b};
1457
0
    hts_expr_val_t res = HTS_EXPR_VAL_INIT;
1458
0
    if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) {
1459
0
        hts_log_error("Couldn't process filter expression");
1460
0
        hts_expr_val_free(&res);
1461
0
        return -1;
1462
0
    }
1463
1464
0
    int t = res.is_true;
1465
0
    hts_expr_val_free(&res);
1466
1467
0
    return t;
1468
0
}
1469
1470
// Iterator read callback for CRAM: fetches records until one passes the
// file's filter (if any), reporting its tid, start and end position.
// Returns the cram_get_bam_seq() result for the accepted record, -1 at end
// of file, or -2 on error.
static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end)
{
    htsFile *fp = fpv;
    bam1_t *b = bv;

    for (;;) {
        int ret = cram_get_bam_seq(fp->fp.cram, &b);
        if (ret < 0)
            return cram_eof(fp->fp.cram) ? -1 : -2;

        if (bam_tag2cigar(b, 1, 1) < 0)
            return -2;

        *tid = b->core.tid;
        *beg = b->core.pos;
        *end = bam_endpos(b);

        // No filter installed: every record is accepted.
        if (!fp->filter)
            return ret;

        int pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter);
        if (pass_filter < 0)
            return -2;
        if (pass_filter != 0)
            return ret;
        // Rejected by the filter; try the next record.
    }
}
1499
1500
static int cram_pseek(void *fp, int64_t offset, int whence)
1501
0
{
1502
0
    cram_fd *fd =  (cram_fd *)fp;
1503
1504
0
    if ((0 != cram_seek(fd, offset, SEEK_SET))
1505
0
     && (0 != cram_seek(fd, offset - fd->first_container, SEEK_CUR)))
1506
0
        return -1;
1507
1508
0
    fd->curr_position = offset;
1509
1510
0
    if (fd->ctr) {
1511
0
        cram_free_container(fd->ctr);
1512
0
        if (fd->ctr_mt && fd->ctr_mt != fd->ctr)
1513
0
            cram_free_container(fd->ctr_mt);
1514
1515
0
        fd->ctr = NULL;
1516
0
        fd->ctr_mt = NULL;
1517
0
        fd->ooc = 0;
1518
0
    }
1519
1520
0
    return 0;
1521
0
}
1522
1523
/*
1524
 * cram_ptell is a pseudo-tell function, because it matches the position of the disk cursor only
1525
 *   after a fresh seek call. Otherwise it indicates that the read takes place inside the buffered
1526
 *   container previously fetched. It was designed like this to integrate with the functionality
1527
 *   of the iterator stepping logic.
1528
 */
1529
1530
static int64_t cram_ptell(void *fp)
1531
0
{
1532
0
    cram_fd *fd = (cram_fd *)fp;
1533
0
    cram_container *c;
1534
0
    cram_slice *s;
1535
0
    int64_t ret = -1L;
1536
1537
0
    if (fd) {
1538
0
        if ((c = fd->ctr) != NULL) {
1539
0
            if ((s = c->slice) != NULL && s->max_rec) {
1540
0
                if ((c->curr_slice + s->curr_rec/s->max_rec) >= (c->max_slice + 1))
1541
0
                    fd->curr_position += c->offset + c->length;
1542
0
            }
1543
0
        }
1544
0
        ret = fd->curr_position;
1545
0
    }
1546
1547
0
    return ret;
1548
0
}
1549
1550
static int bam_pseek(void *fp, int64_t offset, int whence)
1551
0
{
1552
0
    BGZF *fd = (BGZF *)fp;
1553
1554
0
    return bgzf_seek(fd, offset, whence);
1555
0
}
1556
1557
static int64_t bam_ptell(void *fp)
1558
0
{
1559
0
    BGZF *fd = (BGZF *)fp;
1560
0
    if (!fd)
1561
0
        return -1L;
1562
1563
0
    return bgzf_tell(fd);
1564
0
}
1565
1566
1567
1568
// Loads the index for fp according to its format.
// BAM and SAM go through hts_idx_load3().  For CRAM the index is loaded into
// the cram_fd and a small stand-in object pointing at that cram_fd is
// returned instead.  Returns NULL on failure or for any other format.
static hts_idx_t *index_load(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    if (fp->format.format == bam || fp->format.format == sam)
        return hts_idx_load3(fn, fnidx, HTS_FMT_BAI, flags);

    if (fp->format.format != cram)
        return NULL; // TODO Would use tbx_index_load if it returned hts_idx_t

    if (cram_index_load(fp->fp.cram, fn, fnidx) < 0)
        return NULL;

    // Cons up a fake "index" just pointing at the associated cram_fd:
    hts_cram_idx_t *fake = malloc(sizeof *fake);
    if (!fake)
        return NULL;

    fake->fmt = HTS_FMT_CRAI;
    fake->cram = fp->fp.cram;
    return (hts_idx_t *) fake;
}
1590
1591
// Loads the index for fp, passing every argument through to index_load().
hts_idx_t *sam_index_load3(htsFile *fp, const char *fn, const char *fnidx, int flags)
{
    hts_idx_t *loaded = index_load(fp, fn, fnidx, flags);
    return loaded;
}
1595
1596
0
// Loads the index for fp from fnidx, using the HTS_IDX_SAVE_REMOTE flag.
hts_idx_t *sam_index_load2(htsFile *fp, const char *fn, const char *fnidx)
{
    const int flags = HTS_IDX_SAVE_REMOTE;

    return index_load(fp, fn, fnidx, flags);
}
1599
1600
// Loads the index for fp with no explicit index file name (NULL), using the
// HTS_IDX_SAVE_REMOTE flag.
hts_idx_t *sam_index_load(htsFile *fp, const char *fn)
{
    const char *default_fnidx = NULL;

    return index_load(fp, fn, default_fnidx, HTS_IDX_SAVE_REMOTE);
}
1604
1605
// Creates an iterator over a CRAM file for the given tid/beg/end.
// For a real tid, HTS_IDX_NOCOOR or HTS_IDX_START the range is installed on
// the cram_fd via CRAM_OPT_RANGE.  HTS_IDX_REST and HTS_IDX_NONE need no
// range.  Any other tid is not implemented and aborts.
// Returns NULL on allocation failure or if the range cannot be set.
static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    hts_itr_t *it = (hts_itr_t *) calloc(1, sizeof(*it));
    if (!it)
        return NULL;

    // Cons up a dummy iterator for which hts_itr_next() will simply invoke
    // the readrec function:
    it->is_cram = 1;
    it->read_rest = 1;
    it->off = NULL;
    it->bins.a = NULL;
    it->readrec = readrec;

    if (tid >= 0 || tid == HTS_IDX_NOCOOR || tid == HTS_IDX_START) {
        cram_range r = { tid, beg+1, end };
        int ret = cram_set_option(cidx->cram, CRAM_OPT_RANGE, &r);

        it->curr_off = 0;
        // The following fields are not required by hts_itr_next(), but are
        // filled in in case user code wants to look at them.
        it->tid = tid;
        it->beg = beg;
        it->end = end;

        if (ret == -2) {
            // No data vs this ref, so mark iterator as completed.
            // Same as HTS_IDX_NONE.
            it->finished = 1;
        } else if (ret != 0) {
            free(it);
            return NULL;
        }
    } else if (tid == HTS_IDX_REST) {
        it->curr_off = 0;
    } else if (tid == HTS_IDX_NONE) {
        it->curr_off = 0;
        it->finished = 1;
    } else {
        hts_log_error("Query with tid=%d not implemented for CRAM files", tid);
        abort();
    }

    return it;
}
1661
1662
// Creates an iterator for tid/beg/end.  With no index a plain iterator using
// sam_readrec_rest is made; a CRAM index goes to cram_itr_query() and any
// other index to hts_itr_query().
hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end)
{
    if (idx == NULL)
        return hts_itr_query(NULL, tid, beg, end, sam_readrec_rest);

    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;
    if (cidx->fmt != HTS_FMT_CRAI)
        return hts_itr_query(idx, tid, beg, end, sam_readrec);

    return cram_itr_query(idx, tid, beg, end, sam_readrec);
}
1672
1673
static int cram_name2id(void *fdv, const char *ref)
1674
0
{
1675
0
    cram_fd *fd = (cram_fd *) fdv;
1676
0
    return sam_hdr_name2tid(fd->header, ref);
1677
0
}
1678
1679
/*
 * Build an iterator from a textual region specification.
 *
 * idx    - index to query; must not be NULL
 * hdr    - header handed to bam_name2id for reference-name lookups
 * region - region string, passed through to hts_itr_querys()
 *
 * Returns the iterator produced by hts_itr_querys(), or NULL when no index
 * was supplied.  (Previously a NULL index was dereferenced to read its
 * format tag; the sibling sam_itr_regarray()/sam_itr_regions() already
 * reject a NULL index the same way.)
 */
hts_itr_t *sam_itr_querys(const hts_idx_t *idx, sam_hdr_t *hdr, const char *region)
{
    const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx;

    if (!cidx)
        return NULL;

    // CRAI indices need the CRAM-specific iterator constructor.
    return hts_itr_querys(idx, region, (hts_name2id_f)(bam_name2id), hdr,
                          cidx->fmt == HTS_FMT_CRAI ? cram_itr_query : hts_itr_query,
                          sam_readrec);
}
1686
1687
/*
 * Build a multi-region iterator from an array of region strings.
 *
 * The strings are first converted to a region list with
 * hts_reglist_create(), then handed to hts_itr_regions() together with the
 * CRAM or BAM flavoured callbacks, depending on the index format.
 *
 * Returns NULL if idx or hdr is NULL, if the region list cannot be created,
 * or if hts_itr_regions() fails (in which case the region list is freed
 * here).
 */
hts_itr_t *sam_itr_regarray(const hts_idx_t *idx, sam_hdr_t *hdr, char **regarray, unsigned int regcount)
{
    if (idx == NULL || hdr == NULL)
        return NULL;

    const hts_cram_idx_t *cram_view = (const hts_cram_idx_t *) idx;
    hts_reglist_t *regions;
    int n_regions = 0;
    hts_itr_t *iter;

    if (cram_view->fmt != HTS_FMT_CRAI) {
        // BAM-style index: names are resolved through the SAM header.
        regions = hts_reglist_create(regarray, regcount, &n_regions, hdr, (hts_name2id_f)(bam_name2id));
        if (regions == NULL)
            return NULL;
        iter = hts_itr_regions(idx, regions, n_regions, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
    } else {
        // CRAI index: names are resolved through the CRAM descriptor.
        regions = hts_reglist_create(regarray, regcount, &n_regions, cram_view->cram, cram_name2id);
        if (regions == NULL)
            return NULL;
        iter = hts_itr_regions(idx, regions, n_regions, cram_name2id, cram_view->cram,
                   hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
    }

    // On failure the region list is still ours to release.
    if (iter == NULL)
        hts_reglist_free(regions, n_regions);

    return iter;
}
1716
1717
/*
 * Build a multi-region iterator from an already constructed region list.
 *
 * Returns NULL when idx, hdr or reglist is NULL; otherwise the result of
 * hts_itr_regions() called with the CRAM or BAM callbacks that match the
 * index format.
 */
hts_itr_t *sam_itr_regions(const hts_idx_t *idx, sam_hdr_t *hdr, hts_reglist_t *reglist, unsigned int regcount)
{
    if (idx == NULL || hdr == NULL || reglist == NULL)
        return NULL;

    const hts_cram_idx_t *cram_view = (const hts_cram_idx_t *) idx;

    if (cram_view->fmt != HTS_FMT_CRAI) {
        return hts_itr_regions(idx, reglist, regcount, (hts_name2id_f)(bam_name2id), hdr,
                   hts_itr_multi_bam, sam_readrec, bam_pseek, bam_ptell);
    }

    return hts_itr_regions(idx, reglist, regcount, cram_name2id, cram_view->cram,
               hts_itr_multi_cram, cram_readrec, cram_pseek, cram_ptell);
}
1731
1732
/**********************
1733
 *** SAM header I/O ***
1734
 **********************/
1735
1736
#include "htslib/kseq.h"
1737
#include "htslib/kstring.h"
1738
1739
/*
 * Create a new header and add the l_text bytes of header text to it via
 * sam_hdr_add_lines().
 *
 * Returns the new header, or NULL if allocation or adding the lines fails
 * (the partially built header is destroyed in that case).
 */
sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text)
{
    sam_hdr_t *hdr = sam_hdr_init();
    if (hdr == NULL)
        return NULL;

    if (sam_hdr_add_lines(hdr, text, l_text) == 0)
        return hdr;

    sam_hdr_destroy(hdr);
    return NULL;
}
1751
1752
3.07M
/*
 * Check that a header line begins with one of the recognised record types.
 *
 * "@HD", "@SQ", "@RG" and "@PG" must be followed by a tab; "@CO" is accepted
 * whatever follows it.  Returns 1 for a recognised prefix and 0 otherwise.
 * Characters are examined left to right and checking stops at the first
 * mismatch, so a short NUL-terminated string is never over-read.
 */
static int valid_sam_header_type(const char *s) {
    char second;

    if (s[0] != '@')
        return 0;

    if (s[1] == 'H')
        second = 'D';
    else if (s[1] == 'S')
        second = 'Q';
    else if (s[1] == 'R' || s[1] == 'P')
        second = 'G';
    else if (s[1] == 'C')
        return s[2] == 'O';     // comment lines need no tab
    else
        return 0;

    return s[2] == second && s[3] == '\t';
}
1767
1768
// Minimal sanitisation of a header to ensure.
1769
// - null terminated string.
1770
// - all lines start with @ (also implies no blank lines).
1771
//
1772
// Much more could be done, but currently is not, including:
1773
// - checking header types are known (HD, SQ, etc).
1774
// - syntax (eg checking tab separated fields).
1775
// - validating n_targets matches @SQ records.
1776
// - validating target lengths against @SQ records.
1777
782
static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) {
1778
782
    if (!h)
1779
18
        return NULL;
1780
1781
    // Special case for empty headers.
1782
764
    if (h->l_text == 0)
1783
252
        return h;
1784
1785
512
    size_t i;
1786
512
    unsigned int lnum = 0;
1787
512
    char *cp = h->text, last = '\n';
1788
118M
    for (i = 0; i < h->l_text; i++) {
1789
        // NB: l_text excludes terminating nul.  This finds early ones.
1790
118M
        if (cp[i] == 0)
1791
256
            break;
1792
1793
        // Error on \n[^@], including duplicate newlines
1794
118M
        if (last == '\n') {
1795
2.71M
            lnum++;
1796
2.71M
            if (cp[i] != '@') {
1797
0
                hts_log_error("Malformed SAM header at line %u", lnum);
1798
0
                sam_hdr_destroy(h);
1799
0
                return NULL;
1800
0
            }
1801
2.71M
        }
1802
1803
118M
        last = cp[i];
1804
118M
    }
1805
1806
512
    if (i < h->l_text) { // Early nul found.  Complain if not just padding.
1807
256
        size_t j = i;
1808
1.14k
        while (j < h->l_text && cp[j] == '\0') j++;
1809
256
        if (j < h->l_text)
1810
256
            hts_log_warning("Unexpected NUL character in header. Possibly truncated");
1811
256
    }
1812
1813
    // Add trailing newline and/or trailing nul if required.
1814
512
    if (last != '\n') {
1815
250
        hts_log_warning("Missing trailing newline on SAM header. Possibly truncated");
1816
1817
250
        if (h->l_text < 2 || i >= h->l_text - 2) {
1818
31
            if (h->l_text >= SIZE_MAX - 2) {
1819
0
                hts_log_error("No room for extra newline");
1820
0
                sam_hdr_destroy(h);
1821
0
                return NULL;
1822
0
            }
1823
1824
31
            cp = realloc(h->text, (size_t) h->l_text+2);
1825
31
            if (!cp) {
1826
0
                sam_hdr_destroy(h);
1827
0
                return NULL;
1828
0
            }
1829
31
            h->text = cp;
1830
31
        }
1831
250
        cp[i++] = '\n';
1832
1833
        // l_text may be larger already due to multiple nul padding
1834
250
        if (h->l_text < i)
1835
0
            h->l_text = i;
1836
250
        cp[h->l_text] = '\0';
1837
250
    }
1838
1839
512
    return h;
1840
512
}
1841
1842
212
/*
 * Emit the two-part warning used when a SAM file contains a log message
 * from a known tool: first what happened, then the advice on avoiding it.
 */
static void known_stderr(const char *tool, const char *advice)
{
    hts_log_warning("SAM file corrupted by embedded %s error/log message", tool);
    hts_log_warning("%s", advice);
}
1846
1847
1.83k
/*
 * Look for telltale fragments of bwa / minimap2 log output in a line and,
 * on the first match, warn with tool-specific advice.  At most one warning
 * pair is produced per call; lines without a known fragment are ignored.
 */
static void warn_if_known_stderr(const char *line) {
    static const struct {
        const char *fragment;   // text searched for in the line
        const char *tool;       // tool named in the warning
        const char *advice;     // how to avoid the problem
    } known[] = {
        { "M::bwa_idx_load_from_disk", "bwa",
          "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`" },
        { "M::mem_pestat", "bwa",
          "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`" },
        { "loaded/built the index", "minimap2",
          "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`" },
    };
    size_t i;

    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
        if (strstr(line, known[i].fragment) != NULL) {
            known_stderr(known[i].tool, known[i].advice);
            return;
        }
    }
}
1855
1856
501
/*
 * Read the '@' header lines from a SAM text stream and build a header.
 *
 * Each header line is appended to the header text.  @SQ lines are also
 * parsed for SN:/LN: so that the target name/length arrays can be filled
 * in.  If the stream has no @SQ lines and fp->fn_aux is set, the file it
 * names is read as tab-separated "name<TAB>length" lines and @SQ lines are
 * synthesised from it.
 *
 * While building, dictionary d maps a sequence name to
 * (index << 32 | length), with lengths >= UINT32_MAX stored as UINT32_MAX
 * and the true length stashed in long_refs (which ends up as h->sdict).
 *
 * On success the header is stored in fp->bam_header with ref_count 1 and
 * returned.  On any failure NULL is returned and everything allocated here
 * is released.
 */
static sam_hdr_t *sam_hdr_create(htsFile* fp) {
    kstring_t str = { 0, 0, NULL };
    khint_t k;
    sam_hdr_t* h = sam_hdr_init();
    const char *q, *r;
    char* sn = NULL;
    khash_t(s2i) *d = kh_init(s2i);
    khash_t(s2i) *long_refs = NULL;
    if (!h || !d)
        goto error;

    int ret, has_SQ = 0;
    int next_c = '@';
    while (next_c == '@' && (ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) >= 0) {
        if (fp->line.s[0] != '@')
            break;

        if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
            has_SQ = 1;
            hts_pos_t ln = -1;
            // Scan the tab-separated fields of the @SQ line for SN: and LN:
            for (q = fp->line.s + 4;; ++q) {
                if (strncmp(q, "SN:", 3) == 0) {
                    q += 3;
                    for (r = q;*r != '\t' && *r != '\n' && *r != '\0';++r);

                    if (sn) {
                        hts_log_warning("SQ header line has more than one SN: tag");
                        free(sn);
                    }
                    // calloc gives r - q bytes of name plus a terminating nul
                    sn = (char*)calloc(r - q + 1, 1);
                    if (!sn)
                        goto error;

                    strncpy(sn, q, r - q);
                    q = r;
                } else {
                    if (strncmp(q, "LN:", 3) == 0)
                        ln = strtoll(q + 3, (char**)&q, 10);
                }

                // Skip to the end of this field
                while (*q != '\t' && *q != '\n' && *q != '\0')
                    ++q;
                if (*q == '\0' || *q == '\n')
                    break;
            }
            if (sn) {
                if (ln >= 0) {
                    int absent;
                    k = kh_put(s2i, d, sn, &absent);
                    if (absent < 0)
                        goto error;

                    if (!absent) {
                        hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn);
                        free(sn);
                    } else {
                        // The dictionary now owns the name
                        sn = NULL;
                        if (ln >= UINT32_MAX) {
                            // Stash away ref length that
                            // doesn't fit in target_len array
                            khint_t k2;
                            if (!long_refs) {
                                long_refs = kh_init(s2i);
                                if (!long_refs)
                                    goto error;
                            }
                            k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
                            if (absent < 0)
                                goto error;
                            kh_val(long_refs, k2) = ln;
                            kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
                                            | UINT32_MAX);
                        } else {
                            kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
                        }
                    }
                } else {
                    hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
                    warn_if_known_stderr(fp->line.s);
                    free(sn);
                }
            } else {
                hts_log_warning("Ignored @SQ line with missing SN: tag");
                warn_if_known_stderr(fp->line.s);
            }
            // Either freed above or handed over to the dictionary
            sn = NULL;
        }
        else if (!valid_sam_header_type(fp->line.s)) {
            hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO");
            warn_if_known_stderr(fp->line.s);
            goto error;
        }

        if (kputsn(fp->line.s, fp->line.l, &str) < 0)
            goto error;

        if (kputc('\n', &str) < 0)
            goto error;

        // Peek at the next byte to see whether another header line follows
        if (fp->is_bgzf) {
            next_c = bgzf_peek(fp->fp.bgzf);
        } else {
            unsigned char nc;
            ssize_t pret = hpeek(fp->fp.hfile, &nc, 1);
            next_c = pret > 0 ? nc : pret - 1;
        }
        if (next_c < -1)
            goto error;
    }
    if (next_c != '@')
        fp->line.l = 0;

    if (ret < -1)
        goto error;

    if (!has_SQ && fp->fn_aux) {
        kstring_t line = { 0, 0, NULL };

        /* The reference index (.fai) is actually needed here */
        char *fai_fn = fp->fn_aux;
        char *fn_delim = strstr(fp->fn_aux, HTS_IDX_DELIM);
        if (fn_delim)
            fai_fn = fn_delim + strlen(HTS_IDX_DELIM);

        hFILE* f = hopen(fai_fn, "r");
        int e = 0, absent;
        if (f == NULL)
            goto error;

        while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
            char* tab = strchr(line.s, '\t');
            hts_pos_t ln;

            if (tab == NULL)
                continue;

            sn = (char*)calloc(tab-line.s+1, 1);
            if (!sn) {
                e = 1;
                break;
            }
            memcpy(sn, line.s, tab-line.s);
            k = kh_put(s2i, d, sn, &absent);
            if (absent < 0) {
                e = 1;
                break;
            }

            ln = strtoll(tab, NULL, 10);

            if (!absent) {
                hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn);
                free(sn);
                sn = NULL;
            } else {
                // The dictionary now owns the name
                sn = NULL;
                if (ln >= UINT32_MAX) {
                    // Stash away ref length that
                    // doesn't fit in target_len array
                    khint_t k2;
                    int absent = -1;
                    if (!long_refs) {
                        long_refs = kh_init(s2i);
                        if (!long_refs) {
                            e = 1;
                            break;
                        }
                    }
                    k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent);
                    if (absent < 0) {
                         e = 1;
                         break;
                    }
                    kh_val(long_refs, k2) = ln;
                    kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
                                    | UINT32_MAX);
                } else {
                    kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
                }
                has_SQ = 1;
            }

            // Synthesise the matching @SQ line in the header text
            e |= kputs("@SQ\tSN:", &str) < 0;
            e |= kputsn(line.s, tab - line.s, &str) < 0;
            e |= kputs("\tLN:", &str) < 0;
            e |= kputll(ln, &str) < 0;
            e |= kputc('\n', &str) < 0;
            if (e)
                break;
        }

        ks_free(&line);
        if (hclose(f) != 0) {
            hts_log_error("Error on closing %s", fai_fn);
            e = 1;
        }
        if (e)
            goto error;
    }

    if (has_SQ) {
        // Populate the targets array
        h->n_targets = kh_size(d);

        h->target_name = (char**) malloc(sizeof(char*) * h->n_targets);
        if (!h->target_name) {
            h->n_targets = 0;
            goto error;
        }

        h->target_len = (uint32_t*) malloc(sizeof(uint32_t) * h->n_targets);
        if (!h->target_len) {
            h->n_targets = 0;
            goto error;
        }

        for (k = kh_begin(d); k != kh_end(d); ++k) {
            if (!kh_exist(d, k))
                continue;

            // High 32 bits: target index; low 32 bits: (clamped) length
            h->target_name[kh_val(d, k) >> 32] = (char*) kh_key(d, k);
            h->target_len[kh_val(d, k) >> 32] = kh_val(d, k) & 0xffffffffUL;
            kh_val(d, k) >>= 32;
        }
    }

    // Repurpose sdict to hold any references longer than UINT32_MAX
    h->sdict = long_refs;

    kh_destroy(s2i, d);

    if (str.l == 0)
        kputsn("", 0, &str);
    h->l_text = str.l;
    h->text = ks_release(&str);
    fp->bam_header = sam_hdr_sanitise(h);
    // sam_hdr_sanitise() destroys the header itself when it fails, and d
    // has already been destroyed, so return directly rather than going
    // through the error label (which would free them a second time).
    if (!fp->bam_header)
        return NULL;
    fp->bam_header->ref_count = 1;

    return fp->bam_header;

 error:
    // The names in d are only ours to free if they were never handed over
    // to a fully populated targets array.
    if (h && d && (!h->target_name || !h->target_len)) {
        for (k = kh_begin(d); k != kh_end(d); ++k)
            if (kh_exist(d, k)) free((void *)kh_key(d, k));
    }
    sam_hdr_destroy(h);
    ks_free(&str);
    kh_destroy(s2i, d);
    kh_destroy(s2i, long_refs);
    if (sn) free(sn);
    return NULL;
}
2108
2109
/*
 * Read the header from an open file, dispatching on the file's format.
 *
 *   bam          - bam_hdr_read(), then sanitised
 *   cram         - a duplicate of the CRAM descriptor's header, sanitised
 *   sam          - parsed from the text stream by sam_hdr_create()
 *   fastq/fasta  - a freshly initialised header
 *
 * Returns NULL with errno set to EINVAL for a NULL fp, EPIPE for
 * empty_format and EFTYPE for any other format.  The per-format readers
 * may also return NULL.
 */
sam_hdr_t *sam_hdr_read(htsFile *fp)
{
    if (fp == NULL) {
        errno = EINVAL;
        return NULL;
    }

    if (fp->format.format == bam)
        return sam_hdr_sanitise(bam_hdr_read(fp->fp.bgzf));

    if (fp->format.format == cram)
        return sam_hdr_sanitise(sam_hdr_dup(fp->fp.cram->header));

    if (fp->format.format == sam)
        return sam_hdr_create(fp);

    if (fp->format.format == fastq_format || fp->format.format == fasta_format)
        return sam_hdr_init();

    // Nothing readable: distinguish an empty stream from an unknown type.
    if (fp->format.format == empty_format)
        errno = EPIPE;
    else
        errno = EFTYPE;
    return NULL;
}
2139
2140
/*
 * Write header h to fp in the file's output format.
 *
 * binary_format and text_format are first resolved to bam and sam
 * respectively (updating fp->format).  For SAM output the text comes from
 * the parsed records (h->hrecs) when present, otherwise from h->text; in
 * the latter case, if the text holds no "@SQ\t" line at the start of a
 * line, @SQ lines are generated from the target name/length arrays.
 * FASTA/FASTQ have no file header, so nothing is written for them.
 *
 * Returns 0 on success and -1 on failure.  errno is set to EINVAL for NULL
 * arguments and EBADF for an unsupported format.
 */
int sam_hdr_write(htsFile *fp, const sam_hdr_t *h)
{
    if (!fp || !h) {
        errno = EINVAL;
        return -1;
    }

    switch (fp->format.format) {
    case binary_format:
        fp->format.category = sequence_data;
        fp->format.format = bam;
        /* fall-through */
    case bam:
        if (bam_hdr_write(fp->fp.bgzf, h) < 0) return -1;
        break;

    case cram: {
        cram_fd *fd = fp->fp.cram;
        if (cram_set_header2(fd, h) < 0) return -1;
        if (fp->fn_aux)
            cram_load_reference(fd, fp->fn_aux);
        if (cram_write_SAM_hdr(fd, fd->header) < 0) return -1;
        }
        break;

    case text_format:
        fp->format.category = sequence_data;
        fp->format.format = sam;
        /* fall-through */
    case sam: {
        if (!h->hrecs && !h->text)
            return 0;
        char *text;
        kstring_t hdr_ks = { 0, 0, NULL };
        size_t l_text;
        ssize_t bytes;
        int r = 0, no_sq = 0;

        if (h->hrecs) {
            if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) {
                // Release whatever was built before the failure.
                free(hdr_ks.s);
                return -1;
            }
            text = hdr_ks.s;
            l_text = hdr_ks.l;
        } else {
            // Search for "@SQ\t" at the very start of the text or
            // immediately after a newline.
            const char *p = NULL;
            do {
                const char *q = p == NULL ? h->text : p + 4;
                p = strstr(q, "@SQ\t");
            } while (!(p == NULL || p == h->text || *(p - 1) == '\n'));
            no_sq = p == NULL;
            text = h->text;
            l_text = h->l_text;
        }

        if (fp->is_bgzf) {
            bytes = bgzf_write(fp->fp.bgzf, text, l_text);
        } else {
            bytes = hwrite(fp->fp.hfile, text, l_text);
        }
        // hdr_ks.s is NULL when h->text was written, so this is a no-op then
        free(hdr_ks.s);
        if (bytes != l_text)
            return -1;

        if (no_sq) {
            // Generate one @SQ line per target, reusing fp->line as scratch
            int i;
            for (i = 0; i < h->n_targets; ++i) {
                fp->line.l = 0;
                r |= kputsn("@SQ\tSN:", 7, &fp->line) < 0;
                r |= kputs(h->target_name[i], &fp->line) < 0;
                r |= kputsn("\tLN:", 4, &fp->line) < 0;
                r |= kputw(h->target_len[i], &fp->line) < 0;
                r |= kputc('\n', &fp->line) < 0;
                if (r != 0)
                    return -1;

                if (fp->is_bgzf) {
                    bytes = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l);
                } else {
                    bytes = hwrite(fp->fp.hfile, fp->line.s, fp->line.l);
                }
                if (bytes != fp->line.l)
                    return -1;
            }
        }
        if (fp->is_bgzf) {
            if (bgzf_flush(fp->fp.bgzf) != 0) return -1;
        } else {
            if (hflush(fp->fp.hfile) != 0) return -1;
        }
        }
        break;

    case fastq_format:
    case fasta_format:
        // Nothing to output; FASTQ has no file headers.
        break;

    default:
        errno = EBADF;
        return -1;
    }
    return 0;
}
2243
2244
/*
 * Text-only implementation of sam_hdr_change_HD(), operating directly on
 * h->text.
 *
 * key - tag to set or remove; the length arithmetic below allows for a
 *       two-character key
 * val - new value, or NULL to delete the tag
 *
 * If the text starts with an @HD line, the key:val field on it is replaced,
 * appended or (val == NULL) removed.  Otherwise a new
 * "@HD\tVN:<SAM_FORMAT_VERSION>" line is prepended, with the key:val field
 * if val is given.  h->text is replaced by a newly allocated string and
 * h->l_text updated.
 *
 * Returns 0 on success (including "value already as requested") and -1 on
 * bad arguments, a missing newline after @HD, length overflow or
 * allocation failure.
 */
static int old_sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    char *p, *q, *beg = NULL, *end = NULL, *newtext;
    size_t new_l_text;
    if (!h || !key)
        return -1;

    if (h->l_text > 3) {
        if (strncmp(h->text, "@HD", 3) == 0) { //@HD line exists
            if ((p = strchr(h->text, '\n')) == 0) return -1;
            *p = '\0'; // for strstr call

            char tmp[5] = { '\t', key[0], key[0] ? key[1] : '\0', ':', '\0' };

            if ((q = strstr(h->text, tmp)) != 0) { // key exists
                *p = '\n'; // change back

                // mark the key:val
                beg = q;
                for (q += 4; *q != '\n' && *q != '\t'; ++q);
                end = q;

                if (val && (strncmp(beg + 4, val, end - beg - 4) == 0)
                    && strlen(val) == end - beg - 4)
                     return 0; // val is the same, no need to change

            } else {
                beg = end = p;
                *p = '\n';
            }
        }
    }
    if (beg == NULL) { // no @HD
        // A header with no text at all has h->text == NULL; passing that
        // to "%s" would be undefined behaviour, so substitute "".
        const char *old_text = h->text ? h->text : "";

        new_l_text = h->l_text;
        if (new_l_text > SIZE_MAX - strlen(SAM_FORMAT_VERSION) - 9)
            return -1;
        // "@HD\tVN:" (7) + version + "\n" (1)
        new_l_text += strlen(SAM_FORMAT_VERSION) + 8;
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            // "\t" + two-character key + ":" (4) + value
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val)
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\t%s:%s\n%s", SAM_FORMAT_VERSION, key, val, old_text);
        else
            snprintf(newtext, new_l_text + 1,
                    "@HD\tVN:%s\n%s", SAM_FORMAT_VERSION, old_text);
    } else { // has @HD but different or no key
        // Everything before beg plus everything from end onwards
        new_l_text = (beg - h->text) + (h->text + h->l_text - end);
        if (val) {
            if (new_l_text > SIZE_MAX - strlen(val) - 5)
                return -1;
            new_l_text += strlen(val) + 4;
        }
        newtext = (char*)malloc(new_l_text + 1);
        if (!newtext) return -1;

        if (val) {
            snprintf(newtext, new_l_text + 1, "%.*s\t%s:%s%s",
                    (int) (beg - h->text), h->text, key, val, end);
        } else { //delete key
            snprintf(newtext, new_l_text + 1, "%.*s%s",
                    (int) (beg - h->text), h->text, end);
        }
    }
    free(h->text);
    h->text = newtext;
    h->l_text = new_l_text;
    return 0;
}
2318
2319
2320
/*
 * Set (val != NULL) or remove (val == NULL) a tag on the @HD header line.
 *
 * Headers without parsed records (h->hrecs == NULL) are edited as raw text
 * by old_sam_hdr_change_HD().  Otherwise the change is made with
 * sam_hdr_update_line() / sam_hdr_remove_tag_id() and the header is then
 * rebuilt.
 *
 * Returns -1 on NULL h or key, or if the update/removal fails; otherwise
 * the result of the text edit or of sam_hdr_rebuild().
 */
int sam_hdr_change_HD(sam_hdr_t *h, const char *key, const char *val)
{
    int rc;

    if (h == NULL || key == NULL)
        return -1;

    if (h->hrecs == NULL)
        return old_sam_hdr_change_HD(h, key, val);

    rc = (val != NULL)
        ? sam_hdr_update_line(h, "HD", NULL, NULL, key, val, NULL)
        : sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key);
    if (rc != 0)
        return -1;

    return sam_hdr_rebuild(h);
}
2337
/**********************
2338
 *** SAM record I/O ***
2339
 **********************/
2340
2341
/*
 * Parse the values of a SAM "B" array aux field and append the binary
 * encoding ('B', element type, count, values) to b->data.
 *
 * type - element type letter (c, C, s, S, i, I or f)
 * n    - number of elements, used for the count field and space reservation
 * in   - start of the value text; each number is read starting one
 *        character after the current position, so every value is expected
 *        to be preceded by a single separator character
 * end  - on success, set to where parsing stopped
 * r    - end of the value text; parsing continues while the cursor is
 *        before it
 * b    - record being extended
 *
 * If any value does not fit the requested integer type, everything written
 * so far is discarded, the range of values is measured, and the array is
 * re-parsed with the narrowest type that the range allows.
 *
 * Returns 0 on success, -1 on an unknown type, allocation failure, or
 * values outside every supported range.
 */
static int sam_parse_B_vals(char type, uint32_t n, char *in, char **end,
                            char *r, bam1_t *b)
{
    const int saved_l_data = b->l_data;
    char *cur = in;
    int overflow = 0;

    const int32_t elem_size = aux_type2size(type);
    if (elem_size <= 0 || elem_size > 4) {
        hts_log_error("Unrecognized type B:%c", type);
        return -1;
    }

    // Ensure space for type + values
    const size_t payload = (size_t) n * (size_t) elem_size;
    if (payload / elem_size != n
        || possibly_expand_bam_data(b, payload + 2 + sizeof(uint32_t))) {
        hts_log_error("Out of memory");
        return -1;
    }

    b->data[b->l_data++] = 'B';
    b->data[b->l_data++] = type;
    i32_to_le(n, b->data + b->l_data);
    b->l_data += sizeof(uint32_t);

    // This ensures that the cursor always ends up at the next comma after
    // reading a number even if it's followed by junk.  It
    // prevents the possibility of trying to read more than n items.
#define skip_to_comma_(q) do { while (*(q) > '\t' && *(q) != ',') (q)++; } while (0)

    switch (type) {
    case 'c':
        while (cur < r) {
            *(b->data + b->l_data) = hts_str2int(cur + 1, &cur, 8, &overflow);
            b->l_data++;
            skip_to_comma_(cur);
        }
        break;

    case 'C':
        while (cur < r) {
            if (*cur != '-') {
                *(b->data + b->l_data) = hts_str2uint(cur + 1, &cur, 8, &overflow);
                b->l_data++;
            } else {
                overflow = 1;
            }
            skip_to_comma_(cur);
        }
        break;

    case 's':
        while (cur < r) {
            i16_to_le(hts_str2int(cur + 1, &cur, 16, &overflow), b->data + b->l_data);
            b->l_data += 2;
            skip_to_comma_(cur);
        }
        break;

    case 'S':
        while (cur < r) {
            if (*cur != '-') {
                u16_to_le(hts_str2uint(cur + 1, &cur, 16, &overflow), b->data + b->l_data);
                b->l_data += 2;
            } else {
                overflow = 1;
            }
            skip_to_comma_(cur);
        }
        break;

    case 'i':
        while (cur < r) {
            i32_to_le(hts_str2int(cur + 1, &cur, 32, &overflow), b->data + b->l_data);
            b->l_data += 4;
            skip_to_comma_(cur);
        }
        break;

    case 'I':
        while (cur < r) {
            if (*cur != '-') {
                u32_to_le(hts_str2uint(cur + 1, &cur, 32, &overflow), b->data + b->l_data);
                b->l_data += 4;
            } else {
                overflow = 1;
            }
            skip_to_comma_(cur);
        }
        break;

    case 'f':
        while (cur < r) {
            float_to_le(strtod(cur + 1, &cur), b->data + b->l_data);
            b->l_data += 4;
            skip_to_comma_(cur);
        }
        break;

    default:
        hts_log_error("Unrecognized type B:%c", type);
        return -1;
    }

    if (!overflow) {
        *end = cur;
        return 0;
    }

    // Given type was incorrect.  Try to rescue the situation.
    int64_t max = 0, min = 0, val;
    cur = in;
    overflow = 0;
    b->l_data = saved_l_data;

    // Find out what range of values is present
    while (cur < r) {
        val = hts_str2int(cur + 1, &cur, 64, &overflow);
        if (max < val) max = val;
        if (min > val) min = val;
        skip_to_comma_(cur);
    }

    // Retry with appropriate type
    if (!overflow) {
        char retry_type = '\0';

        if (min < 0) {
            if (min >= INT8_MIN && max <= INT8_MAX)
                retry_type = 'c';
            else if (min >= INT16_MIN && max <= INT16_MAX)
                retry_type = 's';
            else if (min >= INT32_MIN && max <= INT32_MAX)
                retry_type = 'i';
        } else {
            if (max < UINT8_MAX)
                retry_type = 'C';
            else if (max <= UINT16_MAX)
                retry_type = 'S';
            else if (max <= UINT32_MAX)
                retry_type = 'I';
        }

        if (retry_type != '\0')
            return sam_parse_B_vals(retry_type, n, in, end, r, b);
    }

    // If here then at least one of the values is too big to store
    hts_log_error("Numeric value in B array out of allowed range");
    return -1;
#undef skip_to_comma_
}
2473
2474
5.80k
/*
 * Parse the FLAG column of a SAM record.
 *
 * v        - start of the field
 * rv       - set to the first character not consumed
 * overflow - set to 1 when the value does not fit in 16 bits
 *
 * A field starting with 1-9 is read as decimal.  A lone "0" followed by a
 * tab is handled directly; any other field starting with '0' goes through
 * strtoul() with base 0 (hex or octal) and is capped at 65535.  Anything
 * else consumes nothing and yields 0.
 */
static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) {
    const char lead = *v;

    if (lead >= '1' && lead <= '9')
        return hts_str2uint(v, rv, 16, overflow);

    if (lead != '0') {
        // TODO implement symbolic flag letters
        *rv = v;
        return 0;
    }

    // handle single-digit "0" directly; otherwise it's hex or octal
    if (v[1] == '\t') {
        *rv = v+1;
        return 0;
    }

    unsigned long parsed = strtoul(v, rv, 0);
    if (parsed > 65535) {
        *overflow = 1;
        return 65535;
    }
    return parsed;
}
2493
2494
// Parse tag line and append to bam object b.
2495
// Shared by both SAM and FASTQ parsers.
2496
//
2497
// The difference between the two is how lenient we are to recognising
2498
// non-compliant strings.  The FASTQ parser glosses over arbitrary
2499
// non-SAM looking strings.
2500
static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient,
2501
5.70k
                            khash_t(tag) *tag_whitelist) {
2502
5.70k
    int overflow = 0;
2503
5.70k
    int checkpoint;
2504
5.70k
    char logbuf[40];
2505
5.70k
    char *q = start, *p = end;
2506
2507
5.70k
#define _parse_err(cond, ...)                   \
2508
5.98M
    do {                                        \
2509
14.0M
        if (cond) {                             \
2510
39
            if (lenient) {                      \
2511
0
                while (q < p && !isspace_c(*q))   \
2512
0
                    q++;                        \
2513
0
                while (q < p && isspace_c(*q))    \
2514
0
                    q++;                        \
2515
0
                b->l_data = checkpoint;         \
2516
0
                goto loop;                      \
2517
39
            } else {                            \
2518
39
                hts_log_error(__VA_ARGS__);     \
2519
39
                goto err_ret;                   \
2520
39
            }                                   \
2521
39
        }                                       \
2522
5.98M
    } while (0)
2523
2524
5.22M
    while (q < p) loop: {
2525
5.22M
        char type;
2526
5.22M
        checkpoint = b->l_data;
2527
5.22M
        if (p - q < 5) {
2528
6
            if (lenient) {
2529
0
                break;
2530
6
            } else {
2531
6
                hts_log_error("Incomplete aux field");
2532
6
                goto err_ret;
2533
6
            }
2534
6
        }
2535
2.61M
        _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id");
2536
2537
2.61M
        if (lenient && (q[2] | q[4]) != ':') {
2538
0
            while (q < p && !isspace_c(*q))
2539
0
                q++;
2540
0
            while (q < p && isspace_c(*q))
2541
0
                q++;
2542
0
            continue;
2543
0
        }
2544
2545
2.61M
        if (tag_whitelist) {
2546
0
            int tt = q[0]*256 + q[1];
2547
0
            if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) {
2548
0
                while (q < p && *q != '\t')
2549
0
                    q++;
2550
0
                continue;
2551
0
            }
2552
0
        }
2553
2554
        // Copy over id
2555
2.61M
        if (possibly_expand_bam_data(b, 2) < 0) goto err_ret;
2556
2.61M
        memcpy(b->data + b->l_data, q, 2); b->l_data += 2;
2557
2.61M
        q += 3; type = *q++; ++q; // q points to value
2558
2.61M
        if (type != 'Z' && type != 'H') // the only zero length acceptable fields
2559
2.05M
            _parse_err(*q <= '\t', "incomplete aux field");
2560
2561
        // Ensure enough space for a double + type allocated.
2562
2.61M
        if (possibly_expand_bam_data(b, 16) < 0) goto err_ret;
2563
2564
2.61M
        if (type == 'A' || type == 'a' || type == 'c' || type == 'C') {
2565
258k
            b->data[b->l_data++] = 'A';
2566
258k
            b->data[b->l_data++] = *q++;
2567
2.35M
        } else if (type == 'i' || type == 'I') {
2568
1.02M
            if (*q == '-') {
2569
686k
                int32_t x = hts_str2int(q, &q, 32, &overflow);
2570
686k
                if (x >= INT8_MIN) {
2571
298k
                    b->data[b->l_data++] = 'c';
2572
298k
                    b->data[b->l_data++] = x;
2573
387k
                } else if (x >= INT16_MIN) {
2574
135k
                    b->data[b->l_data++] = 's';
2575
135k
                    i16_to_le(x, b->data + b->l_data);
2576
135k
                    b->l_data += 2;
2577
252k
                } else {
2578
252k
                    b->data[b->l_data++] = 'i';
2579
252k
                    i32_to_le(x, b->data + b->l_data);
2580
252k
                    b->l_data += 4;
2581
252k
                }
2582
686k
            } else {
2583
338k
                uint32_t x = hts_str2uint(q, &q, 32, &overflow);
2584
338k
                if (x <= UINT8_MAX) {
2585
68.2k
                    b->data[b->l_data++] = 'C';
2586
68.2k
                    b->data[b->l_data++] = x;
2587
270k
                } else if (x <= UINT16_MAX) {
2588
176k
                    b->data[b->l_data++] = 'S';
2589
176k
                    u16_to_le(x, b->data + b->l_data);
2590
176k
                    b->l_data += 2;
2591
176k
                } else {
2592
93.9k
                    b->data[b->l_data++] = 'I';
2593
93.9k
                    u32_to_le(x, b->data + b->l_data);
2594
93.9k
                    b->l_data += 4;
2595
93.9k
                }
2596
338k
            }
2597
1.32M
        } else if (type == 'f') {
2598
335
            b->data[b->l_data++] = 'f';
2599
335
            float_to_le(strtod(q, &q), b->data + b->l_data);
2600
335
            b->l_data += sizeof(float);
2601
1.32M
        } else if (type == 'd') {
2602
13.5k
            b->data[b->l_data++] = 'd';
2603
13.5k
            double_to_le(strtod(q, &q), b->data + b->l_data);
2604
13.5k
            b->l_data += sizeof(double);
2605
1.31M
        } else if (type == 'Z' || type == 'H') {
2606
555k
            char *end = strchr(q, '\t');
2607
555k
            if (!end) end = q + strlen(q);
2608
555k
            _parse_err(type == 'H' && ((end-q)&1) != 0,
2609
555k
                       "hex field does not have an even number of digits");
2610
555k
            b->data[b->l_data++] = type;
2611
555k
            if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret;
2612
555k
            memcpy(b->data + b->l_data, q, end - q);
2613
555k
            b->l_data += end - q;
2614
555k
            b->data[b->l_data++] = '\0';
2615
555k
            q = end;
2616
757k
        } else if (type == 'B') {
2617
757k
            uint32_t n;
2618
757k
            char *r;
2619
757k
            type = *q++; // q points to the first ',' following the typing byte
2620
757k
            _parse_err(*q && *q != ',' && *q != '\t',
2621
757k
                       "B aux field type not followed by ','");
2622
2623
117M
            for (r = q, n = 0; *r > '\t'; ++r)
2624
116M
                if (*r == ',') ++n;
2625
2626
757k
            if (sam_parse_B_vals(type, n, q, &q, r, b) < 0)
2627
16
                goto err_ret;
2628
757k
        } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1));
2629
2630
24.9M
        while (*q > '\t') { q++; } // Skip any junk to next tab
2631
2.61M
        q++;
2632
2.61M
    }
2633
2634
5.64k
    _parse_err(!lenient && overflow != 0, "numeric value out of allowed range");
2635
5.64k
#undef _parse_err
2636
2637
5.64k
    return 0;
2638
2639
61
err_ret:
2640
61
    return -2;
2641
5.64k
}
2642
2643
// Parse one line of SAM text held in s into the alignment record b, using
// header h to resolve reference names.
//
// The text in s->s is modified: the tab ending each token read through
// _read_token() is overwritten with a NUL.
//
// Returns 0 on success and -2 on any parse or memory failure.  Problems that
// can be recovered from (e.g. an unknown reference name, or a mapped read
// with no CIGAR) are logged as warnings and the record is marked unmapped.
int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b)
{
// Yields the start of the token at _p, NUL-terminates it and advances _p
// past the tab; a missing tab is a parse failure.
#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0)

#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff

// Macro that operates on 64-bits at a time.
// Copies l bytes from "from" to "to", subtracting n from each, and sets
// "failed" if any resulting byte has its top bit set.
#define COPY_MINUS_N(to,from,n,l,failed)                        \
    do {                                                        \
        uint64_u *from8 = (uint64_u *)(from);                   \
        uint64_u *to8 = (uint64_u *)(to);                       \
        uint64_t uflow = 0;                                     \
        size_t l8 = (l)>>3, i;                                  \
        for (i = 0; i < l8; i++) {                              \
            to8[i] = from8[i] - (n)*0x0101010101010101UL;       \
            uflow |= to8[i];                                    \
        }                                                       \
        for (i<<=3; i < (l); ++i) {                             \
            to[i] = from[i] - (n);                              \
            uflow |= to[i];                                     \
        }                                                       \
        failed = (uflow & 0x8080808080808080UL) > 0;            \
    } while (0)

#else

// Basic version which operates a byte at a time
#define COPY_MINUS_N(to,from,n,l,failed) do {                \
        uint8_t uflow = 0;                                   \
        for (i = 0; i < (l); ++i) {                          \
            (to)[i] = (from)[i] - (n);                       \
            uflow |= (uint8_t) (to)[i];                      \
        }                                                    \
        failed = (uflow & 0x80) > 0;                         \
    } while (0)

#endif

// Reserve l more bytes in b's data block and point *x at them.
#define _get_mem(type_t, x, b, l) if (possibly_expand_bam_data((b), (l)) < 0) goto err_ret; *(x) = (type_t*)((b)->data + (b)->l_data); (b)->l_data += (l)
#define _parse_err(cond, ...) do { if (cond) { hts_log_error(__VA_ARGS__); goto err_ret; } } while (0)
#define _parse_warn(cond, ...) do { if (cond) { hts_log_warning(__VA_ARGS__); } } while (0)

    uint8_t *t;

    char *p = s->s, *q;
    int i, overflow = 0;
    char logbuf[40];
    hts_pos_t cigreflen;
    bam1_core_t *c = &b->core;

    b->l_data = 0;
    memset(c, 0, 32);

    // qname
    q = _read_token(p);

    _parse_warn(p - q <= 1, "empty query name");
    _parse_err(p - q > 255, "query name too long");
    // resize large enough for name + extranul
    if (possibly_expand_bam_data(b, (p - q) + 4) < 0) goto err_ret;
    memcpy(b->data + b->l_data, q, p-q); b->l_data += p-q;

    // Pad the stored name with NULs up to a multiple of four bytes
    c->l_extranul = (4 - (b->l_data & 3)) & 3;
    memcpy(b->data + b->l_data, "\0\0\0\0", c->l_extranul);
    b->l_data += c->l_extranul;

    c->l_qname = p - q + c->l_extranul;

    // flag
    c->flag = parse_sam_flag(p, &p, &overflow);
    if (*p++ != '\t') goto err_ret; // malformated flag

    // chr
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(h->n_targets == 0, "no SQ lines present in the header");
        c->tid = bam_name2id(h, q);
        _parse_err(c->tid < -1, "failed to parse header");
        _parse_warn(c->tid < 0, "unrecognized reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    } else c->tid = -1;

    // pos (1-based in the text, stored 0-based)
    c->pos = hts_str2uint(p, &p, 63, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->pos < 0 && c->tid >= 0) {
        _parse_warn(1, "mapped query cannot have zero coordinate; treated as unmapped");
        c->tid = -1;
    }
    if (c->tid < 0) c->flag |= BAM_FUNMAP;

    // mapq
    c->qual = hts_str2uint(p, &p, 8, &overflow);
    if (*p++ != '\t') goto err_ret;
    // cigar
    if (*p != '*') {
        uint32_t *cigar = NULL;
        int old_l_data = b->l_data;
        int n_cigar = bam_parse_cigar(p, &p, b);
        if (n_cigar < 1 || *p++ != '\t') goto err_ret;
        cigar = (uint32_t *)(b->data + old_l_data);
        c->n_cigar = n_cigar;

        // can't use bam_endpos() directly as some fields not yet set up
        cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1;
        if (cigreflen == 0) cigreflen = 1;
    } else {
        _parse_warn(!(c->flag&BAM_FUNMAP), "mapped query must have a CIGAR; treated as unmapped");
        c->flag |= BAM_FUNMAP;
        q = _read_token(p);
        cigreflen = 1;
    }
    _parse_err(HTS_POS_MAX - cigreflen <= c->pos,
               "read ends beyond highest supported position");
    c->bin = hts_reg2bin(c->pos, c->pos + cigreflen, 14, 5);
    // mate chr
    q = _read_token(p);
    if (strcmp(q, "=") == 0) {
        c->mtid = c->tid;
    } else if (strcmp(q, "*") == 0) {
        c->mtid = -1;
    } else {
        c->mtid = bam_name2id(h, q);
        _parse_err(c->mtid < -1, "failed to parse header");
        _parse_warn(c->mtid < 0, "unrecognized mate reference name %s; treated as unmapped", hts_strprint(logbuf, sizeof logbuf, '"', q, SIZE_MAX));
    }
    // mpos
    c->mpos = hts_str2uint(p, &p, 63, &overflow) - 1;
    if (*p++ != '\t') goto err_ret;
    if (c->mpos < 0 && c->mtid >= 0) {
        _parse_warn(1, "mapped mate cannot have zero coordinate; treated as unmapped");
        c->mtid = -1;
    }
    // tlen
    c->isize = hts_str2int(p, &p, 64, &overflow);
    if (*p++ != '\t') goto err_ret;
    // seq
    q = _read_token(p);
    if (strcmp(q, "*")) {
        _parse_err(p - q - 1 > INT32_MAX, "read sequence is too long");
        c->l_qseq = p - q - 1;
        hts_pos_t ql = bam_cigar2qlen(c->n_cigar, (uint32_t*)(b->data + c->l_qname));
        _parse_err(c->n_cigar && ql != c->l_qseq, "CIGAR and query sequence are of different length");
        i = (c->l_qseq + 1) >> 1;
        _get_mem(uint8_t, &t, b, i);

        // Pack two bases per byte; a trailing odd base fills the high nibble
        unsigned int lqs2 = c->l_qseq&~1, j;
        for (j = 0; j < lqs2; j+=2)
            t[j>>1] = (seq_nt16_table[(unsigned char)q[j]] << 4) | seq_nt16_table[(unsigned char)q[j+1]];
        for (; j < c->l_qseq; ++j)
            t[j>>1] = seq_nt16_table[(unsigned char)q[j]] << ((~j&1)<<2);
    } else c->l_qseq = 0;
    // qual
    _get_mem(uint8_t, &t, b, c->l_qseq);
    if (p[0] == '*' && (p[1] == '\t' || p[1] == '\0')) {
        memset(t, 0xff, c->l_qseq);
        p += 2;
    } else {
        int failed = 0;
        _parse_err(s->l - (p - s->s) < c->l_qseq
                   || (p[c->l_qseq] != '\t' && p[c->l_qseq] != '\0'),
                   "SEQ and QUAL are of different length");
        COPY_MINUS_N(t, p, 33, c->l_qseq, failed);
        _parse_err(failed, "invalid QUAL character");
        p += c->l_qseq + 1;
    }

    // The FLAG, POS, MAPQ, PNEXT and TLEN parsers above all share "overflow";
    // reject the record if any of them was out of range rather than silently
    // keeping whatever value the parser returned.
    _parse_err(overflow != 0, "numeric value out of allowed range");

    // aux
    if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0)
        goto err_ret;

    if (bam_tag2cigar(b, 1, 1) < 0)
        return -2;
    return 0;

#undef _parse_warn
#undef _parse_err
#undef _get_mem
#undef _read_token
err_ret:
    return -2;
}
2824
2825
1.28k
// Count the CIGAR operations in the text at q, which ends at the first tab
// or NUL.  Every character that is not a digit counts as one operation.
//
// Returns the count, or 0 (after logging an error) when there are no
// operations or when there are 2147483647 or more.
static uint32_t read_ncigar(const char *q) {
    uint32_t ops = 0;
    const char *c = q;

    while (*c != '\0' && *c != '\t') {
        if (!isdigit_c(*c))
            ops++;
        c++;
    }

    if (ops == 0) {
        hts_log_error("No CIGAR operations");
        return 0;
    }

    if (ops >= 2147483647) {
        hts_log_error("Too many CIGAR operations");
        return 0;
    }

    return ops;
}
2840
2841
/*! @function
2842
 @abstract  Parse a CIGAR string into preallocated a uint32_t array
2843
 @param  in      [in]  pointer to the source string
2844
 @param  a_cigar [out]  address of the destination uint32_t buffer
2845
 @return         number of processed input characters; 0 on error
2846
 */
2847
1.28k
static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) {
2848
1.28k
    int i, overflow = 0;
2849
1.28k
    const char *p = in;
2850
3.19k
    for (i = 0; i < n_cigar; i++) {
2851
1.92k
        uint32_t len;
2852
1.92k
        int op;
2853
1.92k
        char *q;
2854
1.92k
        len = hts_str2uint(p, &q, 28, &overflow)<<BAM_CIGAR_SHIFT;
2855
1.92k
        if (q == p) {
2856
12
            hts_log_error("CIGAR length invalid at position %d (%s)", (int)(i+1), p);
2857
12
            return 0;
2858
12
        }
2859
1.91k
        if (overflow) {
2860
0
            hts_log_error("CIGAR length too long at position %d (%.*s)", (int)(i+1), (int)(q-p+1), p);
2861
0
            return 0;
2862
0
        }
2863
1.91k
        p = q;
2864
1.91k
        op = bam_cigar_table[(unsigned char)*p++];
2865
1.91k
        if (op < 0) {
2866
1
            hts_log_error("Unrecognized CIGAR operator");
2867
1
            return 0;
2868
1
        }
2869
1.91k
        a_cigar[i] = len;
2870
1.91k
        a_cigar[i] |= op;
2871
1.91k
    }
2872
2873
1.26k
    return p-in;
2874
1.28k
}
2875
2876
0
// Parse the CIGAR text at "in" into the caller-owned array *a_cigar, which
// currently has room for *a_mem entries and is grown with realloc() (with
// *a_mem updated) when it is too small.
//
// If "end" is non-NULL it receives the first unparsed character.
//
// Returns the number of operations stored, 0 for a "*" CIGAR or when
// read_ncigar() reports no usable operations, and -1 on NULL arguments,
// allocation failure or a malformed CIGAR.
ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) {
    if (in == NULL || a_cigar == NULL || a_mem == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }

    if (end != NULL)
        *end = (char *)in;

    // "*" means no CIGAR is available
    if (in[0] == '*') {
        if (end != NULL)
            ++*end;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0)
        return 0;

    if (*a_mem < n_ops) {
        uint32_t *grown = realloc(*a_cigar, n_ops*sizeof(**a_cigar));
        if (grown == NULL) {
            hts_log_error("Memory allocation error");
            return -1;
        }
        *a_cigar = grown;
        *a_mem = n_ops;
    }

    int used = parse_cigar(in, *a_cigar, n_ops);
    if (used == 0)
        return -1;

    if (end != NULL)
        *end = (char *)in + used;

    return n_ops;
}
2908
2909
1.28k
// Parse the CIGAR text at "in" and append the encoded operations to the end
// of b's data block, growing the block as required.
//
// If "end" is non-NULL it receives the first unparsed character.
//
// Returns the number of operations appended, 0 for a "*" CIGAR or when
// read_ncigar() reports no usable operations, and -1 on NULL arguments,
// allocation failure or a malformed CIGAR.
ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) {
    if (in == NULL || b == NULL) {
        hts_log_error("NULL pointer arguments");
        return -1;
    }

    if (end != NULL)
        *end = (char *)in;

    // "*" means no CIGAR is available
    if (in[0] == '*') {
        if (end != NULL)
            ++*end;
        return 0;
    }

    size_t n_ops = read_ncigar(in);
    if (n_ops == 0)
        return 0;

    const size_t n_bytes = n_ops * sizeof(uint32_t);
    if (possibly_expand_bam_data(b, n_bytes) < 0) {
        hts_log_error("Memory allocation error");
        return -1;
    }

    // l_data only moves on once the whole CIGAR has parsed cleanly
    int used = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_ops);
    if (used == 0)
        return -1;
    b->l_data += n_bytes;

    if (end != NULL)
        *end = (char *)in + used;

    return n_ops;
}
2936
2937
/*
2938
 * -----------------------------------------------------------------------------
2939
 * SAM threading
2940
 */
2941
// Reading: size in bytes of one block of SAM text
#define SAM_NBYTES 240000

// Writing: number of BAM records per block (up to NB_mem in size)
#define SAM_NBAM 1000
2946
2947
struct SAM_state;
2948
2949
// Output job - a block of BAM records
2950
typedef struct sp_bams {
2951
    struct sp_bams *next;
2952
    int serial;
2953
2954
    bam1_t *bams;
2955
    int nbams, abams; // used and alloc for bams[] array
2956
    size_t bam_mem;   // very approximate total size
2957
2958
    struct SAM_state *fd;
2959
} sp_bams;
2960
2961
// Input job - a block of SAM text
2962
typedef struct sp_lines {
2963
    struct sp_lines *next;
2964
    int serial;
2965
2966
    char *data;
2967
    int data_size;
2968
    int alloc;
2969
2970
    struct SAM_state *fd;
2971
    sp_bams *bams;
2972
} sp_lines;
2973
2974
// Values stored in SAM_state.command.
enum sam_cmd {
    SAM_NONE       = 0,
    SAM_CLOSE      = 1,
    SAM_CLOSE_DONE = 2,
};
2979
2980
typedef struct SAM_state {
2981
    sam_hdr_t *h;
2982
2983
    hts_tpool *p;
2984
    int own_pool;
2985
    pthread_mutex_t lines_m;
2986
    hts_tpool_process *q;
2987
    pthread_t dispatcher;
2988
    int dispatcher_set;
2989
2990
    sp_lines *lines;
2991
    sp_bams *bams;
2992
2993
    sp_bams *curr_bam;
2994
    int curr_idx;
2995
    int serial;
2996
2997
    // Be warned: moving these mutexes around in this struct can reduce
2998
    // threading performance by up to 70%!
2999
    pthread_mutex_t command_m;
3000
    pthread_cond_t command_c;
3001
    enum sam_cmd command;
3002
3003
    // One of the E* errno codes
3004
    int errcode;
3005
3006
    htsFile *fp;
3007
} SAM_state;
3008
3009
// Returns a SAM_state struct from a generic hFILE.
3010
//
3011
// Returns NULL on failure.
3012
0
static SAM_state *sam_state_create(htsFile *fp) {
3013
    // Ideally sam_open wouldn't be a #define to hts_open but instead would
3014
    // be a redirect call with an additional 'S' mode.  This in turn would
3015
    // correctly set the designed format to sam instead of a generic
3016
    // text_format.
3017
0
    if (fp->format.format != sam && fp->format.format != text_format)
3018
0
        return NULL;
3019
3020
0
    SAM_state *fd = calloc(1, sizeof(*fd));
3021
0
    if (!fd)
3022
0
        return NULL;
3023
3024
0
    fp->state = fd;
3025
0
    fd->fp = fp;
3026
3027
0
    return fd;
3028
0
}
3029
3030
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str);
3031
static void *sam_format_worker(void *arg);
3032
3033
0
// Record errcode in fd under command_m.  Only the first error is kept; once
// fd->errcode is non-zero later calls leave it alone.
static void sam_state_err(SAM_state *fd, int errcode) {
    pthread_mutex_lock(&fd->command_m);
    if (fd->errcode == 0) {
        fd->errcode = errcode;
    }
    pthread_mutex_unlock(&fd->command_m);
}
3039
3040
0
// Free a block of BAM records: the data buffer of every allocated record,
// the record array, and the block itself.  A NULL block is ignored.
static void sam_free_sp_bams(sp_bams *b) {
    if (b == NULL)
        return;

    bam1_t *recs = b->bams;
    if (recs != NULL) {
        // Walk all allocated slots (abams), not just those in use
        for (int slot = 0; slot < b->abams; slot++)
            free(recs[slot].data);
        free(recs);
    }

    free(b);
}
3054
3055
// Destroys the state produce by sam_state_create.
3056
1.38k
int sam_state_destroy(htsFile *fp) {
3057
1.38k
    int ret = 0;
3058
3059
1.38k
    if (!fp->state)
3060
1.38k
        return 0;
3061
3062
0
    SAM_state *fd = fp->state;
3063
0
    if (fd->p) {
3064
0
        if (fd->h) {
3065
            // Notify sam_dispatcher we're closing
3066
0
            pthread_mutex_lock(&fd->command_m);
3067
0
            if (fd->command != SAM_CLOSE_DONE)
3068
0
                fd->command = SAM_CLOSE;
3069
0
            pthread_cond_signal(&fd->command_c);
3070
0
            ret = -fd->errcode;
3071
0
            if (fd->q)
3072
0
                hts_tpool_wake_dispatch(fd->q); // unstick the reader
3073
3074
0
            if (!fp->is_write && fd->q && fd->dispatcher_set) {
3075
0
                for (;;) {
3076
                    // Avoid deadlocks with dispatcher
3077
0
                    if (fd->command == SAM_CLOSE_DONE)
3078
0
                        break;
3079
0
                    hts_tpool_wake_dispatch(fd->q);
3080
0
                    pthread_mutex_unlock(&fd->command_m);
3081
0
                    usleep(10000);
3082
0
                    pthread_mutex_lock(&fd->command_m);
3083
0
                }
3084
0
            }
3085
0
            pthread_mutex_unlock(&fd->command_m);
3086
3087
0
            if (fp->is_write) {
3088
                // Dispatch the last partial block.
3089
0
                sp_bams *gb = fd->curr_bam;
3090
0
                if (!ret && gb && gb->nbams > 0 && fd->q)
3091
0
                    ret = hts_tpool_dispatch(fd->p, fd->q, sam_format_worker, gb);
3092
3093
                // Flush and drain output
3094
0
                if (fd->q)
3095
0
                    hts_tpool_process_flush(fd->q);
3096
0
                pthread_mutex_lock(&fd->command_m);
3097
0
                if (!ret) ret = -fd->errcode;
3098
0
                pthread_mutex_unlock(&fd->command_m);
3099
3100
0
                while (!ret && fd->q && !hts_tpool_process_empty(fd->q)) {
3101
0
                    usleep(10000);
3102
0
                    pthread_mutex_lock(&fd->command_m);
3103
0
                    ret = -fd->errcode;
3104
                    // not empty but shutdown implies error
3105
0
                    if (hts_tpool_process_is_shutdown(fd->q) && !ret)
3106
0
                        ret = EIO;
3107
0
                    pthread_mutex_unlock(&fd->command_m);
3108
0
                }
3109
0
                if (fd->q)
3110
0
                    hts_tpool_process_shutdown(fd->q);
3111
0
            }
3112
3113
            // Wait for it to acknowledge
3114
0
            if (fd->dispatcher_set)
3115
0
                pthread_join(fd->dispatcher, NULL);
3116
0
            if (!ret) ret = -fd->errcode;
3117
0
        }
3118
3119
        // Tidy up memory
3120
0
        if (fd->q)
3121
0
            hts_tpool_process_destroy(fd->q);
3122
3123
0
        if (fd->own_pool && fp->format.compression == no_compression) {
3124
0
            hts_tpool_destroy(fd->p);
3125
0
            fd->p = NULL;
3126
0
        }
3127
0
        pthread_mutex_destroy(&fd->lines_m);
3128
0
        pthread_mutex_destroy(&fd->command_m);
3129
0
        pthread_cond_destroy(&fd->command_c);
3130
3131
0
        sp_lines *l = fd->lines;
3132
0
        while (l) {
3133
0
            sp_lines *n = l->next;
3134
0
            free(l->data);
3135
0
            free(l);
3136
0
            l = n;
3137
0
        }
3138
3139
0
        sp_bams *b = fd->bams;
3140
0
        while (b) {
3141
0
            if (fd->curr_bam == b)
3142
0
                fd->curr_bam = NULL;
3143
0
            sp_bams *n = b->next;
3144
0
            sam_free_sp_bams(b);
3145
0
            b = n;
3146
0
        }
3147
3148
0
        if (fd->curr_bam)
3149
0
            sam_free_sp_bams(fd->curr_bam);
3150
3151
        // Decrement counter by one, maybe destroying too.
3152
        // This is to permit the caller using bam_hdr_destroy
3153
        // before sam_close without triggering decode errors
3154
        // in the background threads.
3155
0
        bam_hdr_destroy(fd->h);
3156
0
    }
3157
3158
0
    free(fp->state);
3159
0
    fp->state = NULL;
3160
0
    return ret;
3161
1.38k
}
3162
3163
// Cleanup function - job for sam_parse_worker; result for sam_format_worker
3164
0
static void cleanup_sp_lines(void *arg) {
3165
0
    sp_lines *gl = (sp_lines *)arg;
3166
0
    if (!gl) return;
3167
3168
    // Should always be true for lines passed to / from thread workers.
3169
0
    assert(gl->next == NULL);
3170
3171
0
    free(gl->data);
3172
0
    sam_free_sp_bams(gl->bams);
3173
0
    free(gl);
3174
0
}
3175
3176
// Run from one of the worker threads.
3177
// Convert a passed in array of lines to array of BAMs, returning
3178
// the result back to the thread queue.
3179
0
static void *sam_parse_worker(void *arg) {
3180
0
    sp_lines *gl = (sp_lines *)arg;
3181
0
    sp_bams *gb = NULL;
3182
0
    char *lines = gl->data;
3183
0
    int i;
3184
0
    bam1_t *b;
3185
0
    SAM_state *fd = gl->fd;
3186
3187
    // Use a block of BAM structs we had earlier if available.
3188
0
    pthread_mutex_lock(&fd->lines_m);
3189
0
    if (fd->bams) {
3190
0
        gb = fd->bams;
3191
0
        fd->bams = gb->next;
3192
0
    }
3193
0
    pthread_mutex_unlock(&fd->lines_m);
3194
3195
0
    if (gb == NULL) {
3196
0
        gb = calloc(1, sizeof(*gb));
3197
0
        if (!gb) {
3198
0
            return NULL;
3199
0
        }
3200
0
        gb->abams = 100;
3201
0
        gb->bams = b = calloc(gb->abams, sizeof(*b));
3202
0
        if (!gb->bams) {
3203
0
            sam_state_err(fd, ENOMEM);
3204
0
            goto err;
3205
0
        }
3206
0
        gb->nbams = 0;
3207
0
        gb->bam_mem = 0;
3208
0
    }
3209
0
    gb->serial = gl->serial;
3210
0
    gb->next = NULL;
3211
3212
0
    b = (bam1_t *)gb->bams;
3213
0
    if (!b) {
3214
0
        sam_state_err(fd, ENOMEM);
3215
0
        goto err;
3216
0
    }
3217
3218
0
    i = 0;
3219
0
    char *cp = lines, *cp_end = lines + gl->data_size;
3220
0
    while (cp < cp_end) {
3221
0
        if (i >= gb->abams) {
3222
0
            int old_abams = gb->abams;
3223
0
            gb->abams *= 2;
3224
0
            b = (bam1_t *)realloc(gb->bams, gb->abams*sizeof(bam1_t));
3225
0
            if (!b) {
3226
0
                gb->abams /= 2;
3227
0
                sam_state_err(fd, ENOMEM);
3228
0
                goto err;
3229
0
            }
3230
0
            memset(&b[old_abams], 0, (gb->abams - old_abams)*sizeof(*b));
3231
0
            gb->bams = b;
3232
0
        }
3233
3234
        // Ideally we'd get sam_parse1 to return the number of
3235
        // bytes decoded and to be able to stop on newline as
3236
        // well as \0.
3237
        //
3238
        // We can then avoid the additional strchr loop.
3239
        // It's around 6% of our CPU cost, albeit threadable.
3240
        //
3241
        // However this is an API change so for now we copy.
3242
3243
0
        char *nl = strchr(cp, '\n');
3244
0
        char *line_end;
3245
0
        if (nl) {
3246
0
            line_end = nl;
3247
0
            if (line_end > cp && *(line_end - 1) == '\r')
3248
0
                line_end--;
3249
0
            nl++;
3250
0
        } else {
3251
0
            nl = line_end = cp_end;
3252
0
        }
3253
0
        *line_end = '\0';
3254
0
        kstring_t ks = { line_end - cp, gl->alloc, cp };
3255
0
        if (sam_parse1(&ks, fd->h, &b[i]) < 0) {
3256
0
            sam_state_err(fd, errno ? errno : EIO);
3257
0
            cleanup_sp_lines(gl);
3258
0
            goto err;
3259
0
        }
3260
3261
0
        cp = nl;
3262
0
        i++;
3263
0
    }
3264
0
    gb->nbams = i;
3265
3266
0
    pthread_mutex_lock(&fd->lines_m);
3267
0
    gl->next = fd->lines;
3268
0
    fd->lines = gl;
3269
0
    pthread_mutex_unlock(&fd->lines_m);
3270
0
    return gb;
3271
3272
0
 err:
3273
0
    sam_free_sp_bams(gb);
3274
0
    return NULL;
3275
0
}
3276
3277
0
// Worker that does nothing: ignores its argument and always returns NULL.
static void *sam_parse_eof(void *arg) {
    (void) arg;
    return NULL;
}
3280
3281
// Cleanup function - result for sam_parse_worker; job for sam_format_worker
3282
0
static void cleanup_sp_bams(void *arg) {
3283
0
    sam_free_sp_bams((sp_bams *) arg);
3284
0
}
3285
3286
// Runs in its own thread.
3287
// Reads a block of text (SAM) and sends a new job to the thread queue to
3288
// translate this to BAM.
3289
0
static void *sam_dispatcher_read(void *vp) {
3290
0
    htsFile *fp = vp;
3291
0
    kstring_t line = {0};
3292
0
    int line_frag = 0;
3293
0
    SAM_state *fd = fp->state;
3294
0
    sp_lines *l = NULL;
3295
3296
    // Pre-allocate buffer for left-over bits of line (exact size doesn't
3297
    // matter as it will grow if necessary).
3298
0
    if (ks_resize(&line, 1000) < 0)
3299
0
        goto err;
3300
3301
0
    for (;;) {
3302
        // Check for command
3303
0
        pthread_mutex_lock(&fd->command_m);
3304
0
        switch (fd->command) {
3305
3306
0
        case SAM_CLOSE:
3307
0
            pthread_cond_signal(&fd->command_c);
3308
0
            pthread_mutex_unlock(&fd->command_m);
3309
0
            hts_tpool_process_shutdown(fd->q);
3310
0
            goto tidyup;
3311
3312
0
        default:
3313
0
            break;
3314
0
        }
3315
0
        pthread_mutex_unlock(&fd->command_m);
3316
3317
0
        pthread_mutex_lock(&fd->lines_m);
3318
0
        if (fd->lines) {
3319
            // reuse existing line buffer
3320
0
            l = fd->lines;
3321
0
            fd->lines = l->next;
3322
0
        }
3323
0
        pthread_mutex_unlock(&fd->lines_m);
3324
3325
0
        if (l == NULL) {
3326
            // none to reuse, to create a new one
3327
0
            l = calloc(1, sizeof(*l));
3328
0
            if (!l)
3329
0
                goto err;
3330
0
            l->alloc = SAM_NBYTES;
3331
0
            l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1
3332
0
            if (!l->data) {
3333
0
                free(l);
3334
0
                l = NULL;
3335
0
                goto err;
3336
0
            }
3337
0
            l->fd = fd;
3338
0
        }
3339
0
        l->next = NULL;
3340
3341
0
        if (l->alloc < line_frag+SAM_NBYTES/2) {
3342
0
            char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8);
3343
0
            if (!rp)
3344
0
                goto err;
3345
0
            l->alloc = line_frag+SAM_NBYTES/2;
3346
0
            l->data = rp;
3347
0
        }
3348
0
        memcpy(l->data, line.s, line_frag);
3349
3350
0
        l->data_size = line_frag;
3351
0
        ssize_t nbytes;
3352
0
    longer_line:
3353
0
        if (fp->is_bgzf)
3354
0
            nbytes = bgzf_read(fp->fp.bgzf, l->data + line_frag, l->alloc - line_frag);
3355
0
        else
3356
0
            nbytes = hread(fp->fp.hfile, l->data + line_frag, l->alloc - line_frag);
3357
0
        if (nbytes < 0) {
3358
0
            sam_state_err(fd, errno ? errno : EIO);
3359
0
            goto err;
3360
0
        } else if (nbytes == 0)
3361
0
            break; // EOF
3362
0
        l->data_size += nbytes;
3363
3364
        // trim to last \n. Maybe \r\n, but that's still fine
3365
0
        if (nbytes == l->alloc - line_frag) {
3366
0
            char *cp_end = l->data + l->data_size;
3367
0
            char *cp = cp_end-1;
3368
3369
0
            while (cp > (char *)l->data && *cp != '\n')
3370
0
                cp--;
3371
3372
            // entire buffer is part of a single line
3373
0
            if (cp == l->data) {
3374
0
                line_frag = l->data_size;
3375
0
                char *rp = realloc(l->data, l->alloc * 2 + 8);
3376
0
                if (!rp)
3377
0
                    goto err;
3378
0
                l->alloc *= 2;
3379
0
                l->data = rp;
3380
0
                assert(l->alloc >= l->data_size);
3381
0
                assert(l->alloc >= line_frag);
3382
0
                assert(l->alloc >= l->alloc - line_frag);
3383
0
                goto longer_line;
3384
0
            }
3385
0
            cp++;
3386
3387
            // line holds the remainder of our line.
3388
0
            if (ks_resize(&line, cp_end - cp) < 0)
3389
0
                goto err;
3390
0
            memcpy(line.s, cp, cp_end - cp);
3391
0
            line_frag = cp_end - cp;
3392
0
            l->data_size = l->alloc - line_frag;
3393
0
        } else {
3394
            // out of buffer
3395
0
            line_frag = 0;
3396
0
        }
3397
3398
0
        l->serial = fd->serial++;
3399
        //fprintf(stderr, "Dispatching %p, %d bytes, serial %d\n", l, l->data_size, l->serial);
3400
0
        if (hts_tpool_dispatch3(fd->p, fd->q, sam_parse_worker, l,
3401
0
                                cleanup_sp_lines, cleanup_sp_bams, 0) < 0)
3402
0
            goto err;
3403
0
        pthread_mutex_lock(&fd->command_m);
3404
0
        if (fd->command == SAM_CLOSE) {
3405
0
            pthread_mutex_unlock(&fd->command_m);
3406
0
            l = NULL;
3407
0
            goto tidyup;
3408
0
        }
3409
0
        l = NULL;  // Now "owned" by sam_parse_worker()
3410
0
        pthread_mutex_unlock(&fd->command_m);
3411
0
    }
3412
3413
0
    if (hts_tpool_dispatch(fd->p, fd->q, sam_parse_eof, NULL) < 0)
3414
0
        goto err;
3415
3416
    // At EOF, wait for close request.
3417
    // (In future if we add support for seek, this is where we need to catch it.)
3418
0
    for (;;) {
3419
0
        pthread_mutex_lock(&fd->command_m);
3420
0
        if (fd->command == SAM_NONE)
3421
0
            pthread_cond_wait(&fd->command_c, &fd->command_m);
3422
0
        switch (fd->command) {
3423
0
        case SAM_CLOSE:
3424
0
            pthread_cond_signal(&fd->command_c);
3425
0
            pthread_mutex_unlock(&fd->command_m);
3426
0
            hts_tpool_process_shutdown(fd->q);
3427
0
            goto tidyup;
3428
3429
0
        default:
3430
0
            pthread_mutex_unlock(&fd->command_m);
3431
0
            break;
3432
0
        }
3433
0
    }
3434
3435
0
 tidyup:
3436
0
    pthread_mutex_lock(&fd->command_m);
3437
0
    fd->command = SAM_CLOSE_DONE;
3438
0
    pthread_cond_signal(&fd->command_c);
3439
0
    pthread_mutex_unlock(&fd->command_m);
3440
3441
0
    if (l) {
3442
0
        pthread_mutex_lock(&fd->lines_m);
3443
0
        l->next = fd->lines;
3444
0
        fd->lines = l;
3445
0
        pthread_mutex_unlock(&fd->lines_m);
3446
0
    }
3447
0
    free(line.s);
3448
3449
0
    return NULL;
3450
3451
0
 err:
3452
0
    sam_state_err(fd, errno ? errno : ENOMEM);
3453
0
    hts_tpool_process_shutdown(fd->q);
3454
0
    goto tidyup;
3455
0
}
3456
3457
// Runs in its own thread.
3458
// Takes encoded blocks of SAM off the thread results queue and writes them
3459
// to our output stream.
3460
0
static void *sam_dispatcher_write(void *vp) {
3461
0
    htsFile *fp = vp;
3462
0
    SAM_state *fd = fp->state;
3463
0
    hts_tpool_result *r;
3464
3465
    // Iterates until result queue is shutdown, where it returns NULL.
3466
0
    while ((r = hts_tpool_next_result_wait(fd->q))) {
3467
0
        sp_lines *gl = (sp_lines *)hts_tpool_result_data(r);
3468
0
        if (!gl) {
3469
0
            sam_state_err(fd, ENOMEM);
3470
0
            goto err;
3471
0
        }
3472
3473
0
        if (fp->idx) {
3474
0
            sp_bams *gb = gl->bams;
3475
0
            int i = 0, count = 0;
3476
0
            while (i < gl->data_size) {
3477
0
                int j = i;
3478
0
                while (i < gl->data_size && gl->data[i] != '\n')
3479
0
                    i++;
3480
0
                if (i < gl->data_size)
3481
0
                    i++;
3482
3483
0
                if (fp->is_bgzf) {
3484
0
                    if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0)
3485
0
                        goto err;
3486
0
                    if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j)
3487
0
                        goto err;
3488
0
                } else {
3489
0
                    if (hwrite(fp->fp.hfile, &gl->data[j], i-j) != i-j)
3490
0
                        goto err;
3491
0
                }
3492
3493
0
                bam1_t *b = &gb->bams[count++];
3494
0
                if (fp->format.compression == bgzf) {
3495
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx,
3496
0
                                      b->core.tid, b->core.pos, bam_endpos(b),
3497
0
                                      bgzf_tell(fp->fp.bgzf),
3498
0
                                      !(b->core.flag&BAM_FUNMAP)) < 0) {
3499
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3500
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3501
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3502
0
                        goto err;
3503
0
                    }
3504
0
                } else {
3505
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
3506
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
3507
0
                        sam_state_err(fd, errno ? errno : ENOMEM);
3508
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
3509
0
                                bam_get_qname(b), sam_hdr_tid2name(fd->h, b->core.tid), sam_hdr_tid2len(fd->h, b->core.tid), b->core.flag, b->core.pos+1);
3510
0
                        goto err;
3511
0
                    }
3512
0
                }
3513
0
            }
3514
3515
0
            assert(count == gb->nbams);
3516
3517
            // Add bam array to free-list
3518
0
            pthread_mutex_lock(&fd->lines_m);
3519
0
            gb->next = fd->bams;
3520
0
            fd->bams = gl->bams;
3521
0
            gl->bams = NULL;
3522
0
            pthread_mutex_unlock(&fd->lines_m);
3523
0
        } else {
3524
0
            if (fp->is_bgzf) {
3525
                // We keep track of how much in the current block we have
3526
                // remaining => R.  We look for the last newline in input
3527
                // [i] to [i+R], backwards => position N.
3528
                //
3529
                // If we find a newline, we write out bytes i to N.
3530
                // We know we cannot fit the next record in this bgzf block,
3531
                // so we flush what we have and copy input N to i+R into
3532
                // the start of a new block, and recompute a new R for that.
3533
                //
3534
                // If we don't find a newline (i==N) then we cannot extend
3535
                // the current block at all, so flush whatever is in it now
3536
                // if it ends on a newline.
3537
                // We still copy i(==N) to i+R to the next block and
3538
                // continue as before with a new R.
3539
                //
3540
                // The only exception on the flush is when we run out of
3541
                // data in the input.  In that case we skip it as we don't
3542
                // yet know if the next record will fit.
3543
                //
3544
                // Both conditions share the same code here:
3545
                // - Look for newline (pos N)
3546
                // - Write i to N (which maybe 0)
3547
                // - Flush if block ends on newline and not end of input
3548
                // - write N to i+R
3549
3550
0
                int i = 0;
3551
0
                BGZF *fb = fp->fp.bgzf;
3552
0
                while (i < gl->data_size) {
3553
                    // remaining space in block
3554
0
                    int R = BGZF_BLOCK_SIZE - fb->block_offset;
3555
0
                    int eod = 0;
3556
0
                    if (R > gl->data_size-i)
3557
0
                        R = gl->data_size-i, eod = 1;
3558
3559
                    // Find last newline in input data
3560
0
                    int N = i + R;
3561
0
                    while (--N > i) {
3562
0
                        if (gl->data[N] == '\n')
3563
0
                            break;
3564
0
                    }
3565
3566
0
                    if (N != i) {
3567
                        // Found a newline
3568
0
                        N++;
3569
0
                        if (bgzf_write(fb, &gl->data[i], N-i) != N-i)
3570
0
                            goto err;
3571
0
                    }
3572
3573
                    // Flush bgzf block
3574
0
                    int b_off = fb->block_offset;
3575
0
                    if (!eod && b_off &&
3576
0
                        ((char *)fb->uncompressed_block)[b_off-1] == '\n')
3577
0
                        if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0)
3578
0
                            goto err;
3579
3580
                    // Copy from N onwards into next block
3581
0
                    if (i+R > N)
3582
0
                        if (bgzf_write(fb, &gl->data[N], i+R - N)
3583
0
                            != i+R - N)
3584
0
                            goto err;
3585
3586
0
                    i = i+R;
3587
0
                }
3588
0
            } else {
3589
0
                if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size)
3590
0
                    goto err;
3591
0
            }
3592
0
        }
3593
3594
0
        hts_tpool_delete_result(r, 0);
3595
3596
        // Also updated by main thread
3597
0
        pthread_mutex_lock(&fd->lines_m);
3598
0
        gl->next = fd->lines;
3599
0
        fd->lines = gl;
3600
0
        pthread_mutex_unlock(&fd->lines_m);
3601
0
    }
3602
3603
0
    sam_state_err(fd, 0); // success
3604
0
    hts_tpool_process_shutdown(fd->q);
3605
0
    return NULL;
3606
3607
0
 err:
3608
0
    sam_state_err(fd, errno ? errno : EIO);
3609
0
    return (void *)-1;
3610
0
}
3611
3612
// Run from one of the worker threads.
3613
// Convert a passed in array of BAMs (sp_bams) and converts to a block
3614
// of text SAM records (sp_lines).
3615
0
static void *sam_format_worker(void *arg) {
3616
0
    sp_bams *gb = (sp_bams *)arg;
3617
0
    sp_lines *gl = NULL;
3618
0
    int i;
3619
0
    SAM_state *fd = gb->fd;
3620
0
    htsFile *fp = fd->fp;
3621
3622
    // Use a block of SAM strings we had earlier if available.
3623
0
    pthread_mutex_lock(&fd->lines_m);
3624
0
    if (fd->lines) {
3625
0
        gl = fd->lines;
3626
0
        fd->lines = gl->next;
3627
0
    }
3628
0
    pthread_mutex_unlock(&fd->lines_m);
3629
3630
0
    if (gl == NULL) {
3631
0
        gl = calloc(1, sizeof(*gl));
3632
0
        if (!gl) {
3633
0
            sam_state_err(fd, ENOMEM);
3634
0
            return NULL;
3635
0
        }
3636
0
        gl->alloc = gl->data_size = 0;
3637
0
        gl->data = NULL;
3638
0
    }
3639
0
    gl->serial = gb->serial;
3640
0
    gl->next = NULL;
3641
3642
0
    kstring_t ks = {0, gl->alloc, gl->data};
3643
3644
0
    for (i = 0; i < gb->nbams; i++) {
3645
0
        if (sam_format1_append(fd->h, &gb->bams[i], &ks) < 0) {
3646
0
            sam_state_err(fd, errno ? errno : EIO);
3647
0
            goto err;
3648
0
        }
3649
0
        kputc('\n', &ks);
3650
0
    }
3651
3652
0
    pthread_mutex_lock(&fd->lines_m);
3653
0
    gl->data_size = ks.l;
3654
0
    gl->alloc = ks.m;
3655
0
    gl->data = ks.s;
3656
3657
0
    if (fp->idx) {
3658
        // Keep hold of the bam array a little longer as
3659
        // sam_dispatcher_write needs to use them for building the index.
3660
0
        gl->bams = gb;
3661
0
    } else {
3662
        // Add bam array to free-list
3663
0
        gb->next = fd->bams;
3664
0
        fd->bams = gb;
3665
0
    }
3666
0
    pthread_mutex_unlock(&fd->lines_m);
3667
3668
0
    return gl;
3669
3670
0
 err:
3671
    // Possible race between this and fd->curr_bam.
3672
    // Easier to not free and leave it on the input list so it
3673
    // gets freed there instead?
3674
    // sam_free_sp_bams(gb);
3675
0
    if (gl) {
3676
0
        free(gl->data);
3677
0
        free(gl);
3678
0
    }
3679
0
    return NULL;
3680
0
}
3681
3682
0
// Attaches a thread pool to a SAM file handle.
// Creates the SAM_state, its mutexes and condition variable, and a process
// queue on the pool; for bgzf-compressed files the pool is also handed to
// the BGZF layer.
// Returns 0 on success (or when fp already has state), <0 on failure.
int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) {
    SAM_state *fd;
    int qsize;

    if (fp->state != NULL)
        return 0;

    fp->state = sam_state_create(fp);
    if (fp->state == NULL)
        return -1;
    fd = (SAM_state *)fp->state;

    pthread_mutex_init(&fd->lines_m, NULL);
    pthread_mutex_init(&fd->command_m, NULL);
    pthread_cond_init(&fd->command_c, NULL);
    fd->p = p->pool;

    // A queue size of zero means "pick one": twice the pool size.
    qsize = p->qsize ? p->qsize : 2*hts_tpool_size(fd->p);

    fd->q = hts_tpool_process_init(fd->p, qsize, 0);
    if (fd->q == NULL) {
        sam_state_destroy(fp);
        return -1;
    }

    if (fp->format.compression != bgzf)
        return 0;

    return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize);
}
3708
3709
0
// Creates a private thread pool of nthreads workers and attaches it to fp.
// The pool is marked as owned by the file's SAM_state (own_pool).
//
// Returns 0 on success, including when nthreads <= 0 or fp already has
// state (in both cases nothing is changed); <0 on failure.
int sam_set_threads(htsFile *fp, int nthreads) {
    if (nthreads <= 0)
        return 0;

    // sam_set_thread_pool() ignores the pool when state already exists.
    // Creating one anyway would leak it, and setting own_pool below would
    // wrongly claim ownership of whichever pool the state already holds.
    if (fp->state)
        return 0;

    htsThreadPool p;
    p.pool = hts_tpool_init(nthreads);
    if (!p.pool)
        return -1;
    p.qsize = nthreads*2;

    int ret = sam_set_thread_pool(fp, &p);
    if (ret < 0) {
        // NOTE(review): the pool is not released here, because it cannot
        // be told from this function whether fp->state still refers to it
        // after a failure.
        return ret;
    }

    SAM_state *fd = (SAM_state *)fp->state;
    fd->own_pool = 1;

    return 0;
}
3726
3727
// Per-file state for reading FASTA / FASTQ, stored in htsFile::state.
// Created by fastq_state_init(), configured by fastq_state_set() and
// released by fastq_state_destroy().
typedef struct {
3728
    kstring_t name;    // name line of the current record
3729
    kstring_t comment; // NB: pointer into name, do not free
3730
    kstring_t seq;     // sequence lines of the current record, concatenated
3731
    kstring_t qual;    // quality lines, concatenated, with '!' subtracted
3732
    int casava;        // non-zero: look for CASAVA fields in the comment
3733
    int aux;           // non-zero: parse SAM style aux tags in the comment
3734
    int rnum;          // set by FASTQ_OPT_RNUM; not read in this part of the file
3735
    char BC[3];         // aux tag ID for barcode
3736
    khash_t(tag) *tags; // which aux tags to use (if empty, use all).
3737
    char nprefix;      // expected first char of a name line: '@' or '>'
3738
    int sra_names;     // non-zero: read name follows the first whitespace
3739
} fastq_state;
3740
3741
// Initialise fastq state.
3742
// Name char of '@' or '>' distinguishes fastq vs fasta variant
3743
121
static fastq_state *fastq_state_init(int name_char) {
3744
121
    fastq_state *x = (fastq_state *)calloc(1, sizeof(*x));
3745
121
    if (!x)
3746
0
        return NULL;
3747
121
    strcpy(x->BC, "BC");
3748
121
    x->nprefix = name_char;
3749
3750
121
    return x;
3751
121
}
3752
3753
121
// Release the fastq_state attached to fp, if any: the tag hash, the name,
// seq and qual buffers, and the structure itself.  The comment field is
// not freed as it only points into the name buffer.
void fastq_state_destroy(htsFile *fp) {
    fastq_state *st = (fastq_state *)fp->state;

    if (st == NULL)
        return;

    if (st->tags)
        kh_destroy(tag, st->tags);
    ks_free(&st->name);
    ks_free(&st->seq);
    ks_free(&st->qual);
    free(st);
}
3764
3765
0
// Set a FASTA / FASTQ parsing option on fp, creating the fastq_state on
// first use.
//
// Options handled:
//   FASTQ_OPT_CASAVA   enable CASAVA comment parsing (no argument)
//   FASTQ_OPT_NAME2    enable the alternative (SRA style) name (no argument)
//   FASTQ_OPT_AUX      enable aux tag parsing; takes a char * which is
//                      NULL or "1" for all tags, else a comma separated
//                      list of two-character tag names ("XX,YY,ZZ")
//   FASTQ_OPT_BARCODE  takes a char * naming the barcode aux tag; only the
//                      first two characters are used
//   FASTQ_OPT_RNUM     set the rnum flag (no argument)
// Any other option is silently ignored.
//
// Returns 0 on success, -1 on failure (NULL fp, NULL barcode tag or
// out of memory).
int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) {
    va_list args;

    if (!fp)
        return -1;
    if (!fp->state)
        if (!(fp->state = fastq_state_init(fp->format.format == fastq_format
                                           ? '@' : '>')))
            return -1;

    fastq_state *x = (fastq_state *)fp->state;

    switch (opt) {
    case FASTQ_OPT_CASAVA:
        x->casava = 1;
        break;

    case FASTQ_OPT_NAME2:
        x->sra_names = 1;
        break;

    case FASTQ_OPT_AUX: {
        va_start(args, opt);
        x->aux = 1;
        char *tag = va_arg(args, char *);
        va_end(args);
        if (tag && strcmp(tag, "1") != 0) {
            if (!x->tags)
                if (!(x->tags = kh_init(tag)))
                    return -1;

            // Each entry is two tag characters followed by ',' or the
            // terminating NUL, hence the step of three.
            size_t i, tlen = strlen(tag);
            for (i = 0; i+3 <= tlen+1; i += 3) {
                if (tag[i+0] == ',' || tag[i+1] == ',' ||
                    !(tag[i+2] == ',' || tag[i+2] == '\0')) {
                    hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i);
                    break;
                }
                int ret, tcode = tag[i+0]*256 + tag[i+1];
                kh_put(tag, x->tags, tcode, &ret);
                if (ret < 0)
                    return -1;
            }
        }
        break;
    }

    case FASTQ_OPT_BARCODE: {
        va_start(args, opt);
        char *bc = va_arg(args, char *);
        va_end(args);
        // Passing NULL to strncpy is undefined behaviour; report it
        // as an error and keep the current tag.
        if (!bc)
            return -1;
        strncpy(x->BC, bc, 2);
        x->BC[2] = 0;
        break;
    }

    case FASTQ_OPT_RNUM:
        x->rnum = 1;
        break;

    default:
        break;
    }
    return 0;
}
3830
3831
11.9k
// Read one FASTA or FASTQ record from fp and convert it to an unmapped
// BAM record in b.
//
// The name line is split into read name and comment.  A trailing "/<digit>"
// on the name sets the paired / READ1 / READ2 flags and is removed.  If
// enabled in the fastq_state, CASAVA fields and SAM style aux tags are
// parsed out of the comment.
//
// Returns >= 0 on success (the value returned by bam_set1),
//         -1 on EOF,
//        < -1 on error.
static int fastq_parse1(htsFile *fp, bam1_t *b) {
    fastq_state *x = (fastq_state *)fp->state;
    size_t i, l;
    int ret = 0;

    if (fp->format.format == fasta_format && fp->line.s) {
        // For FASTA we've already read the >name line; steal it
        // Not the most efficient, but we don't optimise for fasta reading.
        if (fp->line.l == 0)
            return -1; // EOF

        free(x->name.s);
        x->name = fp->line;
        fp->line.l = fp->line.m = 0;
        fp->line.s = NULL;
    } else {
        // Read a FASTQ format entry.
        ret = hts_getline(fp, KS_SEP_LINE, &x->name);
        if (ret == -1)
            return -1;  // EOF
        else if (ret < -1)
            return ret; // ERR
    }

    // Name
    if (*x->name.s != x->nprefix)
        return -2;

    // Reverse the SRA strangeness of putting the run_name.number before
    // the read name.
    i = 0;
    char *name = x->name.s+1;
    if (x->sra_names) {
        char *cp = strpbrk(x->name.s, " \t");
        if (cp) {
            while (*cp == ' ' || *cp == '\t')
                cp++;
            *--cp = '@';
            i = cp - x->name.s;
            name = cp+1;
        }
    }

    // Terminate the name at the first whitespace character.
    l = x->name.l;
    char *s = x->name.s;
    while (i < l && !isspace_c(s[i]))
        i++;
    if (i < l) {
        s[i] = 0;
        x->name.l = i++;
    }

    // Comment; a kstring struct, but pointer into name line.  (Do not free)
    while (i < l && isspace_c(s[i]))
        i++;
    x->comment.s = s+i;
    x->comment.l = l - i;

    // Seq; may span several lines.  Ends at the '+' line for fastq, or at
    // the next '>' line or EOF for fasta.
    x->seq.l = 0;
    for (;;) {
        if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0)
            if (fp->format.format == fastq_format || ret < -1)
                return -2;
        if (ret == -1 ||
            *fp->line.s == (fp->format.format == fastq_format ? '+' : '>'))
            break;
        if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0)
            return -2;
    }

    // Qual; keep reading lines until as many values as bases are found.
    if (fp->format.format == fastq_format) {
        size_t remainder = x->seq.l;
        x->qual.l = 0;
        do {
            if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0)
                return -2;
            if (fp->line.l > remainder)
                return -2;
            if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0)
                return -2;
            remainder -= fp->line.l;
        } while (remainder > 0);

        // Decr qual
        for (i = 0; i < x->qual.l; i++)
            x->qual.s[i] -= '!';
    }

    // A "/1", "/2" etc. name suffix marks a paired read; strip it.
    int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED;
    if (x->name.l > 2 &&
        x->name.s[x->name.l-2] == '/' &&
        isdigit_c(x->name.s[x->name.l-1])) {
        switch(x->name.s[x->name.l-1]) {
        case '1': flag |= BAM_FREAD1 | pflag; break;
        case '2': flag |= BAM_FREAD2 | pflag; break;
        default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }
        x->name.s[x->name.l-=2] = 0;
    }

    // Convert to BAM
    ret = bam_set1(b,
                   x->name.s + x->name.l - name, name,
                   flag,
                   -1, -1, 0, // ref '*', pos, mapq,
                   0, NULL,     // no cigar,
                   -1, -1, 0,    // mate
                   x->seq.l, x->seq.s, x->qual.s,
                   0);

    // Callers treat -1 as EOF, so report a failed conversion as an error
    // and do not go on to modify the incomplete record.
    if (ret < 0)
        return -2;

    // Identify Illumina CASAVA strings.
    // <read>:<is_filtered>:<control_bits>:<barcode_sequence>
    char *barcode = NULL;
    int barcode_len = 0;
    kstring_t *kc = &x->comment;
    char *endptr;
    if (x->casava &&
        // \d:[YN]:\d+:[ACGTN]+
        // Both separators are compared directly; OR-ing the two characters
        // together would also match pairs such as '2' and '8'.
        kc->l > 6 && kc->s[1] == ':' && kc->s[3] == ':' &&
        isdigit_c(kc->s[0]) &&
        strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4
        && *endptr == ':') {

        // read num
        switch(kc->s[0]) {
        case '1': b->core.flag |= BAM_FREAD1 | pflag; break;
        case '2': b->core.flag |= BAM_FREAD2 | pflag; break;
        default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break;
        }

        if (kc->s[2] == 'Y')
            b->core.flag |= BAM_FQCFAIL;

        // Barcode, maybe numeric in which case we skip it
        if (!isdigit_c(endptr[1])) {
            barcode = endptr+1;
            for (i = barcode - kc->s; i < kc->l; i++)
                if (isspace_c(kc->s[i]))
                    break;

            kc->s[i] = 0;
            // Length includes the terminating NUL
            barcode_len = i+1-(barcode - kc->s);
        }
    }

    if (barcode_len)
        if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0)
            ret = -2;

    if (!x->aux)
        return ret;

    // Identify any SAM style aux tags in comments too.
    if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0)
        ret = -2;

    return ret;
}
3990
3991
// Internal component of sam_read1 below
3992
357
static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
3993
357
    int ret = bam_read1(fp->fp.bgzf, b);
3994
357
    if (h && ret >= 0) {
3995
343
        if (b->core.tid  >= h->n_targets || b->core.tid  < -1 ||
3996
343
            b->core.mtid >= h->n_targets || b->core.mtid < -1) {
3997
3
            errno = ERANGE;
3998
3
            return -3;
3999
3
        }
4000
343
    }
4001
354
    return ret;
4002
357
}
4003
4004
// Internal component of sam_read1 below
4005
245
static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) {
4006
245
    int ret = cram_get_bam_seq(fp->fp.cram, b);
4007
245
    if (ret < 0)
4008
245
        return cram_eof(fp->fp.cram) ? -1 : -2;
4009
4010
0
    if (bam_tag2cigar(*b, 1, 1) < 0)
4011
0
        return -2;
4012
4013
0
    return ret;
4014
0
}
4015
4016
// Internal component of sam_read1 below
4017
6.12k
static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) {
4018
6.12k
    int ret;
4019
4020
    // Consume 1st line after header parsing as it wasn't using peek
4021
6.12k
    if (fp->line.l != 0) {
4022
0
        ret = sam_parse1(&fp->line, h, b);
4023
0
        fp->line.l = 0;
4024
0
        return ret;
4025
0
    }
4026
4027
6.12k
    if (fp->state) {
4028
0
        SAM_state *fd = (SAM_state *)fp->state;
4029
4030
0
        if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) {
4031
            // We don't support multi-threaded SAM parsing with seeks yet.
4032
0
            int ret;
4033
0
            if ((ret = sam_state_destroy(fp)) < 0) {
4034
0
                errno = -ret;
4035
0
                return -2;
4036
0
            }
4037
0
            if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0)
4038
0
                return -1;
4039
0
            fp->fp.bgzf->seeked = 0;
4040
0
            goto err_recover;
4041
0
        }
4042
4043
0
        if (!fd->h) {
4044
0
            fd->h = h;
4045
0
            fd->h->ref_count++;
4046
            // Ensure hrecs is initialised now as we don't want multiple
4047
            // threads trying to do this simultaneously.
4048
0
            if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0)
4049
0
                return -2;
4050
4051
            // We can only do this once we've got a header
4052
0
            if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read,
4053
0
                               fp) != 0)
4054
0
                return -2;
4055
0
            fd->dispatcher_set = 1;
4056
0
        }
4057
4058
0
        if (fd->h != h) {
4059
0
            hts_log_error("SAM multi-threaded decoding does not support changing header");
4060
0
            return -1;
4061
0
        }
4062
4063
0
        sp_bams *gb = fd->curr_bam;
4064
0
        if (!gb) {
4065
0
            if (fd->errcode) {
4066
                // In case reader failed
4067
0
                errno = fd->errcode;
4068
0
                return -2;
4069
0
            }
4070
0
            hts_tpool_result *r = hts_tpool_next_result_wait(fd->q);
4071
0
            if (!r)
4072
0
                return -2;
4073
0
            fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r);
4074
0
            hts_tpool_delete_result(r, 0);
4075
0
        }
4076
0
        if (!gb)
4077
0
            return fd->errcode ? -2 : -1;
4078
0
        bam1_t *b_array = (bam1_t *)gb->bams;
4079
0
        if (fd->curr_idx < gb->nbams)
4080
0
            if (!bam_copy1(b, &b_array[fd->curr_idx++]))
4081
0
                return -2;
4082
0
        if (fd->curr_idx == gb->nbams) {
4083
0
            pthread_mutex_lock(&fd->lines_m);
4084
0
            gb->next = fd->bams;
4085
0
            fd->bams = gb;
4086
0
            pthread_mutex_unlock(&fd->lines_m);
4087
4088
0
            fd->curr_bam = NULL;
4089
0
            fd->curr_idx = 0;
4090
0
        }
4091
4092
0
        ret = 0;
4093
4094
6.12k
    } else  {
4095
6.12k
    err_recover:
4096
6.12k
        ret = hts_getline(fp, KS_SEP_LINE, &fp->line);
4097
6.12k
        if (ret < 0) return ret;
4098
4099
5.87k
        ret = sam_parse1(&fp->line, h, b);
4100
5.87k
        fp->line.l = 0;
4101
5.87k
        if (ret < 0) {
4102
233
            hts_log_warning("Parse error at line %lld", (long long)fp->lineno);
4103
233
            if (h && h->ignore_sam_err) goto err_recover;
4104
233
        }
4105
5.87k
    }
4106
4107
5.87k
    return ret;
4108
6.12k
}
4109
4110
// Returns 0 on success,
4111
//        -1 on EOF,
4112
//       <-1 on error
4113
int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b)
4114
18.6k
{
4115
18.6k
    int ret, pass_filter;
4116
4117
18.6k
    do {
4118
18.6k
        switch (fp->format.format) {
4119
357
        case bam:
4120
357
            ret = sam_read1_bam(fp, h, b);
4121
357
            break;
4122
4123
245
        case cram:
4124
245
            ret = sam_read1_cram(fp, h, &b);
4125
245
            break;
4126
4127
6.12k
        case sam:
4128
6.12k
            ret = sam_read1_sam(fp, h, b);
4129
6.12k
            break;
4130
4131
11.6k
        case fasta_format:
4132
11.9k
        case fastq_format: {
4133
11.9k
            fastq_state *x = (fastq_state *)fp->state;
4134
11.9k
            if (!x) {
4135
121
                if (!(fp->state = fastq_state_init(fp->format.format
4136
121
                                                   == fastq_format ? '@' : '>')))
4137
0
                    return -2;
4138
121
            }
4139
4140
11.9k
            return fastq_parse1(fp, b);
4141
11.9k
        }
4142
4143
0
        case empty_format:
4144
0
            errno = EPIPE;
4145
0
            return -3;
4146
4147
0
        default:
4148
0
            errno = EFTYPE;
4149
0
            return -3;
4150
18.6k
        }
4151
4152
6.72k
        pass_filter = (ret >= 0 && fp->filter)
4153
6.72k
            ? sam_passes_filter(h, b, fp->filter)
4154
6.72k
            : 1;
4155
6.72k
    } while (pass_filter == 0);
4156
4157
6.72k
    return pass_filter < 0 ? -2 : ret;
4158
18.6k
}
4159
4160
4161
// Append the SAM text form of record b to str (no trailing newline).
// Target names are looked up in h->target_name.
//
// Returns the resulting length of str on success,
//         -1 on failure.  errno is set to EINVAL for corrupted aux data
//         and ENOMEM when the output could not be grown; a record with
//         l_qname == 0 is rejected without touching errno.
static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    const bam1_core_t *c = &b->core;
    uint8_t *aux, *aux_end;
    int i, r = 0;

    if (c->l_qname == 0)
        return -1;

    // QNAME, without its NUL terminator or any extra NUL padding
    r |= kputsn_(bam_get_qname(b), c->l_qname-1-c->l_extranul, str);
    r |= kputc_('\t', str);

    // FLAG
    r |= kputw(c->flag, str);
    r |= kputc_('\t', str);

    // RNAME
    if (c->tid < 0) {
        r |= kputsn_("*\t", 2, str);
    } else {
        r |= kputs(h->target_name[c->tid] , str);
        r |= kputc_('\t', str);
    }

    // POS (stored 0-based, printed 1-based)
    r |= kputll(c->pos + 1, str);
    r |= kputc_('\t', str);

    // MAPQ
    r |= kputw(c->qual, str);
    r |= kputc_('\t', str);

    // CIGAR
    if (c->n_cigar == 0) {
        r |= kputc_('*', str);
    } else {
        uint32_t *cigar = bam_get_cigar(b);
        for (i = 0; i < c->n_cigar; ++i) {
            r |= kputw(bam_cigar_oplen(cigar[i]), str);
            r |= kputc_(bam_cigar_opchr(cigar[i]), str);
        }
    }
    r |= kputc_('\t', str);

    // RNEXT: "*" if unset, "=" if same as RNAME, otherwise the name
    if (c->mtid < 0) {
        r |= kputsn_("*\t", 2, str);
    } else if (c->mtid == c->tid) {
        r |= kputsn_("=\t", 2, str);
    } else {
        r |= kputs(h->target_name[c->mtid], str);
        r |= kputc_('\t', str);
    }

    // PNEXT
    r |= kputll(c->mpos + 1, str);
    r |= kputc_('\t', str);

    // TLEN
    r |= kputll(c->isize, str);
    r |= kputc_('\t', str);

    // SEQ and QUAL
    if (c->l_qseq == 0) {
        r |= kputsn_("*\t*", 3, str);
    } else {
        uint8_t *seq = bam_get_seq(b);
        uint8_t *qual;
        char *out;

        // Room for seq + tab + qual + NUL, written directly into the buffer
        if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err;
        out = str->s + str->l;

        nibble2base(seq, out, c->l_qseq);
        out[c->l_qseq] = '\t';
        out += c->l_qseq+1;

        qual = bam_get_qual(b);
        if (qual[0] == 0xff) {
            // Qualities absent
            *out++ = '*';
        } else {
            // local copy of c->l_qseq to aid unrolling
            uint32_t lqseq = c->l_qseq;
            uint32_t k;
            for (k = 0; k < lqseq; ++k)
                out[k] = qual[k]+33;
            out += lqseq;
        }
        *out = 0;
        str->l = out - str->s;
    }

    // Aux fields, each preceded by a tab
    aux = bam_get_aux(b);
    aux_end = b->data + b->l_data;
    while (aux_end - aux >= 4) {
        r |= kputc_('\t', str);
        aux = (uint8_t *)sam_format_aux1(aux, aux[2], aux+3, aux_end, str);
        if (aux == NULL)
            goto bad_aux;
    }

    r |= kputsn("", 0, str); // nul terminate
    if (r < 0) goto mem_err;

    return str->l;

 bad_aux:
    hts_log_error("Corrupted aux data for read %.*s",
                  b->core.l_qname, bam_get_qname(b));
    errno = EINVAL;
    return -1;

 mem_err:
    hts_log_error("Out of memory");
    errno = ENOMEM;
    return -1;
}
4244
4245
// Format record b as SAM text into str, replacing whatever str held before.
// The return value is that of sam_format1_append().
int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str)
{
    // Empty the buffer first so the append starts at offset 0
    str->l = 0;

    return sam_format1_append(h, b, str);
}
4250
4251
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end);
4252
int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str)
4253
0
{
4254
0
    unsigned flag = b->core.flag;
4255
0
    int i, e = 0, len = b->core.l_qseq;
4256
0
    uint8_t *seq, *qual;
4257
4258
0
    str->l = 0;
4259
4260
0
    if (len == 0) return 0;
4261
4262
    // Name
4263
0
    if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF)
4264
0
        return -1;
4265
4266
    // /1 or /2 suffix
4267
0
    if (x && x->rnum && (flag & BAM_FPAIRED)) {
4268
0
        int r12 = flag & (BAM_FREAD1 | BAM_FREAD2);
4269
0
        if (r12 == BAM_FREAD1) {
4270
0
            if (kputs("/1", str) == EOF)
4271
0
                return -1;
4272
0
        } else if (r12 == BAM_FREAD2) {
4273
0
            if (kputs("/2", str) == EOF)
4274
0
                return -1;
4275
0
        }
4276
0
    }
4277
4278
    // Illumina CASAVA tag.
4279
    // This is <rnum>:<Y/N qcfail>:<control-bits>:<barcode-or-zero>
4280
0
    if (x && x->casava) {
4281
0
        int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0;
4282
0
        char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N';
4283
0
        uint8_t *bc = bam_aux_get(b, x->BC);
4284
0
        if (ksprintf(str, " %d:%c:0:%s", rnum, filtered,
4285
0
                     bc ? (char *)bc+1 : "0") < 0)
4286
0
            return -1;
4287
4288
0
        if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) {
4289
0
            hts_log_warning("BC tag starts with non-sequence base; using '0'");
4290
0
            str->l -= strlen((char *)bc)-2; // limit to 1 char
4291
0
            str->s[str->l-1] = '0';
4292
0
            str->s[str->l] = 0;
4293
0
            bc = NULL;
4294
0
        }
4295
4296
        // Replace any non-alpha with '+'.  Ie seq-seq to seq+seq
4297
0
        if (bc) {
4298
0
            int l = strlen((char *)bc+1);
4299
0
            char *c = (char *)str->s + str->l - l;
4300
0
            for (i = 0; i < l; i++) {
4301
0
                if (!isalpha_c(c[i]))
4302
0
                    c[i] = '+';
4303
0
                else if (islower_c(c[i]))
4304
0
                    c[i] = toupper_c(c[i]);
4305
0
            }
4306
0
        }
4307
0
    }
4308
4309
    // Aux tags
4310
0
    if (x && x->aux) {
4311
0
        uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data;
4312
0
        while (s && end - s >= 4) {
4313
0
            int tt = s[0]*256 + s[1];
4314
0
            if (x->tags == NULL ||
4315
0
                kh_get(tag, x->tags, tt) != kh_end(x->tags)) {
4316
0
                e |= kputc_('\t', str) < 0;
4317
0
                if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str)))
4318
0
                    return -1;
4319
0
            } else {
4320
0
                s = skip_aux(s+2, end);
4321
0
            }
4322
0
        }
4323
0
        e |= kputsn("", 0, str) < 0; // nul terminate
4324
0
    }
4325
4326
0
    if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1;
4327
0
    e |= kputc_('\n', str) < 0;
4328
4329
    // Seq line
4330
0
    seq = bam_get_seq(b);
4331
0
    if (flag & BAM_FREVERSE)
4332
0
        for (i = len-1; i >= 0; i--)
4333
0
            e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0;
4334
0
    else
4335
0
        for (i = 0; i < len; i++)
4336
0
            e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0;
4337
4338
4339
    // Qual line
4340
0
    if (x->nprefix == '@') {
4341
0
        kputsn("\n+\n", 3, str);
4342
0
        qual = bam_get_qual(b);
4343
0
        if (qual[0] == 0xff)
4344
0
            for (i = 0; i < len; i++)
4345
0
                e |= kputc_('B', str) < 0;
4346
0
        else if (flag & BAM_FREVERSE)
4347
0
            for (i = len-1; i >= 0; i--)
4348
0
                e |= kputc_(33 + qual[i], str) < 0;
4349
0
        else
4350
0
            for (i = 0; i < len; i++)
4351
0
                e |= kputc_(33 + qual[i], str) < 0;
4352
4353
0
    }
4354
0
    e |= kputc('\n', str) < 0;
4355
4356
0
    return e ? -1 : str->l;
4357
0
}
4358
4359
// Sadly we need to be able to modify the bam_hdr here so we can
4360
// reference count the structure.
4361
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b)
4362
17.7k
{
4363
17.7k
    switch (fp->format.format) {
4364
0
    case binary_format:
4365
0
        fp->format.category = sequence_data;
4366
0
        fp->format.format = bam;
4367
        /* fall-through */
4368
0
    case bam:
4369
0
        return bam_write_idx1(fp, h, b);
4370
4371
0
    case cram:
4372
0
        return cram_put_bam_seq(fp->fp.cram, (bam1_t *)b);
4373
4374
0
    case text_format:
4375
0
        fp->format.category = sequence_data;
4376
0
        fp->format.format = sam;
4377
        /* fall-through */
4378
17.7k
    case sam:
4379
17.7k
        if (fp->state) {
4380
0
            SAM_state *fd = (SAM_state *)fp->state;
4381
4382
            // Threaded output
4383
0
            if (!fd->h) {
4384
                // NB: discard const.  We don't actually modify sam_hdr_t here,
4385
                // just data pointed to by it (which is a bit weasely still),
4386
                // but out cached pointer must be non-const as we want to
4387
                // destroy it later on and sam_hdr_destroy takes non-const.
4388
                //
4389
                // We do this because some tools do sam_hdr_destroy; sam_close
4390
                // while others do sam_close; sam_hdr_destroy.  The former is
4391
                // an issue as we need the header still when flushing.
4392
0
                fd->h = (sam_hdr_t *)h;
4393
0
                fd->h->ref_count++;
4394
4395
0
                if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write,
4396
0
                                   fp) != 0)
4397
0
                    return -2;
4398
0
                fd->dispatcher_set = 1;
4399
0
            }
4400
4401
0
            if (fd->h != h) {
4402
0
                hts_log_error("SAM multi-threaded decoding does not support changing header");
4403
0
                return -2;
4404
0
            }
4405
4406
            // Find a suitable BAM array to copy to
4407
0
            sp_bams *gb = fd->curr_bam;
4408
0
            if (!gb) {
4409
0
                pthread_mutex_lock(&fd->lines_m);
4410
0
                if (fd->bams) {
4411
0
                    fd->curr_bam = gb = fd->bams;
4412
0
                    fd->bams = gb->next;
4413
0
                    gb->next = NULL;
4414
0
                    gb->nbams = 0;
4415
0
                    gb->bam_mem = 0;
4416
0
                    pthread_mutex_unlock(&fd->lines_m);
4417
0
                } else {
4418
0
                    pthread_mutex_unlock(&fd->lines_m);
4419
0
                    if (!(gb = calloc(1, sizeof(*gb)))) return -1;
4420
0
                    if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) {
4421
0
                        free(gb);
4422
0
                        return -1;
4423
0
                    }
4424
0
                    gb->nbams = 0;
4425
0
                    gb->abams = SAM_NBAM;
4426
0
                    gb->bam_mem = 0;
4427
0
                    gb->fd = fd;
4428
0
                    fd->curr_idx = 0;
4429
0
                    fd->curr_bam = gb;
4430
0
                }
4431
0
            }
4432
4433
0
            if (!bam_copy1(&gb->bams[gb->nbams++], b))
4434
0
                return -2;
4435
0
            gb->bam_mem += b->l_data + sizeof(*b);
4436
4437
            // Dispatch if full
4438
0
            if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) {
4439
0
                gb->serial = fd->serial++;
4440
0
                pthread_mutex_lock(&fd->command_m);
4441
0
                if (fd->errcode != 0) {
4442
0
                    pthread_mutex_unlock(&fd->command_m);
4443
0
                    return -fd->errcode;
4444
0
                }
4445
0
                if (hts_tpool_dispatch3(fd->p, fd->q, sam_format_worker, gb,
4446
0
                                        cleanup_sp_bams,
4447
0
                                        cleanup_sp_lines, 0) < 0) {
4448
0
                    pthread_mutex_unlock(&fd->command_m);
4449
0
                    return -1;
4450
0
                }
4451
0
                pthread_mutex_unlock(&fd->command_m);
4452
0
                fd->curr_bam = NULL;
4453
0
            }
4454
4455
            // Dummy value as we don't know how long it really is.
4456
            // We could track file sizes via a SAM_state field, but I don't think
4457
            // it is necessary.
4458
0
            return 1;
4459
17.7k
        } else {
4460
17.7k
            if (sam_format1(h, b, &fp->line) < 0) return -1;
4461
17.7k
            kputc('\n', &fp->line);
4462
17.7k
            if (fp->is_bgzf) {
4463
0
                if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4464
0
                    return -1;
4465
0
                if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4466
17.7k
            } else {
4467
17.7k
                if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1;
4468
17.7k
            }
4469
4470
17.7k
            if (fp->idx) {
4471
0
                if (fp->format.compression == bgzf) {
4472
0
                    if (bgzf_idx_push(fp->fp.bgzf, fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4473
0
                                      bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4474
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4475
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4476
0
                        return -1;
4477
0
                    }
4478
0
                } else {
4479
0
                    if (hts_idx_push(fp->idx, b->core.tid, b->core.pos, bam_endpos(b),
4480
0
                                     bgzf_tell(fp->fp.bgzf), !(b->core.flag&BAM_FUNMAP)) < 0) {
4481
0
                        hts_log_error("Read '%s' with ref_name='%s', ref_length=%"PRIhts_pos", flags=%d, pos=%"PRIhts_pos" cannot be indexed",
4482
0
                                bam_get_qname(b), sam_hdr_tid2name(h, b->core.tid), sam_hdr_tid2len(h, b->core.tid), b->core.flag, b->core.pos+1);
4483
0
                        return -1;
4484
0
                    }
4485
0
                }
4486
0
            }
4487
4488
17.7k
            return fp->line.l;
4489
17.7k
        }
4490
4491
4492
0
    case fasta_format:
4493
0
    case fastq_format: {
4494
0
        fastq_state *x = (fastq_state *)fp->state;
4495
0
        if (!x) {
4496
0
            if (!(fp->state = fastq_state_init(fp->format.format
4497
0
                                               == fastq_format ? '@' : '>')))
4498
0
                return -2;
4499
0
        }
4500
4501
0
        if (fastq_format1(fp->state, b, &fp->line) < 0)
4502
0
            return -1;
4503
0
        if (fp->is_bgzf) {
4504
0
            if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0)
4505
0
                return -1;
4506
0
            if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l)
4507
0
                return -1;
4508
0
        } else {
4509
0
            if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l)
4510
0
                return -1;
4511
0
        }
4512
0
        return fp->line.l;
4513
0
    }
4514
4515
0
    default:
4516
0
        errno = EBADF;
4517
0
        return -1;
4518
17.7k
    }
4519
17.7k
}
4520
4521
/************************
4522
 *** Auxiliary fields ***
4523
 ************************/
4524
#ifndef HTS_LITTLE_ENDIAN
// Copy len bytes of aux value data of the given type from in to out,
// storing every multi-byte element through the uNN_to_le() helpers.
// 'B' arrays are handled by converting the 5-byte header (sub-type and
// element count) and then recursing on the payload with the sub-type.
//
// Returns 0 on success,
//        -1 if the type is unknown, a 'B' value is shorter than its
//           header, or len is not a multiple of the element size.
static int aux_to_le(char type, uint8_t *out, const uint8_t *in, size_t len) {
    int tsz = aux_type2size(type);
    size_t off;

    // Fixed-width data must consist of whole elements
    if (tsz >= 2 && tsz <= 8 && (len & (tsz - 1)) != 0) return -1;

    switch (tsz) {
        case 'H': case 'Z': case 1:  // Trivial
            memcpy(out, in, len);
            return 0;

        case 2:
            for (off = 0; off < len; off += sizeof(uint16_t)) {
                uint16_t v;
                memcpy(&v, in + off, sizeof(v));
                u16_to_le(v, out + off);
            }
            return 0;

        case 4:
            for (off = 0; off < len; off += sizeof(uint32_t)) {
                uint32_t v;
                memcpy(&v, in + off, sizeof(v));
                u32_to_le(v, out + off);
            }
            return 0;

        case 8:
            for (off = 0; off < len; off += sizeof(uint64_t)) {
                uint64_t v;
                memcpy(&v, in + off, sizeof(v));
                u64_to_le(v, out + off);
            }
            return 0;

        case 'B': { // Recurse!
            uint32_t n;
            if (len < 5) return -1;
            memcpy(&n, in + 1, 4);
            out[0] = in[0];
            u32_to_le(n, out + 1);
            return aux_to_le(in[0], out + 5, in + 5, len - 5);
        }

        default: // Unknown type code
            return -1;
    }
}
#endif
4568
4569
// Append a new aux field (two-character tag, one-byte type code and len
// bytes of value data) to the end of record b.  The record is not checked
// for an existing field with the same tag.
//
// Returns 0 on success,
//        -1 on failure with errno set: EINVAL for a negative len (or, on
//           big-endian builds, data that cannot be converted), ENOMEM if
//           the record would exceed INT32_MAX bytes.
int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data)
{
    uint64_t new_len;

    assert(b->l_data >= 0);

    // A negative length could otherwise slip past the size check below
    // and reach memcpy() converted to an enormous size_t.
    if (len < 0) {
        errno = EINVAL;
        return -1;
    }

    // 64-bit arithmetic so the sum cannot overflow before it is checked
    new_len = (uint64_t)b->l_data + 3 + (uint64_t)len;
    if (new_len > INT32_MAX) goto nomem;

    if (realloc_bam_data(b, new_len) < 0) return -1;

    b->data[b->l_data] = tag[0];
    b->data[b->l_data + 1] = tag[1];
    b->data[b->l_data + 2] = type;

#ifdef HTS_LITTLE_ENDIAN
    // Nothing to copy for an empty value (data may then be NULL)
    if (len > 0)
        memcpy(b->data + b->l_data + 3, data, len);
#else
    if (aux_to_le(type, b->data + b->l_data + 3, data, len) != 0) {
        errno = EINVAL;
        return -1;
    }
#endif

    b->l_data = (int)new_len;

    return 0;

 nomem:
    errno = ENOMEM;
    return -1;
}
4600
4601
// Step over one aux value.  s points at the value's type byte and end is
// one past the last byte of the record's data.
//
// Returns a pointer to the byte following the value;
//         end if s is already at/after end, or if a 'Z'/'H' string runs
//         to end without a NUL terminator;
//         NULL if the type is unknown or the value does not fit in the
//         remaining data.
static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
{
    int size;
    uint32_t n;
    uint64_t nbytes;
    if (s >= end) return end;
    size = aux_type2size(*s); ++s; // skip type
    switch (size) {
    case 'Z':
    case 'H':
        while (s < end && *s) ++s;
        return s < end ? s + 1 : end;
    case 'B':
        // Sub-type byte plus 32-bit element count
        if (end - s < 5) return NULL;
        size = aux_type2size(*s); ++s;
        n = le_to_u32(s);
        s += 4;
        if (size == 0) return NULL;
        // Multiply in 64 bits: size * n in 32-bit unsigned arithmetic
        // wraps for large counts, letting an oversized array through.
        nbytes = (uint64_t)size * n;
        if ((uint64_t)(end - s) < nbytes) return NULL;
        return s + (size_t)nbytes;
    case 0:
        return NULL;
    default:
        if (end - s < size) return NULL;
        return s + size;
    }
}
4626
4627
uint8_t *bam_aux_first(const bam1_t *b)
4628
970
{
4629
970
    uint8_t *s = bam_get_aux(b);
4630
970
    uint8_t *end = b->data + b->l_data;
4631
970
    if (s >= end) { errno = ENOENT; return NULL; }
4632
781
    return s+2;
4633
970
}
4634
4635
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4636
2.08M
{
4637
2.08M
    uint8_t *end = b->data + b->l_data;
4638
2.08M
    uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4639
2.08M
    if (next == NULL) goto bad_aux;
4640
2.08M
    if (next >= end) { errno = ENOENT; return NULL; }
4641
2.08M
    return next+2;
4642
4643
0
 bad_aux:
4644
0
    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4645
0
    errno = EINVAL;
4646
0
    return NULL;
4647
2.08M
}
4648
4649
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
4650
970
{
4651
970
    uint8_t *s;
4652
2.08M
    for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4653
2.08M
        if (s[-2] == tag[0] && s[-1] == tag[1]) {
4654
            // Check the tag value is valid and complete
4655
618
            uint8_t *e = skip_aux(s, b->data + b->l_data);
4656
618
            if (e == NULL) goto bad_aux;
4657
618
            if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4658
4659
618
            return s;
4660
618
        }
4661
4662
    // errno now as set by bam_aux_first()/bam_aux_next()
4663
352
    return NULL;
4664
4665
0
 bad_aux:
4666
0
    hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4667
0
    errno = EINVAL;
4668
0
    return NULL;
4669
970
}
4670
4671
// Delete the aux field whose type byte is at s from record b.
//
// Returns 0 on success (including when the removed field was the last
//         one, which bam_aux_remove() reports as ENOENT),
//        -1 on failure.
int bam_aux_del(bam1_t *b, uint8_t *s)
{
    if (bam_aux_remove(b, s) != NULL)
        return 0;

    return errno == ENOENT ? 0 : -1;
}
4676
4677
// Remove the aux field whose type byte is at s from record b, closing the
// gap by moving any following fields down.
//
// Returns s, which now addresses the type byte of the field that followed
//         the removed one; NULL with errno set to ENOENT if the removed
//         field was the last one; NULL with errno set to EINVAL if the
//         field is corrupted (the record is then left unchanged).
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
{
    uint8_t *field = s - 2;                 // start of the tag name
    uint8_t *end = b->data + b->l_data;
    uint8_t *next = skip_aux(s, end);

    if (next == NULL) {
        hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
        errno = EINVAL;
        return NULL;
    }

    b->l_data -= next - field;

    // Nothing follows, so there is no data to shift
    if (next >= end) {
        errno = ENOENT;
        return NULL;
    }

    memmove(field, next, end - next);
    return s;
}
4694
4695
// Set the 'Z' (string) aux field named tag in record b to data, replacing
// an existing value in place or appending a new field.  len is the number
// of bytes to take from data; a negative len means "use strlen(data) plus
// the terminator".  A NUL terminator is added if the copied bytes do not
// already end with one.
//
// Returns 0 on success,
//        -1 on failure (errno is EINVAL if the existing field is not of
//           type 'Z').
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)
{
    // FIXME: This is not at all efficient!
    size_t new_len = len >= 0 ? len : strlen(data) + 1;
    size_t old_len = 0;
    int add_nul = new_len == 0 || data[new_len - 1] != '\0';
    int saved_errno = errno;
    int hdr = 0;            // 3 when a tag+type header has to be added
    uint8_t *field;         // start of the field (its tag name)
    uint8_t *found = bam_aux_get(b,tag);

    if (found != NULL) {
        // Replacing existing tag
        char type = *found;
        uint8_t *val, *nul;
        if (type != 'Z') {
            hts_log_error("Called bam_aux_update_str for type '%c' instead of 'Z'", type);
            errno = EINVAL;
            return -1;
        }
        val = found + 1;
        nul = memchr(val, '\0', b->data + b->l_data - val);
        // Old value length including its terminator
        old_len = (nul ? nul - val : b->data + b->l_data - val) + 1;
        field = found - 2;
    } else if (errno == ENOENT) {
        // Tag doesn't exist - put it on the end
        errno = saved_errno;
        field = b->data + b->l_data;
        hdr = 3;
    } else {
        // Invalid aux data, give up
        return -1;
    }

    if (old_len < new_len + add_nul + hdr) {
        // Growing may move b->data, so re-derive the field pointer
        ptrdiff_t field_offset = field - b->data;
        if (possibly_expand_bam_data(b, new_len + add_nul + hdr - old_len) < 0)
            return -1;
        field = b->data + field_offset;
    }

    if (!hdr) {
        // Shift whatever follows the old value to follow the new one
        memmove(field + 3 + new_len + add_nul,
                field + 3 + old_len,
                b->l_data - (field + 3 - b->data) - old_len);
    }
    b->l_data += hdr + new_len + add_nul - old_len;

    field[0] = tag[0];
    field[1] = tag[1];
    field[2] = 'Z';
    memmove(field+3,data,new_len);
    if (add_nul) field[3 + new_len] = '\0';
    return 0;
}
4746
4747
// Set the integer aux field named tag in record b to val, replacing an
// existing integer value or appending a new field.  A new or enlarged
// field uses the smallest integer type that holds val; if the existing
// field is already big enough its size is kept and only the signedness of
// the type code is adjusted.
//
// Returns 0 on success,
//        -1 on failure with errno set: EOVERFLOW if val is outside
//           INT32_MIN..UINT32_MAX, EINVAL if the existing field is not an
//           integer (or the aux data is corrupted).
int bam_aux_update_int(bam1_t *b, const char tag[2], int64_t val)
{
    uint32_t sz, old_sz = 0, new = 0;
    uint8_t *s, type;

    if (val < INT32_MIN || val > UINT32_MAX) {
        errno = EOVERFLOW;
        return -1;
    }
    // Pick the smallest type able to hold val.  The unsigned limits are
    // inclusive: 255 fits in 'C' and 65535 fits in 'S'.
    if (val < INT16_MIN)        { type = 'i'; sz = 4; }
    else if (val < INT8_MIN)    { type = 's'; sz = 2; }
    else if (val < 0)           { type = 'c'; sz = 1; }
    else if (val <= UINT8_MAX)  { type = 'C'; sz = 1; }
    else if (val <= UINT16_MAX) { type = 'S'; sz = 2; }
    else                        { type = 'I'; sz = 4; }

    s = bam_aux_get(b, tag);
    if (s) {  // Tag present - how big was the old one?
        switch (*s) {
            case 'c': case 'C': old_sz = 1; break;
            case 's': case 'S': old_sz = 2; break;
            case 'i': case 'I': old_sz = 4; break;
            default: errno = EINVAL; return -1;  // Not an integer
        }
    } else {
        if (errno == ENOENT) {  // Tag doesn't exist - add a new one
            s = b->data + b->l_data;
            new = 1;
        }  else { // Invalid aux data, give up.
            return -1;
        }
    }

    if (new || old_sz < sz) {
        // Make room for new tag.  Growing may move b->data, so the
        // position is carried across as an offset.
        ptrdiff_t s_offset = s - b->data;
        if (possibly_expand_bam_data(b, (new ? 3 : 0) + sz - old_sz) < 0)
            return -1;
        s =  b->data + s_offset;
        if (new) { // Add tag id
            *s++ = tag[0];
            *s++ = tag[1];
        } else {   // Shift following data so we have space
            memmove(s + sz, s + old_sz, b->l_data - s_offset - old_sz);
        }
    } else {
        // Reuse old space.  Data value may be bigger than necessary but
        // we avoid having to move everything else
        sz = old_sz;
        // Index by size (1, 2 or 4) to get the signed/unsigned type code
        type = (val < 0 ? "\0cs\0i" : "\0CS\0I")[old_sz];
        assert(type > 0);
    }
    *s++ = type;
#ifdef HTS_LITTLE_ENDIAN
    memcpy(s, &val, sz);
#else
    switch (sz) {
        case 4:  u32_to_le(val, s); break;
        case 2:  u16_to_le(val, s); break;
        default: *s = val; break;
    }
#endif
    b->l_data += (new ? 3 : 0) + sz - old_sz;
    return 0;
}
4812
4813
// Set the aux field named tag in record b to the float val, replacing an
// existing 'f' value, shrinking an existing 'd' (double) value to 'f', or
// appending a new field.
//
// Returns 0 on success,
//        -1 on failure (errno is EINVAL if the existing field is neither
//           'f' nor 'd').
int bam_aux_update_float(bam1_t *b, const char tag[2], float val)
{
    uint8_t *s = bam_aux_get(b, tag);

    if (s == NULL) {
        // Anything but "not found" means the aux data is invalid
        if (errno != ENOENT)
            return -1;

        // Tag doesn't exist - add a new one: 2 tag + 1 type + 4 value bytes
        if (possibly_expand_bam_data(b, 3 + 4) < 0)
            return -1;
        s = b->data + b->l_data;
        s[0] = tag[0];
        s[1] = tag[1];
        s[2] = 'f';
        float_to_le(val, s + 3);
        b->l_data += 7;
        return 0;
    }

    // Tag present - what was it?
    if (*s == 'd') {
        // Convert non-standard double tag to float: close up the four
        // bytes that the shorter value no longer needs
        memmove(s + 5, s + 9, b->l_data - ((s + 9) - b->data));
        b->l_data -= 4;
    } else if (*s != 'f') {
        errno = EINVAL;     // Not a float
        return -1;
    }

    s[0] = 'f';
    float_to_le(val, s + 1);

    return 0;
}
4848
4849
// Set the 'B' (array) aux field named tag in record b to `items` elements
// of the given element type read from data, replacing an existing array
// or appending a new field.
//
// Returns 0 on success,
//        -1 on failure with errno set: EINVAL if the existing field is
//           not a 'B' array or an element type does not have a size of
//           1 to 4 bytes, ENOMEM if the array would be too large.
int bam_aux_update_array(bam1_t *b, const char tag[2],
                         uint8_t type, uint32_t items, void *data)
{
    uint8_t *s = bam_aux_get(b, tag);
    size_t old_bytes = 0, new_bytes;
    int is_new = 0;

    if (s == NULL) {
        // Anything but "not found" means the aux data is invalid
        if (errno != ENOENT)
            return -1;
        // Tag doesn't exist - add a new one
        s = b->data + b->l_data;
        is_new = 1;
    } else {
        // Tag present: work out the payload size of the old array
        if (*s != 'B') { errno = EINVAL; return -1; }
        old_bytes = aux_type2size(s[1]);
        if (old_bytes < 1 || old_bytes > 4) { errno = EINVAL; return -1; }
        old_bytes *= le_to_u32(s + 2);
    }

    new_bytes = aux_type2size(type);
    if (new_bytes < 1 || new_bytes > 4) { errno = EINVAL; return -1; }
    if (items > INT32_MAX / new_bytes) { errno = ENOMEM; return -1; }
    new_bytes *= items;

    if (is_new || old_bytes < new_bytes) {
        // Make room for new tag.  Growing may move b->data, so the
        // position is carried across as an offset.
        ptrdiff_t s_offset = s - b->data;
        if (possibly_expand_bam_data(b, (is_new ? 8 : 0) + new_bytes - old_bytes) < 0)
            return -1;
        s =  b->data + s_offset;
    }

    if (is_new) {
        // Add tag id and type; s is left on the 'B' type byte
        s[0] = tag[0];
        s[1] = tag[1];
        s += 2;
        *s = 'B';
        b->l_data += 8 + new_bytes;
    } else if (old_bytes != new_bytes) {
        // shift following data if necessary
        memmove(s + 6 + new_bytes, s + 6 + old_bytes,
                b->l_data - ((s + 6 + old_bytes) - b->data));
        b->l_data -= old_bytes;
        b->l_data += new_bytes;
    }

    // Array header: element type and count, then the payload
    s[1] = type;
    u32_to_le(items, s + 2);
#ifdef HTS_LITTLE_ENDIAN
    memcpy(s + 6, data, new_bytes);
    return 0;
#else
    return aux_to_le(type, s + 6, data, new_bytes);
#endif
}
4903
4904
// Decode element idx of integer data stored at s with the given aux type
// code.  Returns the value, or 0 with errno set to EINVAL if type is not
// one of the integer codes c, C, s, S, i, I.
static inline int64_t get_int_aux_val(uint8_t type, const uint8_t *s,
                                      uint32_t idx)
{
    if (type == 'c') return le_to_i8(s + idx);
    if (type == 'C') return s[idx];
    if (type == 's') return le_to_i16(s + 2 * idx);
    if (type == 'S') return le_to_u16(s + 2 * idx);
    if (type == 'i') return le_to_i32(s + 4 * idx);
    if (type == 'I') return le_to_u32(s + 4 * idx);

    errno = EINVAL;
    return 0;
}
4919
4920
// Read an integer aux value.  s points at the field's type byte.
// Returns the value, or 0 with errno set to EINVAL for a non-integer type.
int64_t bam_aux2i(const uint8_t *s)
{
    int type = s[0];

    return get_int_aux_val(type, s + 1, 0);
}
4926
4927
// Read a numeric aux value as a double.  s points at the field's type
// byte.  'd' and 'f' values are decoded as floating point; any other type
// is handed to the integer decoder (which sets errno to EINVAL and yields
// 0 for non-integer types).
double bam_aux2f(const uint8_t *s)
{
    int type = s[0];
    const uint8_t *value = s + 1;

    switch (type) {
    case 'd':
        return le_to_double(value);
    case 'f':
        return le_to_float(value);
    default:
        return get_int_aux_val(type, value, 0);
    }
}
4935
4936
// Read a single-character ('A') aux value.  s points at the field's type
// byte.  Returns the character, or 0 with errno set to EINVAL if the
// field is not of type 'A'.
char bam_aux2A(const uint8_t *s)
{
    if (s[0] != 'A') {
        errno = EINVAL;
        return 0;
    }

    return *(const char *)(s + 1);
}
4944
4945
// Get the text of a 'Z' or 'H' aux value.  s points at the field's type
// byte.  Returns a pointer to the first byte of the value (inside the
// record, not a copy), or NULL with errno set to EINVAL for any other
// type.
char *bam_aux2Z(const uint8_t *s)
{
    int type = s[0];

    if (type != 'Z' && type != 'H') {
        errno = EINVAL;
        return 0;
    }

    return (char *)(s + 1);
}
4953
4954
// Get the element count of a 'B' (array) aux value.  s points at the
// field's type byte.  Returns the count stored after the element-type
// byte, or 0 with errno set to EINVAL if the field is not an array.
uint32_t bam_auxB_len(const uint8_t *s)
{
    if (s[0] == 'B')
        return le_to_u32(s + 2);

    errno = EINVAL;
    return 0;
}
4962
4963
// Read element idx of an integer 'B' array aux value.  s points at the
// field's type byte.  Returns the element, or 0 with errno set to ERANGE
// if idx is not below the array length (which is 0 for a non-array
// field), or to EINVAL if the element type is not an integer type.
int64_t bam_auxB2i(const uint8_t *s, uint32_t idx)
{
    uint32_t n_items = bam_auxB_len(s);

    if (idx < n_items)
        return get_int_aux_val(s[1], s + 6, idx);

    errno = ERANGE;
    return 0;
}
4972
4973
// Read element idx of a 'B' array aux value as a double.  s points at the
// field's type byte.  'f' arrays are decoded as floats; other element
// types go through the integer decoder.  Returns 0.0 with errno set to
// ERANGE if idx is not below the array length (which is 0 for a non-array
// field).
double bam_auxB2f(const uint8_t *s, uint32_t idx)
{
    uint32_t n_items = bam_auxB_len(s);

    if (idx >= n_items) {
        errno = ERANGE;
        return 0.0;
    }

    if (s[1] != 'f')
        return get_int_aux_val(s[1], s + 6, idx);

    return le_to_float(s + 6 + 4 * idx);
}
4983
4984
int sam_open_mode(char *mode, const char *fn, const char *format)
4985
0
{
4986
    // TODO Parse "bam5" etc for compression level
4987
0
    if (format == NULL) {
4988
        // Try to pick a format based on the filename extension
4989
0
        char extension[HTS_MAX_EXT_LEN];
4990
0
        if (find_file_extension(fn, extension) < 0) return -1;
4991
0
        return sam_open_mode(mode, fn, extension);
4992
0
    }
4993
0
    else if (strcasecmp(format, "bam") == 0) strcpy(mode, "b");
4994
0
    else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c");
4995
0
    else if (strcasecmp(format, "sam") == 0) strcpy(mode, "");
4996
0
    else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z");
4997
0
    else if (strcasecmp(format, "fastq") == 0 ||
4998
0
             strcasecmp(format, "fq") == 0) strcpy(mode, "f");
4999
0
    else if (strcasecmp(format, "fastq.gz") == 0 ||
5000
0
             strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz");
5001
0
    else if (strcasecmp(format, "fasta") == 0 ||
5002
0
             strcasecmp(format, "fa") == 0) strcpy(mode, "F");
5003
0
    else if (strcasecmp(format, "fasta.gz") == 0 ||
5004
0
             strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz");
5005
0
    else return -1;
5006
5007
0
    return 0;
5008
0
}
5009
5010
// A version of sam_open_mode that can handle ,key=value options.
5011
// The format string is allocated and returned, to be freed by the caller.
5012
// Prefix should be "r" or "w",
5013
char *sam_open_mode_opts(const char *fn,
5014
                         const char *mode,
5015
                         const char *format)
5016
0
{
5017
0
    char *mode_opts = malloc((format ? strlen(format) : 1) +
5018
0
                             (mode   ? strlen(mode)   : 1) + 12);
5019
0
    char *opts, *cp;
5020
0
    int format_len;
5021
5022
0
    if (!mode_opts)
5023
0
        return NULL;
5024
5025
0
    strcpy(mode_opts, mode ? mode : "r");
5026
0
    cp = mode_opts + strlen(mode_opts);
5027
5028
0
    if (format == NULL) {
5029
        // Try to pick a format based on the filename extension
5030
0
        char extension[HTS_MAX_EXT_LEN];
5031
0
        if (find_file_extension(fn, extension) < 0) {
5032
0
            free(mode_opts);
5033
0
            return NULL;
5034
0
        }
5035
0
        if (sam_open_mode(cp, fn, extension) == 0) {
5036
0
            return mode_opts;
5037
0
        } else {
5038
0
            free(mode_opts);
5039
0
            return NULL;
5040
0
        }
5041
0
    }
5042
5043
0
    if ((opts = strchr(format, ','))) {
5044
0
        format_len = opts-format;
5045
0
    } else {
5046
0
        opts="";
5047
0
        format_len = strlen(format);
5048
0
    }
5049
5050
0
    if (strncmp(format, "bam", format_len) == 0) {
5051
0
        *cp++ = 'b';
5052
0
    } else if (strncmp(format, "cram", format_len) == 0) {
5053
0
        *cp++ = 'c';
5054
0
    } else if (strncmp(format, "cram2", format_len) == 0) {
5055
0
        *cp++ = 'c';
5056
0
        strcpy(cp, ",VERSION=2.1");
5057
0
        cp += 12;
5058
0
    } else if (strncmp(format, "cram3", format_len) == 0) {
5059
0
        *cp++ = 'c';
5060
0
        strcpy(cp, ",VERSION=3.0");
5061
0
        cp += 12;
5062
0
    } else if (strncmp(format, "sam", format_len) == 0) {
5063
0
        ; // format mode=""
5064
0
    } else if (strncmp(format, "sam.gz", format_len) == 0) {
5065
0
        *cp++ = 'z';
5066
0
    } else if (strncmp(format, "fastq", format_len) == 0 ||
5067
0
               strncmp(format, "fq", format_len) == 0) {
5068
0
        *cp++ = 'f';
5069
0
    } else if (strncmp(format, "fastq.gz", format_len) == 0 ||
5070
0
               strncmp(format, "fq.gz", format_len) == 0) {
5071
0
        *cp++ = 'f';
5072
0
        *cp++ = 'z';
5073
0
    } else if (strncmp(format, "fasta", format_len) == 0 ||
5074
0
               strncmp(format, "fa", format_len) == 0) {
5075
0
        *cp++ = 'F';
5076
0
    } else if (strncmp(format, "fasta.gz", format_len) == 0 ||
5077
0
               strncmp(format, "fa", format_len) == 0) {
5078
0
        *cp++ = 'F';
5079
0
        *cp++ = 'z';
5080
0
    } else {
5081
0
        free(mode_opts);
5082
0
        return NULL;
5083
0
    }
5084
5085
0
    strcpy(cp, opts);
5086
5087
0
    return mode_opts;
5088
0
}
5089
5090
0
#define STRNCMP(a,b,n) (strncasecmp((a),(b),(n)) || strlen(a)!=(n))
5091
int bam_str2flag(const char *str)
5092
0
{
5093
0
    char *end, *beg = (char*) str;
5094
0
    long int flag = strtol(str, &end, 0);
5095
0
    if ( end!=str ) return flag;    // the conversion was successful
5096
0
    flag = 0;
5097
0
    while ( *str )
5098
0
    {
5099
0
        end = beg;
5100
0
        while ( *end && *end!=',' ) end++;
5101
0
        if ( !STRNCMP("PAIRED",beg,end-beg) ) flag |= BAM_FPAIRED;
5102
0
        else if ( !STRNCMP("PROPER_PAIR",beg,end-beg) ) flag |= BAM_FPROPER_PAIR;
5103
0
        else if ( !STRNCMP("UNMAP",beg,end-beg) ) flag |= BAM_FUNMAP;
5104
0
        else if ( !STRNCMP("MUNMAP",beg,end-beg) ) flag |= BAM_FMUNMAP;
5105
0
        else if ( !STRNCMP("REVERSE",beg,end-beg) ) flag |= BAM_FREVERSE;
5106
0
        else if ( !STRNCMP("MREVERSE",beg,end-beg) ) flag |= BAM_FMREVERSE;
5107
0
        else if ( !STRNCMP("READ1",beg,end-beg) ) flag |= BAM_FREAD1;
5108
0
        else if ( !STRNCMP("READ2",beg,end-beg) ) flag |= BAM_FREAD2;
5109
0
        else if ( !STRNCMP("SECONDARY",beg,end-beg) ) flag |= BAM_FSECONDARY;
5110
0
        else if ( !STRNCMP("QCFAIL",beg,end-beg) ) flag |= BAM_FQCFAIL;
5111
0
        else if ( !STRNCMP("DUP",beg,end-beg) ) flag |= BAM_FDUP;
5112
0
        else if ( !STRNCMP("SUPPLEMENTARY",beg,end-beg) ) flag |= BAM_FSUPPLEMENTARY;
5113
0
        else return -1;
5114
0
        if ( !*end ) break;
5115
0
        beg = end + 1;
5116
0
    }
5117
0
    return flag;
5118
0
}
5119
5120
char *bam_flag2str(int flag)
5121
0
{
5122
0
    kstring_t str = {0,0,0};
5123
0
    if ( flag&BAM_FPAIRED ) ksprintf(&str,"%s%s", str.l?",":"","PAIRED");
5124
0
    if ( flag&BAM_FPROPER_PAIR ) ksprintf(&str,"%s%s", str.l?",":"","PROPER_PAIR");
5125
0
    if ( flag&BAM_FUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","UNMAP");
5126
0
    if ( flag&BAM_FMUNMAP ) ksprintf(&str,"%s%s", str.l?",":"","MUNMAP");
5127
0
    if ( flag&BAM_FREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","REVERSE");
5128
0
    if ( flag&BAM_FMREVERSE ) ksprintf(&str,"%s%s", str.l?",":"","MREVERSE");
5129
0
    if ( flag&BAM_FREAD1 ) ksprintf(&str,"%s%s", str.l?",":"","READ1");
5130
0
    if ( flag&BAM_FREAD2 ) ksprintf(&str,"%s%s", str.l?",":"","READ2");
5131
0
    if ( flag&BAM_FSECONDARY ) ksprintf(&str,"%s%s", str.l?",":"","SECONDARY");
5132
0
    if ( flag&BAM_FQCFAIL ) ksprintf(&str,"%s%s", str.l?",":"","QCFAIL");
5133
0
    if ( flag&BAM_FDUP ) ksprintf(&str,"%s%s", str.l?",":"","DUP");
5134
0
    if ( flag&BAM_FSUPPLEMENTARY ) ksprintf(&str,"%s%s", str.l?",":"","SUPPLEMENTARY");
5135
0
    if ( str.l == 0 ) kputsn("", 0, &str);
5136
0
    return str.s;
5137
0
}
5138
5139
5140
/**************************
5141
 *** Pileup and Mpileup ***
5142
 **************************/
5143
5144
#if !defined(BAM_NO_PILEUP)
5145
5146
#include <assert.h>
5147
5148
/*******************
5149
 *** Memory pool ***
5150
 *******************/
5151
5152
typedef struct {
5153
    int k, y;
5154
    hts_pos_t x, end;
5155
} cstate_t;
5156
5157
static cstate_t g_cstate_null = { -1, 0, 0, 0 };
5158
5159
typedef struct __linkbuf_t {
5160
    bam1_t b;
5161
    hts_pos_t beg, end;
5162
    cstate_t s;
5163
    struct __linkbuf_t *next;
5164
    bam_pileup_cd cd;
5165
} lbnode_t;
5166
5167
typedef struct {
5168
    int cnt, n, max;
5169
    lbnode_t **buf;
5170
} mempool_t;
5171
5172
static mempool_t *mp_init(void)
5173
0
{
5174
0
    mempool_t *mp;
5175
0
    mp = (mempool_t*)calloc(1, sizeof(mempool_t));
5176
0
    return mp;
5177
0
}
5178
// Free every node held in the pool (with its record data), then the pool.
static void mp_destroy(mempool_t *mp)
{
    int i = 0;
    while (i < mp->n) {
        lbnode_t *node = mp->buf[i++];
        free(node->b.data);
        free(node);
    }
    free(mp->buf);
    free(mp);
}
5188
// Hand out a node: the most recently recycled one if the pool holds any,
// otherwise a freshly zeroed allocation (NULL if calloc fails).
static inline lbnode_t *mp_alloc(mempool_t *mp)
{
    mp->cnt++;
    if (mp->n != 0) {
        mp->n--;
        return mp->buf[mp->n];
    }
    return (lbnode_t*)calloc(1, sizeof(lbnode_t));
}
5194
// Return a node to the pool for later reuse.
//
// The node's 'next' link is cleared.  The pool's buffer doubles (starting
// at 256 entries) when full.  If it cannot be grown the node is released
// outright, in the same way mp_destroy releases pooled nodes, instead of
// writing through a NULL buffer and losing the old one.
static inline void mp_free(mempool_t *mp, lbnode_t *p)
{
    --mp->cnt; p->next = 0; // clear lbnode_t::next here
    if (mp->n == mp->max) {
        int new_max = mp->max? mp->max<<1 : 256;
        lbnode_t **new_buf = (lbnode_t**)realloc(mp->buf, sizeof(lbnode_t*) * new_max);
        if (new_buf == NULL) {
            // Out of memory: mp->buf and mp->max are still valid, so just
            // drop this node rather than pooling it.
            free(p->b.data);
            free(p);
            return;
        }
        mp->buf = new_buf;
        mp->max = new_max;
    }
    mp->buf[mp->n++] = p;
}
5203
5204
/**********************
5205
 *** CIGAR resolver ***
5206
 **********************/
5207
5208
/* s->k: the index of the CIGAR operator that has just been processed.
5209
   s->x: the reference coordinate of the start of s->k
5210
   s->y: the query coordinate of the start of s->k
5211
 */
5212
static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s)
5213
0
{
5214
0
#define _cop(c) ((c)&BAM_CIGAR_MASK)
5215
0
#define _cln(c) ((c)>>BAM_CIGAR_SHIFT)
5216
5217
0
    bam1_t *b = p->b;
5218
0
    bam1_core_t *c = &b->core;
5219
0
    uint32_t *cigar = bam_get_cigar(b);
5220
0
    int k;
5221
    // determine the current CIGAR operation
5222
    //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y);
5223
0
    if (s->k == -1) { // never processed
5224
0
        p->qpos = 0;
5225
0
        if (c->n_cigar == 1) { // just one operation, save a loop
5226
0
          if (_cop(cigar[0]) == BAM_CMATCH || _cop(cigar[0]) == BAM_CEQUAL || _cop(cigar[0]) == BAM_CDIFF) s->k = 0, s->x = c->pos, s->y = 0;
5227
0
        } else { // find the first match or deletion
5228
0
            for (k = 0, s->x = c->pos, s->y = 0; k < c->n_cigar; ++k) {
5229
0
                int op = _cop(cigar[k]);
5230
0
                int l = _cln(cigar[k]);
5231
0
                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP ||
5232
0
                    op == BAM_CEQUAL || op == BAM_CDIFF) break;
5233
0
                else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5234
0
            }
5235
0
            assert(k < c->n_cigar);
5236
0
            s->k = k;
5237
0
        }
5238
0
    } else { // the read has been processed before
5239
0
        int op, l = _cln(cigar[s->k]);
5240
0
        if (pos - s->x >= l) { // jump to the next operation
5241
0
            assert(s->k < c->n_cigar); // otherwise a bug: this function should not be called in this case
5242
0
            op = _cop(cigar[s->k+1]);
5243
0
            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { // jump to the next without a loop
5244
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5245
0
                s->x += l;
5246
0
                ++s->k;
5247
0
            } else { // find the next M/D/N/=/X
5248
0
              if (_cop(cigar[s->k]) == BAM_CMATCH|| _cop(cigar[s->k]) == BAM_CEQUAL || _cop(cigar[s->k]) == BAM_CDIFF) s->y += l;
5249
0
                s->x += l;
5250
0
                for (k = s->k + 1; k < c->n_cigar; ++k) {
5251
0
                    op = _cop(cigar[k]), l = _cln(cigar[k]);
5252
0
                    if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) break;
5253
0
                    else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) s->y += l;
5254
0
                }
5255
0
                s->k = k;
5256
0
            }
5257
0
            assert(s->k < c->n_cigar); // otherwise a bug
5258
0
        } // else, do nothing
5259
0
    }
5260
0
    { // collect pileup information
5261
0
        int op, l;
5262
0
        op = _cop(cigar[s->k]); l = _cln(cigar[s->k]);
5263
0
        p->is_del = p->indel = p->is_refskip = 0;
5264
0
        if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation
5265
0
            int op2 = _cop(cigar[s->k+1]);
5266
0
            int l2 = _cln(cigar[s->k+1]);
5267
0
            if (op2 == BAM_CDEL) p->indel = -(int)l2;
5268
0
            else if (op2 == BAM_CINS) p->indel = l2;
5269
0
            else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding
5270
0
                int l3 = 0;
5271
0
                for (k = s->k + 2; k < c->n_cigar; ++k) {
5272
0
                    op2 = _cop(cigar[k]); l2 = _cln(cigar[k]);
5273
0
                    if (op2 == BAM_CINS) l3 += l2;
5274
0
                    else if (op2 == BAM_CDEL || op2 == BAM_CMATCH || op2 == BAM_CREF_SKIP || op2 == BAM_CEQUAL || op2 == BAM_CDIFF) break;
5275
0
                }
5276
0
                if (l3 > 0) p->indel = l3;
5277
0
            }
5278
0
        }
5279
0
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
5280
0
            p->qpos = s->y + (pos - s->x);
5281
0
        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
5282
0
            p->is_del = 1; p->qpos = s->y; // FIXME: distinguish D and N!!!!!
5283
0
            p->is_refskip = (op == BAM_CREF_SKIP);
5284
0
        } // cannot be other operations; otherwise a bug
5285
0
        p->is_head = (pos == c->pos); p->is_tail = (pos == s->end);
5286
0
    }
5287
0
    p->cigar_ind = s->k;
5288
0
    return 1;
5289
0
}
5290
5291
/*******************************
5292
 *** Expansion of insertions ***
5293
 *******************************/
5294
5295
/*
5296
 * Fills out the kstring with the padded insertion sequence for the current
5297
 * location in 'p'.  If this is not an insertion site, the string is blank.
5298
 *
5299
 * This variant handles base modifications, but only when "m" is non-NULL.
5300
 *
5301
 * Returns the number of inserted base on success, with string length being
5302
 *        accessable via ins->l;
5303
 *        -1 on failure.
5304
 */
5305
int bam_plp_insertion_mod(const bam_pileup1_t *p,
5306
                          hts_base_mod_state *m,
5307
0
                          kstring_t *ins, int *del_len) {
5308
0
    int j, k, indel, nb = 0;
5309
0
    uint32_t *cigar;
5310
5311
0
    if (p->indel <= 0) {
5312
0
        if (ks_resize(ins, 1) < 0)
5313
0
            return -1;
5314
0
        ins->l = 0;
5315
0
        ins->s[0] = '\0';
5316
0
        return 0;
5317
0
    }
5318
5319
0
    if (del_len)
5320
0
        *del_len = 0;
5321
5322
    // Measure indel length including pads
5323
0
    indel = 0;
5324
0
    k = p->cigar_ind+1;
5325
0
    cigar = bam_get_cigar(p->b);
5326
0
    while (k < p->b->core.n_cigar) {
5327
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5328
0
        case BAM_CPAD:
5329
0
        case BAM_CINS:
5330
0
            indel += (cigar[k] >> BAM_CIGAR_SHIFT);
5331
0
            break;
5332
0
        default:
5333
0
            k = p->b->core.n_cigar;
5334
0
            break;
5335
0
        }
5336
0
        k++;
5337
0
    }
5338
0
    nb = ins->l = indel;
5339
5340
    // Produce sequence
5341
0
    if (ks_resize(ins, indel+1) < 0)
5342
0
        return -1;
5343
0
    indel = 0;
5344
0
    k = p->cigar_ind+1;
5345
0
    j = 1;
5346
0
    while (k < p->b->core.n_cigar) {
5347
0
        int l, c;
5348
0
        switch (cigar[k] & BAM_CIGAR_MASK) {
5349
0
        case BAM_CPAD:
5350
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++)
5351
0
                ins->s[indel++] = '*';
5352
0
            break;
5353
0
        case BAM_CINS:
5354
0
            for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) {
5355
0
                c = p->qpos + j - p->is_del < p->b->core.l_qseq
5356
0
                    ? seq_nt16_str[bam_seqi(bam_get_seq(p->b),
5357
0
                                            p->qpos + j - p->is_del)]
5358
0
                    : 'N';
5359
0
                ins->s[indel++] = c;
5360
0
                int nm;
5361
0
                hts_base_mod mod[256];
5362
0
                if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del,
5363
0
                                                m, mod, 256)) > 0) {
5364
0
                    int o_indel = indel;
5365
0
                    if (ks_resize(ins, ins->l + nm*16+3) < 0)
5366
0
                        return -1;
5367
0
                    ins->s[indel++] = '[';
5368
0
                    int j;
5369
0
                    for (j = 0; j < nm; j++) {
5370
0
                        char qual[20];
5371
0
                        if (mod[j].qual >= 0)
5372
0
                            sprintf(qual, "%d", mod[j].qual);
5373
0
                        else
5374
0
                            *qual=0;
5375
0
                        if (mod[j].modified_base < 0)
5376
                            // ChEBI
5377
0
                            indel += sprintf(&ins->s[indel], "%c(%d)%s",
5378
0
                                             "+-"[mod[j].strand],
5379
0
                                             -mod[j].modified_base,
5380
0
                                             qual);
5381
0
                        else
5382
0
                            indel += sprintf(&ins->s[indel], "%c%c%s",
5383
0
                                             "+-"[mod[j].strand],
5384
0
                                             mod[j].modified_base,
5385
0
                                             qual);
5386
0
                    }
5387
0
                    ins->s[indel++] = ']';
5388
0
                    ins->l += indel - o_indel; // grow by amount we used
5389
0
                }
5390
0
            }
5391
0
            break;
5392
0
        case BAM_CDEL:
5393
            // eg cigar 1M2I1D gives mpileup output in T+2AA-1C style
5394
0
            if (del_len)
5395
0
                *del_len = cigar[k]>>BAM_CIGAR_SHIFT;
5396
            // fall through
5397
0
        default:
5398
0
            k = p->b->core.n_cigar;
5399
0
            break;
5400
0
        }
5401
0
        k++;
5402
0
    }
5403
0
    ins->s[indel] = '\0';
5404
0
    ins->l = indel; // string length
5405
5406
0
    return nb;      // base length
5407
0
}
5408
5409
/*
5410
 * Fills out the kstring with the padded insertion sequence for the current
5411
 * location in 'p'.  If this is not an insertion site, the string is blank.
5412
 *
5413
 * This is the original interface with no capability for reporting base
5414
 * modifications.
5415
 *
5416
 * Returns the length of insertion string on success;
5417
 *        -1 on failure.
5418
 */
5419
0
int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) {
5420
0
    return bam_plp_insertion_mod(p, NULL, ins, del_len);
5421
0
}
5422
5423
/***********************
5424
 *** Pileup iterator ***
5425
 ***********************/
5426
5427
// Dictionary of overlapping reads
5428
KHASH_MAP_INIT_STR(olap_hash, lbnode_t *)
5429
typedef khash_t(olap_hash) olap_hash_t;
5430
5431
struct bam_plp_s {
5432
    mempool_t *mp;
5433
    lbnode_t *head, *tail;
5434
    int32_t tid, max_tid;
5435
    hts_pos_t pos, max_pos;
5436
    int is_eof, max_plp, error, maxcnt;
5437
    uint64_t id;
5438
    bam_pileup1_t *plp;
5439
    // for the "auto" interface only
5440
    bam1_t *b;
5441
    bam_plp_auto_f func;
5442
    void *data;
5443
    olap_hash_t *overlaps;
5444
5445
    // For notification of creation and destruction events
5446
    // and associated client-owned pointer.
5447
    int (*plp_construct)(void *data, const bam1_t *b, bam_pileup_cd *cd);
5448
    int (*plp_destruct )(void *data, const bam1_t *b, bam_pileup_cd *cd);
5449
};
5450
5451
/* Create a pileup iterator.
 *
 * func: optional callback used by the "auto" interface; when non-NULL it
 *       is stored with 'data' and a scratch record is allocated.
 * data: opaque pointer stored alongside 'func'.
 *
 * Returns the new iterator, or NULL if any allocation fails (nothing is
 * leaked in that case).
 */
bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data)
{
    bam_plp_t iter;

    iter = (bam_plp_t)calloc(1, sizeof(struct bam_plp_s));
    if (iter == NULL)
        return NULL;

    iter->mp = mp_init();
    if (iter->mp == NULL)
        goto fail;

    // The list starts out as a single empty node that is both head and tail.
    iter->head = iter->tail = mp_alloc(iter->mp);
    if (iter->head == NULL)
        goto fail;

    iter->max_tid = iter->max_pos = -1;
    iter->maxcnt = 8000;
    if (func) {
        iter->func = func;
        iter->data = data;
        iter->b = bam_init1();
        if (iter->b == NULL)
            goto fail;
    }
    return iter;

 fail:
    // The head node, if present, is freshly zeroed and owns no record
    // data, so a plain free() is enough.
    free(iter->head);
    if (iter->mp)
        mp_destroy(iter->mp);
    free(iter);
    return NULL;
}
5466
5467
/* Enable overlap handling by creating the hash used for tweaking the
 * quality of bases in overlapping reads.
 * Returns 0 on success, -1 if the hash could not be created.
 */
int bam_plp_init_overlaps(bam_plp_t iter)
{
    iter->overlaps = kh_init(olap_hash);
    if (iter->overlaps == NULL)
        return -1;
    return 0;
}
5472
5473
/* Release a pileup iterator and everything it owns.
 *
 * Passing NULL is a no-op.  If a destructor callback has been registered
 * it is invoked for each read still buffered (every node ahead of the
 * tail marker), matching the call made when a read leaves the buffer
 * during iteration, so client data attached to those reads is not leaked.
 * NOTE(review): this assumes every node ahead of the tail had its client
 * data set up when it was pushed - confirm against bam_plp_push.
 */
void bam_plp_destroy(bam_plp_t iter)
{
    lbnode_t *p, *pnext;
    if ( !iter ) return;
    if ( iter->overlaps ) kh_destroy(olap_hash, iter->overlaps);
    for (p = iter->head; p != NULL; p = pnext) {
        if (iter->plp_destruct && p != iter->tail)
            iter->plp_destruct(iter->data, &p->b, &p->cd);
        pnext = p->next;    // mp_free clears p->next, so fetch it first
        mp_free(iter->mp, p);
    }
    mp_destroy(iter->mp);
    if (iter->b) bam_destroy1(iter->b);
    free(iter->plp);
    free(iter);
}
5486
5487
void bam_plp_constructor(bam_plp_t plp,
5488
0
                         int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5489
0
    plp->plp_construct = func;
5490
0
}
5491
5492
void bam_plp_destructor(bam_plp_t plp,
5493
0
                        int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)) {
5494
0
    plp->plp_destruct = func;
5495
0
}
5496
5497
//---------------------------------
5498
//---  Tweak overlapping reads
5499
//---------------------------------
5500
5501
/**
5502
 *  cigar_iref2iseq_set()  - find the first CMATCH setting the ref and the read index
5503
 *  cigar_iref2iseq_next() - get the next CMATCH base
5504
 *  @cigar:       pointer to current cigar block (rw)
5505
 *  @cigar_max:   pointer just beyond the last cigar block
5506
 *  @icig:        position within the current cigar block (rw)
5507
 *  @iseq:        position in the sequence (rw)
5508
 *  @iref:        position with respect to the beginning of the read (iref_pos - b->core.pos) (rw)
5509
 *
5510
 *  Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered,
5511
 *  or -2 on error.
5512
 */
5513
static inline int cigar_iref2iseq_set(const uint32_t **cigar,
5514
                                      const uint32_t *cigar_max,
5515
                                      hts_pos_t *icig,
5516
                                      hts_pos_t *iseq,
5517
                                      hts_pos_t *iref)
5518
0
{
5519
0
    hts_pos_t pos = *iref;
5520
0
    if ( pos < 0 ) return -1;
5521
0
    *icig = 0;
5522
0
    *iseq = 0;
5523
0
    *iref = 0;
5524
0
    while ( *cigar<cigar_max )
5525
0
    {
5526
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5527
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5528
5529
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5530
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; }
5531
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5532
0
        {
5533
0
            pos -= ncig;
5534
0
            if ( pos < 0 ) { *icig = ncig + pos; *iseq += *icig; *iref += *icig; return BAM_CMATCH; }
5535
0
            (*cigar)++; *iseq += ncig; *icig = 0; *iref += ncig;
5536
0
            continue;
5537
0
        }
5538
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; }
5539
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP )
5540
0
        {
5541
0
            pos -= ncig;
5542
0
            if ( pos<0 ) pos = 0;
5543
0
            (*cigar)++; *icig = 0; *iref += ncig;
5544
0
            continue;
5545
0
        }
5546
0
        hts_log_error("Unexpected cigar %d", cig);
5547
0
        return -2;
5548
0
    }
5549
0
    *iseq = -1;
5550
0
    return -1;
5551
0
}
5552
static inline int cigar_iref2iseq_next(const uint32_t **cigar,
5553
                                       const uint32_t *cigar_max,
5554
                                       hts_pos_t *icig,
5555
                                       hts_pos_t *iseq,
5556
                                       hts_pos_t *iref)
5557
0
{
5558
0
    while ( *cigar < cigar_max )
5559
0
    {
5560
0
        int cig  = (**cigar) & BAM_CIGAR_MASK;
5561
0
        int ncig = (**cigar) >> BAM_CIGAR_SHIFT;
5562
5563
0
        if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF )
5564
0
        {
5565
0
            if ( *icig >= ncig - 1 ) { *icig = -1;  (*cigar)++; continue; }
5566
0
            (*iseq)++; (*icig)++; (*iref)++;
5567
0
            return BAM_CMATCH;
5568
0
        }
5569
0
        if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; }
5570
0
        if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5571
0
        if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; }
5572
0
        if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; }
5573
0
        hts_log_error("Unexpected cigar %d", cig);
5574
0
        return -2;
5575
0
    }
5576
0
    *iseq = -1;
5577
0
    *iref = -1;
5578
0
    return -1;
5579
0
}
5580
5581
// Given overlapping read 'a' (left) and 'b' (right) on the same
5582
// template, adjust quality values to zero for either a or b.
5583
// Note versions 1.12 and earlier always removed quality from 'b' for
5584
// matching bases.  Now we select a or b semi-randomly based on name hash.
5585
// Returns 0 on success,
5586
//        -1 on failure
5587
static int tweak_overlap_quality(bam1_t *a, bam1_t *b)
5588
0
{
5589
0
    const uint32_t *a_cigar = bam_get_cigar(a),
5590
0
        *a_cigar_max = a_cigar + a->core.n_cigar;
5591
0
    const uint32_t *b_cigar = bam_get_cigar(b),
5592
0
        *b_cigar_max = b_cigar + b->core.n_cigar;
5593
0
    hts_pos_t a_icig = 0, a_iseq = 0;
5594
0
    hts_pos_t b_icig = 0, b_iseq = 0;
5595
0
    uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b);
5596
0
    uint8_t *a_seq  = bam_get_seq(a), *b_seq = bam_get_seq(b);
5597
5598
0
    hts_pos_t iref   = b->core.pos;
5599
0
    hts_pos_t a_iref = iref - a->core.pos;
5600
0
    hts_pos_t b_iref = iref - b->core.pos;
5601
5602
0
    int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max,
5603
0
                                    &a_icig, &a_iseq, &a_iref);
5604
0
    if ( a_ret<0 )
5605
        // no overlap or error
5606
0
        return a_ret<-1 ? -1:0;
5607
5608
0
    int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max,
5609
0
                                    &b_icig, &b_iseq, &b_iref);
5610
0
    if ( b_ret<0 )
5611
        // no overlap or error
5612
0
        return b_ret<-1 ? -1:0;
5613
5614
    // Determine which seq is the one getting modified qualities.
5615
0
    uint8_t amul, bmul;
5616
0
    if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) {
5617
0
        amul = 1;
5618
0
        bmul = 0;
5619
0
    } else {
5620
0
        amul = 0;
5621
0
        bmul = 1;
5622
0
    }
5623
5624
    // Loop over the overlapping region nulling qualities in either
5625
    // seq a or b.
5626
0
    int err = 0;
5627
0
    while ( 1 )
5628
0
    {
5629
        // Step to next matching reference position in a and b
5630
0
        while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos )
5631
0
            a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max,
5632
0
                                         &a_icig, &a_iseq, &a_iref);
5633
0
        if ( a_ret<0 ) { // done
5634
0
            err = a_ret<-1?-1:0;
5635
0
            break;
5636
0
        }
5637
0
        if ( iref < a_iref + a->core.pos )
5638
0
            iref = a_iref + a->core.pos;
5639
5640
0
        while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos )
5641
0
            b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig,
5642
0
                                         &b_iseq, &b_iref);
5643
0
        if ( b_ret<0 ) { // done
5644
0
            err = b_ret<-1?-1:0;
5645
0
            break;
5646
0
        }
5647
0
        if ( iref < b_iref + b->core.pos )
5648
0
            iref = b_iref + b->core.pos;
5649
5650
0
        iref++;
5651
5652
0
        if ( a_iref+a->core.pos != b_iref+b->core.pos )
5653
            // only CMATCH positions, don't know what to do with indels
5654
0
            continue;
5655
5656
0
        if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq)
5657
            // Fell off end of sequence, bad CIGAR?
5658
0
            return -1;
5659
5660
        // We're finally at the same ref base in both a and b.
5661
        // Check if the bases match (confident) or mismatch
5662
        // (not so confident).
5663
0
        if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) {
5664
            // We are very confident about this base.  Use sum of quals
5665
0
            int qual = a_qual[a_iseq] + b_qual[b_iseq];
5666
0
            a_qual[a_iseq] = amul * (qual>200 ? 200 : qual);
5667
0
            b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);;
5668
0
        } else {
5669
            // Not so confident about anymore given the mismatch.
5670
            // Reduce qual for lowest quality base.
5671
0
            if ( a_qual[a_iseq] > b_qual[b_iseq] ) {
5672
                // A highest qual base; keep
5673
0
                a_qual[a_iseq] = 0.8 * a_qual[a_iseq];
5674
0
                b_qual[b_iseq] = 0;
5675
0
            } else if (a_qual[a_iseq] < b_qual[b_iseq] ) {
5676
                // B highest qual base; keep
5677
0
                b_qual[b_iseq] = 0.8 * b_qual[b_iseq];
5678
0
                a_qual[a_iseq] = 0;
5679
0
            } else {
5680
                // Both equal, so pick randomly
5681
0
                a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq];
5682
0
                b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq];
5683
0
            }
5684
0
        }
5685
0
    }
5686
5687
0
    return err;
5688
0
}
5689
5690
// Fix overlapping reads. Simple soft-clipping did not give good results.
5691
// Lowering qualities of unwanted bases is more selective and works better.
5692
//
5693
// Returns 0 on success, -1 on failure
5694
static int overlap_push(bam_plp_t iter, lbnode_t *node)
5695
0
{
5696
0
    if ( !iter->overlaps ) return 0;
5697
5698
    // mapped mates and paired reads only
5699
0
    if ( node->b.core.flag&BAM_FMUNMAP || !(node->b.core.flag&BAM_FPROPER_PAIR) ) return 0;
5700
5701
    // no overlap possible, unless some wild cigar
5702
0
    if ( (node->b.core.mtid >= 0 && node->b.core.tid != node->b.core.mtid)
5703
0
         || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq
5704
0
         && node->b.core.mpos >= node->end) // for those wild cigars
5705
0
       ) return 0;
5706
5707
0
    khiter_t kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(&node->b));
5708
0
    if ( kitr==kh_end(iter->overlaps) )
5709
0
    {
5710
        // Only add reads where the mate is still to arrive
5711
0
        if (node->b.core.mpos >= node->b.core.pos ||
5712
0
            ((node->b.core.flag & BAM_FPAIRED) && node->b.core.mpos == -1)) {
5713
0
            int ret;
5714
0
            kitr = kh_put(olap_hash, iter->overlaps, bam_get_qname(&node->b), &ret);
5715
0
            if (ret < 0) return -1;
5716
0
            kh_value(iter->overlaps, kitr) = node;
5717
0
        }
5718
0
    }
5719
0
    else
5720
0
    {
5721
0
        lbnode_t *a = kh_value(iter->overlaps, kitr);
5722
0
        int err = tweak_overlap_quality(&a->b, &node->b);
5723
0
        kh_del(olap_hash, iter->overlaps, kitr);
5724
0
        assert(a->end-1 == a->s.end);
5725
0
        return err;
5726
0
    }
5727
0
    return 0;
5728
0
}
5729
5730
// Drop entries from the overlap hash: the one keyed by the query name of
// 'b', or every entry when 'b' is NULL.  Does nothing when overlap
// handling is not enabled.
static void overlap_remove(bam_plp_t iter, const bam1_t *b)
{
    khiter_t kitr;

    if ( !iter->overlaps ) return;

    if ( !b )
    {
        // remove all
        kitr = kh_begin(iter->overlaps);
        while ( kitr < kh_end(iter->overlaps) )
        {
            if ( kh_exist(iter->overlaps, kitr) ) kh_del(olap_hash, iter->overlaps, kitr);
            kitr++;
        }
        return;
    }

    kitr = kh_get(olap_hash, iter->overlaps, bam_get_qname(b));
    if ( kitr != kh_end(iter->overlaps) )
        kh_del(olap_hash, iter->overlaps, kitr);
}
5748
5749
5750
5751
// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns
5752
// pointer to the piled records if next position is ready or NULL if there are not enough records in the
5753
// buffer yet (the current position is still the maximum position across all buffered reads).
5754
const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
5755
0
{
5756
0
    if (iter->error) { *_n_plp = -1; return NULL; }
5757
0
    *_n_plp = 0;
5758
0
    if (iter->is_eof && iter->head == iter->tail) return NULL;
5759
0
    while (iter->is_eof || iter->max_tid > iter->tid || (iter->max_tid == iter->tid && iter->max_pos > iter->pos)) {
5760
0
        int n_plp = 0;
5761
        // write iter->plp at iter->pos
5762
0
        lbnode_t **pptr = &iter->head;
5763
0
        while (*pptr != iter->tail) {
5764
0
            lbnode_t *p = *pptr;
5765
0
            if (p->b.core.tid < iter->tid || (p->b.core.tid == iter->tid && p->end <= iter->pos)) { // then remove
5766
0
                overlap_remove(iter, &p->b);
5767
0
                if (iter->plp_destruct)
5768
0
                    iter->plp_destruct(iter->data, &p->b, &p->cd);
5769
0
                *pptr = p->next; mp_free(iter->mp, p);
5770
0
            }
5771
0
            else {
5772
0
                if (p->b.core.tid == iter->tid && p->beg <= iter->pos) { // here: p->end > pos; then add to pileup
5773
0
                    if (n_plp == iter->max_plp) { // then double the capacity
5774
0
                        iter->max_plp = iter->max_plp? iter->max_plp<<1 : 256;
5775
0
                        iter->plp = (bam_pileup1_t*)realloc(iter->plp, sizeof(bam_pileup1_t) * iter->max_plp);
5776
0
                    }
5777
0
                    iter->plp[n_plp].b = &p->b;
5778
0
                    iter->plp[n_plp].cd = p->cd;
5779
0
                    if (resolve_cigar2(iter->plp + n_plp, iter->pos, &p->s)) ++n_plp; // actually always true...
5780
0
                }
5781
0
                pptr = &(*pptr)->next;
5782
0
            }
5783
0
        }
5784
0
        *_n_plp = n_plp; *_tid = iter->tid; *_pos = iter->pos;
5785
        // update iter->tid and iter->pos
5786
0
        if (iter->head != iter->tail) {
5787
0
            if (iter->tid > iter->head->b.core.tid) {
5788
0
                hts_log_error("Unsorted input. Pileup aborts");
5789
0
                iter->error = 1;
5790
0
                *_n_plp = -1;
5791
0
                return NULL;
5792
0
            }
5793
0
        }
5794
0
        if (iter->tid < iter->head->b.core.tid) { // come to a new reference sequence
5795
0
            iter->tid = iter->head->b.core.tid; iter->pos = iter->head->beg; // jump to the next reference
5796
0
        } else if (iter->pos < iter->head->beg) { // here: tid == head->b.core.tid
5797
0
            iter->pos = iter->head->beg; // jump to the next position
5798
0
        } else ++iter->pos; // scan contiguously
5799
        // return
5800
0
        if (n_plp) return iter->plp;
5801
0
        if (iter->is_eof && iter->head == iter->tail) break;
5802
0
    }
5803
0
    return NULL;
5804
0
}
5805
5806
/*
 * 32-bit position wrapper around bam_plp64_next().
 *
 * Forwards to the 64-bit call and narrows the position into *_pos.  If the
 * position is INT_MAX or larger an error is logged, *_pos is clamped to
 * INT_MAX, the iterator is put into the error state, *_n_plp is set to -1
 * and NULL is returned.
 */
const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
    hts_pos_t wide_pos = 0;
    const bam_pileup1_t *column = bam_plp64_next(iter, _tid, &wide_pos, _n_plp);

    if (wide_pos >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", wide_pos);
        *_pos = INT_MAX;
        iter->error = 1;
        *_n_plp = -1;
        return NULL;
    }

    *_pos = wide_pos;
    return column;
}
5821
5822
/*
 * Feeds one alignment record into the pileup buffer.
 *
 * b == NULL marks the end of the input.  Records with a negative tid, the
 * BAM_FUNMAP flag, or that start at the current pileup position while the
 * pool count exceeds maxcnt are dropped (and removed from the overlap hash).
 * Otherwise the record is copied into the tail node; if it can still
 * contribute to an upcoming column the node is linked in and a fresh tail
 * node is allocated.
 *
 * Returns 0 on success, -1 on error (copy/allocation failure, constructor
 * or overlap failure, or unsorted input).
 */
int bam_plp_push(bam_plp_t iter, const bam1_t *b)
{
    lbnode_t *cur, *spare;

    if (iter->error)
        return -1;

    if (!b) {
        iter->is_eof = 1;
        return 0;
    }

    // Skip only unmapped reads here, any additional filtering must be done in iter->func
    if (b->core.tid < 0 || (b->core.flag & BAM_FUNMAP)) {
        overlap_remove(iter, b);
        return 0;
    }

    if (iter->tid == b->core.tid && iter->pos == b->core.pos
        && iter->mp->cnt > iter->maxcnt) {
        overlap_remove(iter, b);
        return 0;
    }

    cur = iter->tail;
    if (bam_copy1(&cur->b, b) == NULL)
        return -1;
    cur->b.id = iter->id++;
    cur->beg = b->core.pos;
    // Use raw rlen rather than bam_endpos() which adjusts rlen=0 to rlen=1
    cur->end = b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b));
    // initialize cstate_t
    cur->s = g_cstate_null;
    cur->s.end = cur->end - 1;

    if (b->core.tid < iter->max_tid) {
        hts_log_error("The input is not sorted (chromosomes out of order)");
        iter->error = 1;
        return -1;
    }
    if ((b->core.tid == iter->max_tid) && (cur->beg < iter->max_pos)) {
        hts_log_error("The input is not sorted (reads out of order)");
        iter->error = 1;
        return -1;
    }
    iter->max_tid = b->core.tid;
    iter->max_pos = cur->beg;

    // Record ends at or before the current position on a reference that is
    // not ahead of it: leave it in the tail node to be overwritten.
    if (cur->end <= iter->pos && cur->b.core.tid <= iter->tid)
        return 0;

    spare = mp_alloc(iter->mp);
    if (!spare) {
        iter->error = 1;
        return -1;
    }

    if (iter->plp_construct
        && iter->plp_construct(iter->data, &cur->b, &cur->cd) < 0)
        goto fail;

    if (overlap_push(iter, cur) < 0)
        goto fail;

    cur->next = spare;
    iter->tail = spare;
    return 0;

 fail:
    mp_free(iter->mp, spare);
    iter->error = 1;
    return -1;
}
5877
5878
/*
 * Returns the next pileup column, pulling records through iter->func as
 * needed.
 *
 * Records are read with iter->func and pushed until bam_plp64_next() can
 * produce a column.  A return of exactly -1 from iter->func is treated as
 * end of input; anything below -1 is stored in iter->error.
 *
 * Returns the column, or 0 with *_n_plp == 0 when the input is finished,
 * or 0 with *_n_plp == -1 on error.
 */
const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp)
{
    const bam_pileup1_t *column;
    int rc;

    if (iter->func == 0 || iter->error) {
        *_n_plp = -1;
        return 0;
    }

    column = bam_plp64_next(iter, _tid, _pos, _n_plp);
    if (column != 0)
        return column;

    // no pileup line can be obtained; read alignments
    *_n_plp = 0;
    if (iter->is_eof)
        return 0;

    for (;;) {
        rc = iter->func(iter->data, iter->b);
        if (rc < 0)
            break;

        if (bam_plp_push(iter, iter->b) < 0) {
            *_n_plp = -1;
            return 0;
        }

        column = bam_plp64_next(iter, _tid, _pos, _n_plp);
        if (column != 0)
            return column;
        // otherwise no pileup line can be returned; read the next alignment.
    }

    if (rc < -1) {
        iter->error = rc;
        *_n_plp = -1;
        return 0;
    }

    if (bam_plp_push(iter, 0) < 0) {
        *_n_plp = -1;
        return 0;
    }

    return bam_plp64_next(iter, _tid, _pos, _n_plp);
}
5904
5905
/*
 * 32-bit position wrapper around bam_plp64_auto().
 *
 * Positions of INT_MAX or above cannot be reported through *_pos: an error
 * is logged, *_pos is clamped to INT_MAX, the iterator is flagged as failed,
 * *_n_plp becomes -1 and NULL is returned.
 */
const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp)
{
    hts_pos_t wide_pos = 0;
    const bam_pileup1_t *column = bam_plp64_auto(iter, _tid, &wide_pos, _n_plp);

    if (wide_pos >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", wide_pos);
        *_pos = INT_MAX;
        iter->error = 1;
        *_n_plp = -1;
        return NULL;
    }

    *_pos = wide_pos;
    return column;
}
5920
5921
/*
 * Returns the pileup iterator to its initial scanning state.
 *
 * Empties the overlap hash, rewinds the tid/pos trackers, clears the EOF
 * flag and releases every buffered node back to the memory pool.  The
 * client destructor (if set) is invoked on each node before it is freed,
 * matching what bam_plp64_next() does when it retires a node, so data
 * attached by the constructor is not leaked.
 */
void bam_plp_reset(bam_plp_t iter)
{
    overlap_remove(iter, NULL);
    iter->max_tid = iter->max_pos = -1;
    iter->tid = iter->pos = 0;
    iter->is_eof = 0;
    while (iter->head != iter->tail) {
        lbnode_t *p = iter->head;
        iter->head = p->next;
        if (iter->plp_destruct)
            iter->plp_destruct(iter->data, &p->b, &p->cd);
        mp_free(iter->mp, p);
    }
}
5933
5934
/*
 * Stores the limit that bam_plp_push() compares against the memory pool
 * count when deciding whether to drop reads starting at the current
 * pileup position.
 */
void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt)
{
    iter->maxcnt = maxcnt;
}
5938
5939
/************************
5940
 *** Mpileup iterator ***
5941
 ************************/
5942
5943
// State of a multi-input pileup: one bam_plp_t per input plus the most
// recent column obtained from each, so that columns can be reported in
// increasing (tid, pos) order across all inputs.
struct bam_mplp_s {
5944
    int n;                      // number of inputs (length of every array below)
5945
    int32_t min_tid, *tid;      // smallest tid seen in the last scan; per-input tid of its pending column
5946
    hts_pos_t min_pos, *pos;    // smallest pos at min_tid in the last scan; per-input pos of its pending column
5947
    bam_plp_t *iter;            // per-input single-file pileup iterators
5948
    int *n_plp;                 // per-input depth of the pending column
5949
    const bam_pileup1_t **plp;  // per-input pending column, NULL if none
5950
};
5951
5952
/*
 * Creates a multi-pileup iterator over n inputs.
 *
 * func is the record-fetching callback shared by all inputs and data[i] is
 * the opaque argument handed to it for input i.
 *
 * Returns the new iterator, or NULL if memory could not be allocated (in
 * which case everything allocated so far is released).
 */
bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data)
{
    int i, n_made = 0;
    bam_mplp_t iter;

    iter = (bam_mplp_t)calloc(1, sizeof(struct bam_mplp_s));
    if (!iter)
        return NULL;

    iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t));
    iter->tid = (int32_t*)calloc(n, sizeof(int32_t));
    iter->n_plp = (int*)calloc(n, sizeof(int));
    iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*));
    iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t));
    // calloc may legitimately return NULL for a zero-sized request, so only
    // treat NULL as failure when elements were actually asked for.
    if (n > 0 && (!iter->pos || !iter->tid || !iter->n_plp
                  || !iter->plp || !iter->iter))
        goto fail;

    iter->n = n;
    iter->min_pos = HTS_POS_MAX;
    iter->min_tid = (uint32_t)-1;
    for (i = 0; i < n; ++i) {
        iter->iter[i] = bam_plp_init(func, data[i]);
        // NOTE(review): assumes bam_plp_init reports failure by returning
        // NULL; the check is harmless if it never does.
        if (!iter->iter[i])
            goto fail;
        n_made++;
        iter->pos[i] = iter->min_pos;
        iter->tid[i] = iter->min_tid;
    }
    return iter;

 fail:
    for (i = 0; i < n_made; ++i)
        bam_plp_destroy(iter->iter[i]);
    free(iter->iter); free(iter->pos); free(iter->tid);
    free(iter->n_plp); free(iter->plp);
    free(iter);
    return NULL;
}
5972
5973
/*
 * Calls bam_plp_init_overlaps() on every per-input iterator.
 *
 * All inputs are attempted even if an earlier one fails.
 * Returns 0 if every call returned 0, otherwise -1.
 */
int bam_mplp_init_overlaps(bam_mplp_t iter)
{
    int idx = 0, any_failed = 0;

    while (idx < iter->n) {
        if (bam_plp_init_overlaps(iter->iter[idx]) != 0)
            any_failed = 1;
        ++idx;
    }

    if (any_failed)
        return -1;
    return 0;
}
5980
5981
/*
 * Applies the same maxcnt limit to every per-input pileup iterator.
 */
void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt)
{
    int idx = 0;

    while (idx < iter->n) {
        iter->iter[idx]->maxcnt = maxcnt;
        ++idx;
    }
}
5987
5988
/*
 * Destroys each per-input iterator, then frees the bookkeeping arrays and
 * the multi-pileup structure itself.
 */
void bam_mplp_destroy(bam_mplp_t iter)
{
    int idx = 0;

    while (idx < iter->n) {
        bam_plp_destroy(iter->iter[idx]);
        ++idx;
    }

    free(iter->iter);
    free(iter->pos);
    free(iter->tid);
    free(iter->n_plp);
    free(iter->plp);
    free(iter);
}
5996
5997
/*
 * Produces the next pileup column across all inputs.
 *
 * Every input whose pending column sits at the previously reported
 * (min_tid, min_pos) is advanced with bam_plp64_auto(); the smallest
 * (tid, pos) among inputs that still have a column then becomes the new
 * minimum and is reported through *_tid / *_pos.  For each input i,
 * n_plp[i] / plp[i] receive its column if it lies at that coordinate and
 * 0 / NULL otherwise.
 *
 * Returns the number of inputs with a column at the reported coordinate,
 * 0 when no input has a column left, or -1 if any input iterator is in
 * the error state.
 */
int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp)
{
    hts_pos_t best_pos = HTS_POS_MAX;
    uint32_t best_tid = (uint32_t)-1;
    int k, n_at_min = 0;

    for (k = 0; k < iter->n; ++k) {
        if (iter->pos[k] == iter->min_pos && iter->tid[k] == iter->min_tid) {
            // This input was consumed last time round: fetch its next column.
            int tid;
            hts_pos_t pos;
            iter->plp[k] = bam_plp64_auto(iter->iter[k], &tid, &pos, &iter->n_plp[k]);
            if (iter->iter[k]->error)
                return -1;
            if (iter->plp[k]) {
                iter->tid[k] = tid;
                iter->pos[k] = pos;
            } else {
                iter->tid[k] = 0;
                iter->pos[k] = 0;
            }
        }

        if (!iter->plp[k])
            continue;

        if (iter->tid[k] < best_tid) {
            best_tid = iter->tid[k];
            best_pos = iter->pos[k];
        } else if (iter->tid[k] == best_tid && iter->pos[k] < best_pos) {
            best_pos = iter->pos[k];
        }
    }

    iter->min_pos = best_pos;
    iter->min_tid = best_tid;
    if (best_pos == HTS_POS_MAX)
        return 0;

    *_tid = best_tid;
    *_pos = best_pos;

    for (k = 0; k < iter->n; ++k) {
        if (iter->pos[k] != iter->min_pos || iter->tid[k] != iter->min_tid) {
            n_plp[k] = 0;
            plp[k] = 0;
            continue;
        }
        n_plp[k] = iter->n_plp[k];
        plp[k] = iter->plp[k];
        ++n_at_min;
    }
    return n_at_min;
}
6037
6038
/*
 * 32-bit position wrapper around bam_mplp64_auto().
 *
 * A negative result from the 64-bit call is passed straight through without
 * touching *_pos.  Otherwise the position is narrowed into *_pos; if it is
 * INT_MAX or larger an error is logged, *_pos is clamped to INT_MAX and -1
 * is returned.
 */
int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp)
{
    hts_pos_t wide_pos = 0;
    int n_cols = bam_mplp64_auto(iter, _tid, &wide_pos, n_plp, plp);

    if (n_cols < 0)
        return n_cols;

    if (wide_pos >= INT_MAX) {
        hts_log_error("Position %"PRId64" too large", wide_pos);
        *_pos = INT_MAX;
        return -1;
    }

    *_pos = wide_pos;
    return n_cols;
}
6053
6054
/*
 * Resets the multi-pileup iterator: every per-input iterator is reset and
 * all cached coordinates and columns are returned to the values
 * bam_mplp_init() starts with.
 */
void bam_mplp_reset(bam_mplp_t iter)
{
    int idx = 0;

    iter->min_pos = HTS_POS_MAX;
    iter->min_tid = (uint32_t)-1;

    while (idx < iter->n) {
        bam_plp_reset(iter->iter[idx]);
        iter->pos[idx] = HTS_POS_MAX;
        iter->tid[idx] = (uint32_t)-1;
        iter->n_plp[idx] = 0;
        iter->plp[idx] = NULL;
        ++idx;
    }
}
6067
6068
void bam_mplp_constructor(bam_mplp_t iter,
6069
0
                          int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6070
0
    int i;
6071
0
    for (i = 0; i < iter->n; ++i)
6072
0
        bam_plp_constructor(iter->iter[i], func);
6073
0
}
6074
6075
void bam_mplp_destructor(bam_mplp_t iter,
6076
0
                         int (*func)(void *arg, const bam1_t *b, bam_pileup_cd *cd)) {
6077
0
    int i;
6078
0
    for (i = 0; i < iter->n; ++i)
6079
0
        bam_plp_destructor(iter->iter[i], func);
6080
0
}
6081
6082
#endif // ~!defined(BAM_NO_PILEUP)
6083
6084
// ---------------------------
6085
// Base Modification retrieval
6086
//
6087
// These operate by recording state in an opaque type, allocated and freed
6088
// via the functions below.
6089
//
6090
// Initially we call bam_parse_basemod to process the tags and record the
6091
// modifications in the state structure, and then functions such as
6092
// bam_next_basemod can iterate over this cached state.
6093
6094
/*
6095
 * Base modifications are stored in MM/Mm tags as <mod_list> defined as
6096
 *
6097
 * <mod_list>        ::= <mod_chain><mod_list> | ""
6098
 * <mod_chain>       ::= <canonical_base><strand><mod-list><delta-list>
6099
 *
6100
 * <canonical_base>  ::= "A" | "C" | "G" | "T" | "N".
6101
 *
6102
 * <strand>          ::= "+" | "-".
6103
 *
6104
 * <mod-list>        ::= <simple-mod-list> | <ChEBI-code>
6105
 * <simple-mod-list> ::= <simple-mod><simple-mod-list> | <simple-mod>
6106
 * <ChEBI-code>      ::= <integer>
6107
 * <simple-mod>      ::= <letter>
6108
 *
6109
 * <delta-list>      ::= "," <integer> <delta-list> | ";"
6110
 *
6111
 * We do not allocate additional memory other than the fixed size
6112
 * state, thus we track up to 256 pointers to different locations
6113
 * within the MM and ML tags.  Each pointer is for a distinct
6114
 * modification code (simple or ChEBI), meaning some may point to the
6115
 * same delta-list when multiple codes are combined together
6116
 * (e.g. "C+mh,1,5,18,3;").  This is the MM[] array.
6117
 *
6118
 * Each numeric in the delta-list is tracked in MMcount[], counted
6119
 * down until it hits zero in which case the next delta is fetched.
6120
 *
6121
 * ML array similarly holds the locations in the quality (ML) tag per
6122
 * type, but these are interleaved so C+mhfc,10,15 will have 4 types
6123
 * all pointing to the same delta position, but in ML we store
6124
 * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1).  This ML
6125
 * also has MLstride indicating how many positions along ML to jump
6126
 * each time we consume a base. (4 in our above example, but usually 1
6127
 * for the simple case).
6128
 *
6129
 * One complexity of the base modification system is that mods are
6130
 * always stored in the original DNA orientation.  This is so that
6131
 * tools that may reverse-complement a sequence (eg "samtools fastq -T
6132
 * MM,ML") can pass through these modification tags irrespective of
6133
 * whether they have any knowledge of their internal workings.
6134
 *
6135
 * Because we don't wish to allocate extra memory, we cannot simply
6136
 * reverse the MM and ML tags.  Sadly this means we have to manage the
6137
 * reverse complementing ourselves on-the-fly.
6138
 * For reversed reads we start at the right end of MM and no longer
6139
 * stop at the semicolon.  Instead we use MMend[] array to mark the
6140
 * termination point.
6141
 */
6142
0
// Capacity of the per-modification arrays in hts_base_mod_state.
#define MAX_BASE_MOD 256
6143
// Fixed-size parsing state for the MM/ML base modification tags of one
// record.  Filled in by bam_parse_basemod() and advanced by
// bam_mods_at_next_pos() / bam_next_basemod().  Each array is indexed by
// modification number, 0 .. nmods-1.
struct hts_base_mod_state {
6144
    int type[MAX_BASE_MOD];     // char or minus-CHEBI
6145
    int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15)
6146
    char strand[MAX_BASE_MOD];  // strand of modification; stored as 0 for '+', 1 for '-'
6147
    int MMcount[MAX_BASE_MOD];  // no. canonical bases left until next mod
6148
    char *MM[MAX_BASE_MOD];     // next pos delta (string)
6149
    char *MMend[MAX_BASE_MOD];  // end of pos-delta string (reverse-strand reads only, else NULL)
6150
    uint8_t *ML[MAX_BASE_MOD];  // next qual, or NULL if the record has no ML tag
6151
    int MLstride[MAX_BASE_MOD]; // bytes between quals for this type
6152
    int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified?
6153
    int seq_pos;                // current position along sequence
6154
    int nmods;                  // number of array entries in use
6155
};
6156
6157
0
hts_base_mod_state *hts_base_mod_state_alloc(void) {
6158
0
    return calloc(1, sizeof(hts_base_mod_state));
6159
0
}
6160
6161
0
/*
 * Releases a state obtained from hts_base_mod_state_alloc().
 * Passing NULL is harmless, as with free().
 */
void hts_base_mod_state_free(hts_base_mod_state *state) {
    free(state);
}
6164
6165
/*
6166
 * Count frequency of A, C, G, T and N canonical bases in the sequence
6167
 */
6168
0
static void seq_freq(const bam1_t *b, int freq[16]) {
6169
0
    int i;
6170
6171
0
    memset(freq, 0, 16*sizeof(*freq));
6172
0
    uint8_t *seq = bam_get_seq(b);
6173
0
    for (i = 0; i < b->core.l_qseq; i++)
6174
0
        freq[bam_seqi(seq, i)]++;
6175
0
    freq[15] = b->core.l_qseq; // all bases count as N for base mods
6176
0
}
6177
6178
//0123456789ABCDEF
6179
//=ACMGRSVTWYHKDBN  aka seq_nt16_str[]
6180
//=TGKCYSBAWRDMHVN  complement of seq_nt16_str
6181
//084C2A6E195D3B7F
6182
// seqi_rc[x] is the 4-bit base code of the complement of code x.
// Read-only lookup table, hence const.
static const int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 };
6183
6184
/*
6185
 * Parse the MM and ML tags to populate the base mod state.
6186
 * This structure will have been previously allocated via
6187
 * hts_base_mod_state_alloc, but it does not need to be repeatedly
6188
 * freed and allocated for each new bam record. (Although obviously
6189
 * it requires a new call to this function.)
6190
 *
6191
 */
6192
0
int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) {
6193
    // Read MM and ML tags
6194
0
    uint8_t *mm = bam_aux_get(b, "MM");
6195
0
    if (!mm) mm = bam_aux_get(b, "Mm");
6196
0
    if (!mm)
6197
0
        return 0;
6198
0
    if (mm[0] != 'Z') {
6199
0
        hts_log_error("MM tag is not of type Z");
6200
0
        return -1;
6201
0
    }
6202
6203
0
    uint8_t *ml = bam_aux_get(b, "ML");
6204
0
    if (!ml) ml = bam_aux_get(b, "Ml");
6205
0
    if (ml && (ml[0] != 'B' || ml[1] != 'C')) {
6206
0
        hts_log_error("ML tag is not of type B,C");
6207
0
        return -1;
6208
0
    }
6209
0
    uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL;
6210
0
    if (ml) ml += 6;
6211
6212
0
    state->seq_pos = 0;
6213
6214
    // Aggregate freqs of ACGTN if reversed, to get final-delta (later)
6215
0
    int freq[16];
6216
0
    if (b->core.flag & BAM_FREVERSE)
6217
0
        seq_freq(b, freq);
6218
6219
0
    char *cp = (char *)mm+1;
6220
0
    int mod_num = 0;
6221
0
    int implicit = 1;
6222
0
    while (*cp) {
6223
0
        for (; *cp; cp++) {
6224
            // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
6225
0
            unsigned char btype = *cp++;
6226
6227
0
            if (btype != 'A' && btype != 'C' &&
6228
0
                btype != 'G' && btype != 'T' &&
6229
0
                btype != 'U' && btype != 'N')
6230
0
                return -1;
6231
0
            if (btype == 'U') btype = 'T';
6232
6233
0
            btype = seq_nt16_table[btype];
6234
6235
            // Strand
6236
0
            if (*cp != '+' && *cp != '-')
6237
0
                return -1; // malformed
6238
0
            char strand = *cp++;
6239
6240
            // List of modification types
6241
0
            char *ms = cp, *me; // mod code start and end
6242
0
            char *cp_end = NULL;
6243
0
            int chebi = 0;
6244
0
            if (isdigit_c(*cp)) {
6245
0
                chebi = strtol(cp, &cp_end, 10);
6246
0
                cp = cp_end;
6247
0
                ms = cp-1;
6248
0
            } else {
6249
0
                while (*cp && isalpha_c(*cp))
6250
0
                    cp++;
6251
0
                if (*cp == '\0')
6252
0
                    return -1;
6253
0
            }
6254
6255
0
            me = cp;
6256
6257
            // Optional explicit vs implicit marker
6258
0
            if (*cp == '.') {
6259
                // default is implicit = 1;
6260
0
                cp++;
6261
0
            } else if (*cp == '?') {
6262
0
                implicit = 0;
6263
0
                cp++;
6264
0
            } else if (*cp != ',' && *cp != ';') {
6265
                // parse error
6266
0
                return -1;
6267
0
            }
6268
6269
0
            long delta;
6270
0
            int n = 0; // nth symbol in a multi-mod string
6271
0
            int stride = me-ms;
6272
0
            int ndelta = 0;
6273
6274
0
            if (b->core.flag & BAM_FREVERSE) {
6275
                // We process the sequence in left to right order,
6276
                // but delta is successive count of bases to skip
6277
                // counting right to left.  This also means the number
6278
                // of bases to skip at left edge is unrecorded (as it's
6279
                // the remainder).
6280
                //
6281
                // To output mods in left to right, we step through the
6282
                // MM list in reverse and need to identify the left-end
6283
                // "remainder" delta.
6284
0
                int total_seq = 0;
6285
0
                for (;;) {
6286
0
                    cp += (*cp == ',');
6287
0
                    if (*cp == 0 || *cp == ';')
6288
0
                        break;
6289
6290
0
                    delta = strtol(cp, &cp_end, 10);
6291
0
                    if (cp_end == cp) {
6292
0
                        hts_log_error("Hit end of MM tag. Missing semicolon?");
6293
0
                        return -1;
6294
0
                    }
6295
6296
0
                    cp = cp_end;
6297
0
                    total_seq += delta+1;
6298
0
                    ndelta++;
6299
0
                }
6300
0
                delta = freq[seqi_rc[btype]] - total_seq; // remainder
6301
0
            } else {
6302
0
                delta = *cp == ','
6303
0
                    ? strtol(cp+1, &cp_end, 10)
6304
0
                    : 0;
6305
0
                if (!cp_end) {
6306
                    // empty list
6307
0
                    delta = INT_MAX;
6308
0
                    cp_end = cp+1;
6309
0
                }
6310
0
            }
6311
            // Now delta is first in list or computed remainder,
6312
            // and cp_end is either start or end of the MM list.
6313
0
            while (ms < me) {
6314
0
                state->type     [mod_num] = chebi ? -chebi : *ms;
6315
0
                state->strand   [mod_num] = (strand == '-');
6316
0
                state->canonical[mod_num] = btype;
6317
0
                state->MLstride [mod_num] = stride;
6318
0
                state->implicit [mod_num] = implicit;
6319
6320
0
                if (delta < 0) {
6321
0
                    hts_log_error("MM tag refers to bases beyond sequence "
6322
0
                                  "length");
6323
0
                    return -1;
6324
0
                }
6325
0
                state->MMcount  [mod_num] = delta;
6326
0
                if (b->core.flag & BAM_FREVERSE) {
6327
0
                    state->MM   [mod_num] = cp+1;
6328
0
                    state->MMend[mod_num] = cp_end;
6329
0
                    state->ML   [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL;
6330
0
                } else {
6331
0
                    state->MM   [mod_num] = cp_end;
6332
0
                    state->MMend[mod_num] = NULL;
6333
0
                    state->ML   [mod_num] = ml ? ml+n : NULL;
6334
0
                }
6335
6336
0
                if (++mod_num >= MAX_BASE_MOD) {
6337
0
                    hts_log_error("Too many base modification types");
6338
0
                    return -1;
6339
0
                }
6340
0
                ms++; n++;
6341
0
            }
6342
6343
            // Skip modification deltas
6344
0
            if (ml) {
6345
0
                if (b->core.flag & BAM_FREVERSE) {
6346
0
                    ml += ndelta*stride;
6347
0
                } else {
6348
0
                    while (*cp && *cp != ';') {
6349
0
                        if (*cp == ',')
6350
0
                            ml+=stride;
6351
0
                        cp++;
6352
0
                    }
6353
0
                }
6354
0
                if (ml > ml_end) {
6355
0
                    hts_log_error("Insufficient number of entries in ML tag");
6356
0
                    return -1;
6357
0
                }
6358
0
            } else {
6359
                // cp_end already known if FREVERSE
6360
0
                if (cp_end && (b->core.flag & BAM_FREVERSE))
6361
0
                    cp = cp_end;
6362
0
                else
6363
0
                    while (*cp && *cp != ';')
6364
0
                        cp++;
6365
0
            }
6366
0
            if (!*cp) {
6367
0
                hts_log_error("Hit end of MM tag. Missing semicolon?");
6368
0
                return -1;
6369
0
            }
6370
0
        }
6371
0
    }
6372
6373
0
    state->nmods = mod_num;
6374
6375
0
    return 0;
6376
0
}
6377
6378
/*
6379
 * Fills out mods[] with the base modifications found.
6380
 * Returns the number found (0 if none), which may be more than
6381
 * the size of n_mods if more were found than reported.
6382
 * Returns <= -1 on error.
6383
 *
6384
 * This always marches left to right along sequence, irrespective of
6385
 * reverse flag or modification strand.
6386
 */
6387
int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
6388
0
                         hts_base_mod *mods, int n_mods) {
6389
0
    if (b->core.flag & BAM_FREVERSE) {
6390
0
        if (state->seq_pos < 0)
6391
0
            return -1;
6392
0
    } else {
6393
0
        if (state->seq_pos >= b->core.l_qseq)
6394
0
            return -1;
6395
0
    }
6396
6397
0
    int i, j, n = 0;
6398
0
    unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos);
6399
0
    state->seq_pos++;
6400
0
    if (b->core.flag & BAM_FREVERSE)
6401
0
        base = seqi_rc[base];
6402
6403
0
    for (i = 0; i < state->nmods; i++) {
6404
0
        if (state->canonical[i] != base && state->canonical[i] != 15/*N*/)
6405
0
            continue;
6406
6407
0
        if (state->MMcount[i]-- > 0)
6408
0
            continue;
6409
6410
0
        char *MMptr = state->MM[i];
6411
0
        if (n < n_mods) {
6412
0
            mods[n].modified_base = state->type[i];
6413
0
            mods[n].canonical_base = seq_nt16_str[state->canonical[i]];
6414
0
            mods[n].strand = state->strand[i];
6415
0
            mods[n].qual = state->ML[i] ? *state->ML[i] : -1;
6416
0
        }
6417
0
        n++;
6418
0
        if (state->ML[i])
6419
0
            state->ML[i] += (b->core.flag & BAM_FREVERSE)
6420
0
                ? -state->MLstride[i]
6421
0
                : +state->MLstride[i];
6422
6423
0
        if (b->core.flag & BAM_FREVERSE) {
6424
            // process MM list backwards
6425
0
            char *cp;
6426
0
            for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--)
6427
0
                if (*cp == ',')
6428
0
                    break;
6429
0
            state->MMend[i] = cp;
6430
0
            if (cp != state->MM[i])
6431
0
                state->MMcount[i] = strtol(cp+1, NULL, 10);
6432
0
            else
6433
0
                state->MMcount[i] = INT_MAX;
6434
0
        } else {
6435
0
            if (*state->MM[i] == ',')
6436
0
                state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10);
6437
0
            else
6438
0
                state->MMcount[i] = INT_MAX;
6439
0
        }
6440
6441
        // Multiple mods at the same coords.
6442
0
        for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) {
6443
0
            if (n < n_mods) {
6444
0
                mods[n].modified_base = state->type[j];
6445
0
                mods[n].canonical_base = seq_nt16_str[state->canonical[j]];
6446
0
                mods[n].strand = state->strand[j];
6447
0
                mods[n].qual = state->ML[j] ? *state->ML[j] : -1;
6448
0
            }
6449
0
            n++;
6450
0
            state->MMcount[j] = state->MMcount[i];
6451
0
            state->MM[j]      = state->MM[i];
6452
0
            if (state->ML[j])
6453
0
                state->ML[j] += (b->core.flag & BAM_FREVERSE)
6454
0
                    ? -state->MLstride[j]
6455
0
                    : +state->MLstride[j];
6456
0
        }
6457
0
        i = j-1;
6458
0
    }
6459
6460
0
    return n;
6461
0
}
6462
6463
/*
6464
 * Looks for the next location with a base modification.
6465
 */
6466
int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state,
6467
0
                     hts_base_mod *mods, int n_mods, int *pos) {
6468
0
    if (state->seq_pos >= b->core.l_qseq)
6469
0
        return 0;
6470
6471
    // Look through state->MMcount arrays to see when the next lowest is
6472
    // per base type;
6473
0
    int next[16], freq[16] = {0}, i;
6474
0
    memset(next, 0x7f, 16*sizeof(*next));
6475
0
    if (b->core.flag & BAM_FREVERSE) {
6476
0
        for (i = 0; i < state->nmods; i++) {
6477
0
            if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i])
6478
0
                next[seqi_rc[state->canonical[i]]] = state->MMcount[i];
6479
0
        }
6480
0
    } else {
6481
0
        for (i = 0; i < state->nmods; i++) {
6482
0
            if (next[state->canonical[i]] > state->MMcount[i])
6483
0
                next[state->canonical[i]] = state->MMcount[i];
6484
0
        }
6485
0
    }
6486
6487
    // Now step through the sequence counting off base types.
6488
0
    for (i = state->seq_pos; i < b->core.l_qseq; i++) {
6489
0
        unsigned char bc = bam_seqi(bam_get_seq(b), i);
6490
0
        if (next[bc] <= freq[bc] || next[15] <= freq[15])
6491
0
            break;
6492
0
        freq[bc]++;
6493
0
        if (bc != 15) // N
6494
0
            freq[15]++;
6495
0
    }
6496
0
    *pos = state->seq_pos = i;
6497
6498
0
    if (i >= b->core.l_qseq) {
6499
        // Check for more MM elements than bases present.
6500
0
        for (i = 0; i < state->nmods; i++) {
6501
0
            if (!(b->core.flag & BAM_FREVERSE) &&
6502
0
                state->MMcount[i] < 0x7f000000) {
6503
0
                hts_log_warning("MM tag refers to bases beyond sequence length");
6504
0
                return -1;
6505
0
            }
6506
0
        }
6507
0
        return 0;
6508
0
    }
6509
6510
0
    if (b->core.flag & BAM_FREVERSE) {
6511
0
        for (i = 0; i < state->nmods; i++)
6512
0
            state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]];
6513
0
    } else {
6514
0
        for (i = 0; i < state->nmods; i++)
6515
0
            state->MMcount[i] -= freq[state->canonical[i]];
6516
0
    }
6517
6518
0
    int r = bam_mods_at_next_pos(b, state, mods, n_mods);
6519
0
    return r > 0 ? r : 0;
6520
0
}
6521
6522
/*
6523
 * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos.
6524
 * This can only march forwards along the read, but can do so by more than
6525
 * one base-pair.
6526
 *
6527
 * This makes it useful for calling from pileup iterators where qpos may
6528
 * start part way through a read for the first occurrence of that record.
6529
 */
6530
int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state,
6531
0
                    hts_base_mod *mods, int n_mods) {
6532
    // FIXME: for now this is inefficient in implementation.
6533
0
    int r = 0;
6534
0
    while (state->seq_pos <= qpos)
6535
0
        if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0)
6536
0
            break;
6537
6538
0
    return r;
6539
0
}
6540
6541
/*
6542
 * Returns the list of base modification codes provided for this
6543
 * alignment record as an array of character codes (+ve) or ChEBI numbers
6544
 * (negative).
6545
 *
6546
 * Returns the array, with *ntype filled out with the size.
6547
 *         The array returned should not be freed.
6548
 *         It is a valid pointer until the state is freed using
6549
 *         hts_base_mod_free().
6550
 */
6551
0
int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) {
6552
0
    *ntype = state->nmods;
6553
0
    return state->type;
6554
0
}
6555
6556
/*
6557
 * Returns data about a specific modification type for the alignment record.
6558
 * Code is either positive (eg 'm') or negative for ChEBI numbers.
6559
 *
6560
 * Return 0 on success or -1 if not found.  The strand, implicit and canonical
6561
 * fields are filled out if passed in as non-NULL pointers.
6562
 */
6563
int bam_mods_query_type(hts_base_mod_state *state, int code,
6564
0
                        int *strand, int *implicit, char *canonical) {
6565
    // Find code entry
6566
0
    int i;
6567
0
    for (i = 0; i < state->nmods; i++) {
6568
0
        if (state->type[i] == code)
6569
0
            break;
6570
0
    }
6571
0
    if (i == state->nmods)
6572
0
        return -1;
6573
6574
    // Return data
6575
0
    if (strand)    *strand    = state->strand[i];
6576
0
    if (implicit)  *implicit  = state->implicit[i];
6577
0
    if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]];
6578
6579
0
    return 0;
6580
0
}